raid5.c (9134d02bc0af4a8747d448d1f811ec5f8eb96df6) vs. raid5.c (1f98a13f623e0ef666690a18c1250335fc6d7ef1)
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible

--- 33 unchanged lines hidden (view full) ---

42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible

--- 33 unchanged lines hidden (view full) ---

42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
50#include <linux/async.h>
51#include <linux/seq_file.h>
50#include <linux/seq_file.h>
52#include <linux/cpu.h>
53#include "md.h"
54#include "raid5.h"
55#include "bitmap.h"
56
57/*
58 * Stripe cache
59 */
60

--- 435 unchanged lines hidden (view full) ---

496static struct dma_async_tx_descriptor *
497async_copy_data(int frombio, struct bio *bio, struct page *page,
498 sector_t sector, struct dma_async_tx_descriptor *tx)
499{
500 struct bio_vec *bvl;
501 struct page *bio_page;
502 int i;
503 int page_offset;
51#include "md.h"
52#include "raid5.h"
53#include "bitmap.h"
54
55/*
56 * Stripe cache
57 */
58

--- 435 unchanged lines hidden (view full) ---

494static struct dma_async_tx_descriptor *
495async_copy_data(int frombio, struct bio *bio, struct page *page,
496 sector_t sector, struct dma_async_tx_descriptor *tx)
497{
498 struct bio_vec *bvl;
499 struct page *bio_page;
500 int i;
501 int page_offset;
504 struct async_submit_ctl submit;
505 enum async_tx_flags flags = 0;
506
507 if (bio->bi_sector >= sector)
508 page_offset = (signed)(bio->bi_sector - sector) * 512;
509 else
510 page_offset = (signed)(sector - bio->bi_sector) * -512;
502
503 if (bio->bi_sector >= sector)
504 page_offset = (signed)(bio->bi_sector - sector) * 512;
505 else
506 page_offset = (signed)(sector - bio->bi_sector) * -512;
511
512 if (frombio)
513 flags |= ASYNC_TX_FENCE;
514 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
515
516 bio_for_each_segment(bvl, bio, i) {
517 int len = bio_iovec_idx(bio, i)->bv_len;
518 int clen;
519 int b_offset = 0;
520
521 if (page_offset < 0) {
522 b_offset = -page_offset;
523 page_offset += b_offset;

--- 5 unchanged lines hidden (view full) ---

529 else
530 clen = len;
531
532 if (clen > 0) {
533 b_offset += bio_iovec_idx(bio, i)->bv_offset;
534 bio_page = bio_iovec_idx(bio, i)->bv_page;
535 if (frombio)
536 tx = async_memcpy(page, bio_page, page_offset,
507 bio_for_each_segment(bvl, bio, i) {
508 int len = bio_iovec_idx(bio, i)->bv_len;
509 int clen;
510 int b_offset = 0;
511
512 if (page_offset < 0) {
513 b_offset = -page_offset;
514 page_offset += b_offset;

--- 5 unchanged lines hidden (view full) ---

520 else
521 clen = len;
522
523 if (clen > 0) {
524 b_offset += bio_iovec_idx(bio, i)->bv_offset;
525 bio_page = bio_iovec_idx(bio, i)->bv_page;
526 if (frombio)
527 tx = async_memcpy(page, bio_page, page_offset,
537 b_offset, clen, &submit);
528 b_offset, clen,
529 ASYNC_TX_DEP_ACK,
530 tx, NULL, NULL);
538 else
539 tx = async_memcpy(bio_page, page, b_offset,
531 else
532 tx = async_memcpy(bio_page, page, b_offset,
540 page_offset, clen, &submit);
533 page_offset, clen,
534 ASYNC_TX_DEP_ACK,
535 tx, NULL, NULL);
541 }
536 }
542 /* chain the operations */
543 submit.depend_tx = tx;
544
545 if (clen < len) /* hit end of page */
546 break;
547 page_offset += len;
548 }
549
550 return tx;
551}
552

--- 42 unchanged lines hidden (view full) ---

595 set_bit(STRIPE_HANDLE, &sh->state);
596 release_stripe(sh);
597}
598
599static void ops_run_biofill(struct stripe_head *sh)
600{
601 struct dma_async_tx_descriptor *tx = NULL;
602 raid5_conf_t *conf = sh->raid_conf;
537 if (clen < len) /* hit end of page */
538 break;
539 page_offset += len;
540 }
541
542 return tx;
543}
544
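Both versions of async_copy_data() above locate the bio data relative to the stripe cache page by turning the sector delta into a signed byte offset (512 bytes per sector); a negative offset means the leading bytes of the bio segment fall before the page and are skipped via b_offset. A minimal, self-contained sketch of just that arithmetic — the helper name page_offset_of, the sector_t stand-in typedef and the sample sector numbers are made up for illustration:

#include <stdio.h>

typedef unsigned long long sector_t;	/* stand-in for the kernel type */

/* signed byte offset of the bio start relative to the stripe page start */
static long long page_offset_of(sector_t bi_sector, sector_t dev_sector)
{
	if (bi_sector >= dev_sector)
		return (long long)(bi_sector - dev_sector) * 512;
	else
		return (long long)(dev_sector - bi_sector) * -512;
}

int main(void)
{
	/* bio begins 3 sectors past the page: copying starts 1536 bytes in */
	printf("%lld\n", page_offset_of(103, 100));
	/* bio begins 2 sectors before the page: skip 1024 bytes of bio first */
	printf("%lld\n", page_offset_of(98, 100));
	return 0;
}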

--- 42 unchanged lines hidden (view full) ---

587 set_bit(STRIPE_HANDLE, &sh->state);
588 release_stripe(sh);
589}
590
591static void ops_run_biofill(struct stripe_head *sh)
592{
593 struct dma_async_tx_descriptor *tx = NULL;
594 raid5_conf_t *conf = sh->raid_conf;
603 struct async_submit_ctl submit;
604 int i;
605
606 pr_debug("%s: stripe %llu\n", __func__,
607 (unsigned long long)sh->sector);
608
609 for (i = sh->disks; i--; ) {
610 struct r5dev *dev = &sh->dev[i];
611 if (test_bit(R5_Wantfill, &dev->flags)) {

--- 7 unchanged lines hidden (view full) ---

619 tx = async_copy_data(0, rbi, dev->page,
620 dev->sector, tx);
621 rbi = r5_next_bio(rbi, dev->sector);
622 }
623 }
624 }
625
626 atomic_inc(&sh->count);
595 int i;
596
597 pr_debug("%s: stripe %llu\n", __func__,
598 (unsigned long long)sh->sector);
599
600 for (i = sh->disks; i--; ) {
601 struct r5dev *dev = &sh->dev[i];
602 if (test_bit(R5_Wantfill, &dev->flags)) {

--- 7 unchanged lines hidden (view full) ---

610 tx = async_copy_data(0, rbi, dev->page,
611 dev->sector, tx);
612 rbi = r5_next_bio(rbi, dev->sector);
613 }
614 }
615 }
616
617 atomic_inc(&sh->count);
627 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
628 async_trigger_callback(&submit);
618 async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
619 ops_complete_biofill, sh);
629}
630
620}
621
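The ops_run_biofill() hunk above shows the mechanical change this diff applies throughout the file: the older async_tx calls take flags, the dependency descriptor, a completion callback and its argument as separate parameters, while the newer ones bundle the same information into a struct async_submit_ctl filled by init_async_submit(). A kernel-context fragment (not compilable on its own) restating the two conventions, with tx, sh and ops_complete_biofill exactly as in the code above:

/* old convention: everything passed per call */
async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
		       ops_complete_biofill, sh);

/* new convention: one submit descriptor carries the flags, the dependency,
 * the callback, its argument and (optionally) a scribble region; the
 * ASYNC_TX_DEP_ACK flag from the old call has no counterpart here
 */
struct async_submit_ctl submit;

init_async_submit(&submit, ASYNC_TX_ACK, tx,
		  ops_complete_biofill, sh, NULL);
async_trigger_callback(&submit);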
631static void mark_target_uptodate(struct stripe_head *sh, int target)
622static void ops_complete_compute5(void *stripe_head_ref)
632{
623{
633 struct r5dev *tgt;
634
635 if (target < 0)
636 return;
637
638 tgt = &sh->dev[target];
639 set_bit(R5_UPTODATE, &tgt->flags);
640 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
641 clear_bit(R5_Wantcompute, &tgt->flags);
642}
643
644static void ops_complete_compute(void *stripe_head_ref)
645{
646 struct stripe_head *sh = stripe_head_ref;
624 struct stripe_head *sh = stripe_head_ref;
625 int target = sh->ops.target;
626 struct r5dev *tgt = &sh->dev[target];
647
648 pr_debug("%s: stripe %llu\n", __func__,
649 (unsigned long long)sh->sector);
650
627
628 pr_debug("%s: stripe %llu\n", __func__,
629 (unsigned long long)sh->sector);
630
651 /* mark the computed target(s) as uptodate */
652 mark_target_uptodate(sh, sh->ops.target);
653 mark_target_uptodate(sh, sh->ops.target2);
654
631 set_bit(R5_UPTODATE, &tgt->flags);
632 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
633 clear_bit(R5_Wantcompute, &tgt->flags);
655 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
656 if (sh->check_state == check_state_compute_run)
657 sh->check_state = check_state_compute_result;
658 set_bit(STRIPE_HANDLE, &sh->state);
659 release_stripe(sh);
660}
661
634 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
635 if (sh->check_state == check_state_compute_run)
636 sh->check_state = check_state_compute_result;
637 set_bit(STRIPE_HANDLE, &sh->state);
638 release_stripe(sh);
639}
640
662/* return a pointer to the address conversion region of the scribble buffer */
663static addr_conv_t *to_addr_conv(struct stripe_head *sh,
664 struct raid5_percpu *percpu)
641static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
665{
642{
666 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
667}
668
669static struct dma_async_tx_descriptor *
670ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
671{
643 /* kernel stack size limits the total number of disks */
672 int disks = sh->disks;
644 int disks = sh->disks;
673 struct page **xor_srcs = percpu->scribble;
645 struct page *xor_srcs[disks];
674 int target = sh->ops.target;
675 struct r5dev *tgt = &sh->dev[target];
676 struct page *xor_dest = tgt->page;
677 int count = 0;
678 struct dma_async_tx_descriptor *tx;
646 int target = sh->ops.target;
647 struct r5dev *tgt = &sh->dev[target];
648 struct page *xor_dest = tgt->page;
649 int count = 0;
650 struct dma_async_tx_descriptor *tx;
679 struct async_submit_ctl submit;
680 int i;
681
682 pr_debug("%s: stripe %llu block: %d\n",
683 __func__, (unsigned long long)sh->sector, target);
684 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
685
686 for (i = disks; i--; )
687 if (i != target)
688 xor_srcs[count++] = sh->dev[i].page;
689
690 atomic_inc(&sh->count);
691
651 int i;
652
653 pr_debug("%s: stripe %llu block: %d\n",
654 __func__, (unsigned long long)sh->sector, target);
655 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
656
657 for (i = disks; i--; )
658 if (i != target)
659 xor_srcs[count++] = sh->dev[i].page;
660
661 atomic_inc(&sh->count);
662
692 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
693 ops_complete_compute, sh, to_addr_conv(sh, percpu));
694 if (unlikely(count == 1))
663 if (unlikely(count == 1))
695 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
664 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
665 0, NULL, ops_complete_compute5, sh);
696 else
666 else
697 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
667 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
668 ASYNC_TX_XOR_ZERO_DST, NULL,
669 ops_complete_compute5, sh);
698
699 return tx;
700}
701
670
671 return tx;
672}
673
702/* set_syndrome_sources - populate source buffers for gen_syndrome
703 * @srcs - (struct page *) array of size sh->disks
704 * @sh - stripe_head to parse
705 *
706 * Populates srcs in proper layout order for the stripe and returns the
707 * 'count' of sources to be used in a call to async_gen_syndrome. The P
708 * destination buffer is recorded in srcs[count] and the Q destination
709 * is recorded in srcs[count+1].
710 */
711static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
712{
713 int disks = sh->disks;
714 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
715 int d0_idx = raid6_d0(sh);
716 int count;
717 int i;
718
719 for (i = 0; i < disks; i++)
720 srcs[i] = (void *)raid6_empty_zero_page;
721
722 count = 0;
723 i = d0_idx;
724 do {
725 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
726
727 srcs[slot] = sh->dev[i].page;
728 i = raid6_next_disk(i, disks);
729 } while (i != d0_idx);
730 BUG_ON(count != syndrome_disks);
731
732 return count;
733}
734
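As a concrete illustration of the layout documented above, consider a hypothetical 6-device, non-DDF RAID-6 stripe, so syndrome_disks = 4 and the loop leaves count = 4:

/*
 *   srcs[0..3]  the four data pages, in d0..d3 syndrome order
 *   srcs[4]     (== srcs[count])    the P page, from pd_idx
 *   srcs[5]     (== srcs[count+1])  the Q page, from qd_idx
 *
 * which is exactly the array that async_gen_syndrome(blocks, 0, count+2,
 * STRIPE_SIZE, &submit) consumes in ops_run_compute6_1() and
 * ops_run_reconstruct6() below.
 */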
735static struct dma_async_tx_descriptor *
736ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
737{
738 int disks = sh->disks;
739 struct page **blocks = percpu->scribble;
740 int target;
741 int qd_idx = sh->qd_idx;
742 struct dma_async_tx_descriptor *tx;
743 struct async_submit_ctl submit;
744 struct r5dev *tgt;
745 struct page *dest;
746 int i;
747 int count;
748
749 if (sh->ops.target < 0)
750 target = sh->ops.target2;
751 else if (sh->ops.target2 < 0)
752 target = sh->ops.target;
753 else
754 /* we should only have one valid target */
755 BUG();
756 BUG_ON(target < 0);
757 pr_debug("%s: stripe %llu block: %d\n",
758 __func__, (unsigned long long)sh->sector, target);
759
760 tgt = &sh->dev[target];
761 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
762 dest = tgt->page;
763
764 atomic_inc(&sh->count);
765
766 if (target == qd_idx) {
767 count = set_syndrome_sources(blocks, sh);
768 blocks[count] = NULL; /* regenerating p is not necessary */
769 BUG_ON(blocks[count+1] != dest); /* q should already be set */
770 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
771 ops_complete_compute, sh,
772 to_addr_conv(sh, percpu));
773 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
774 } else {
775 /* Compute any data- or p-drive using XOR */
776 count = 0;
777 for (i = disks; i-- ; ) {
778 if (i == target || i == qd_idx)
779 continue;
780 blocks[count++] = sh->dev[i].page;
781 }
782
783 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
784 NULL, ops_complete_compute, sh,
785 to_addr_conv(sh, percpu));
786 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
787 }
788
789 return tx;
790}
791
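ops_run_compute6_1() above handles the single-failure case in two ways, summarised here for reference:

/*
 *   target == qd_idx:  the missing block is Q itself, so it is rebuilt
 *       with async_gen_syndrome() over the full source list; blocks[count]
 *       is set to NULL because regenerating P is not needed.
 *   otherwise:         the missing block is a data block or P, either of
 *       which is just the XOR of the remaining non-Q blocks, so a plain
 *       async_xor() into the target page suffices.
 */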
792static struct dma_async_tx_descriptor *
793ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
794{
795 int i, count, disks = sh->disks;
796 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
797 int d0_idx = raid6_d0(sh);
798 int faila = -1, failb = -1;
799 int target = sh->ops.target;
800 int target2 = sh->ops.target2;
801 struct r5dev *tgt = &sh->dev[target];
802 struct r5dev *tgt2 = &sh->dev[target2];
803 struct dma_async_tx_descriptor *tx;
804 struct page **blocks = percpu->scribble;
805 struct async_submit_ctl submit;
806
807 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
808 __func__, (unsigned long long)sh->sector, target, target2);
809 BUG_ON(target < 0 || target2 < 0);
810 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
811 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
812
813 /* we need to open-code set_syndrome_sources to handle the
814 * slot number conversion for 'faila' and 'failb'
815 */
816 for (i = 0; i < disks ; i++)
817 blocks[i] = (void *)raid6_empty_zero_page;
818 count = 0;
819 i = d0_idx;
820 do {
821 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
822
823 blocks[slot] = sh->dev[i].page;
824
825 if (i == target)
826 faila = slot;
827 if (i == target2)
828 failb = slot;
829 i = raid6_next_disk(i, disks);
830 } while (i != d0_idx);
831 BUG_ON(count != syndrome_disks);
832
833 BUG_ON(faila == failb);
834 if (failb < faila)
835 swap(faila, failb);
836 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
837 __func__, (unsigned long long)sh->sector, faila, failb);
838
839 atomic_inc(&sh->count);
840
841 if (failb == syndrome_disks+1) {
842 /* Q disk is one of the missing disks */
843 if (faila == syndrome_disks) {
844 /* Missing P+Q, just recompute */
845 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
846 ops_complete_compute, sh,
847 to_addr_conv(sh, percpu));
848 return async_gen_syndrome(blocks, 0, count+2,
849 STRIPE_SIZE, &submit);
850 } else {
851 struct page *dest;
852 int data_target;
853 int qd_idx = sh->qd_idx;
854
855 /* Missing D+Q: recompute D from P, then recompute Q */
856 if (target == qd_idx)
857 data_target = target2;
858 else
859 data_target = target;
860
861 count = 0;
862 for (i = disks; i-- ; ) {
863 if (i == data_target || i == qd_idx)
864 continue;
865 blocks[count++] = sh->dev[i].page;
866 }
867 dest = sh->dev[data_target].page;
868 init_async_submit(&submit,
869 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
870 NULL, NULL, NULL,
871 to_addr_conv(sh, percpu));
872 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
873 &submit);
874
875 count = set_syndrome_sources(blocks, sh);
876 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
877 ops_complete_compute, sh,
878 to_addr_conv(sh, percpu));
879 return async_gen_syndrome(blocks, 0, count+2,
880 STRIPE_SIZE, &submit);
881 }
882 }
883
884 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute,
885 sh, to_addr_conv(sh, percpu));
886 if (failb == syndrome_disks) {
887 /* We're missing D+P. */
888 return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
889 faila, blocks, &submit);
890 } else {
891 /* We're missing D+D. */
892 return async_raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE,
893 faila, failb, blocks, &submit);
894 }
895}
896
897
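ops_run_compute6_2() dispatches on where the two failed devices land in syndrome slot order (faila < failb after the swap); summarising the branches just shown:

/*
 *   faila == syndrome_disks,  failb == syndrome_disks+1:  P and Q lost;
 *       both are regenerated from the data with async_gen_syndrome().
 *   faila  < syndrome_disks,  failb == syndrome_disks+1:  D and Q lost;
 *       D is rebuilt by XOR of the surviving data plus P, then Q is
 *       regenerated with a chained async_gen_syndrome().
 *   failb == syndrome_disks:                              D and P lost;
 *       handled by async_raid6_datap_recov().
 *   both  < syndrome_disks:                               D and D lost;
 *       handled by async_raid6_2data_recov().
 */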
898static void ops_complete_prexor(void *stripe_head_ref)
899{
900 struct stripe_head *sh = stripe_head_ref;
901
902 pr_debug("%s: stripe %llu\n", __func__,
903 (unsigned long long)sh->sector);
904}
905
906static struct dma_async_tx_descriptor *
674static void ops_complete_prexor(void *stripe_head_ref)
675{
676 struct stripe_head *sh = stripe_head_ref;
677
678 pr_debug("%s: stripe %llu\n", __func__,
679 (unsigned long long)sh->sector);
680}
681
682static struct dma_async_tx_descriptor *
907ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
908 struct dma_async_tx_descriptor *tx)
683ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
909{
684{
685 /* kernel stack size limits the total number of disks */
910 int disks = sh->disks;
686 int disks = sh->disks;
911 struct page **xor_srcs = percpu->scribble;
687 struct page *xor_srcs[disks];
912 int count = 0, pd_idx = sh->pd_idx, i;
688 int count = 0, pd_idx = sh->pd_idx, i;
913 struct async_submit_ctl submit;
914
915 /* existing parity data subtracted */
916 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
917
918 pr_debug("%s: stripe %llu\n", __func__,
919 (unsigned long long)sh->sector);
920
921 for (i = disks; i--; ) {
922 struct r5dev *dev = &sh->dev[i];
923 /* Only process blocks that are known to be uptodate */
924 if (test_bit(R5_Wantdrain, &dev->flags))
925 xor_srcs[count++] = dev->page;
926 }
927
689
690 /* existing parity data subtracted */
691 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
692
693 pr_debug("%s: stripe %llu\n", __func__,
694 (unsigned long long)sh->sector);
695
696 for (i = disks; i--; ) {
697 struct r5dev *dev = &sh->dev[i];
698 /* Only process blocks that are known to be uptodate */
699 if (test_bit(R5_Wantdrain, &dev->flags))
700 xor_srcs[count++] = dev->page;
701 }
702
928 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
929 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
930 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
704 ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx,
705 ops_complete_prexor, sh);
931
932 return tx;
933}
934
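ops_run_prexor() is the first half of a read-modify-write parity update: with ASYNC_TX_XOR_DROP_DST the existing parity page stays as an implicit source, so the pass above "subtracts" (XORs out) the old contents of every block about to be overwritten; ops_run_biodrain() then copies the new data in, and the reconstruct/postxor pass adds it back into parity. A toy, self-contained model of that sequence on one-byte "blocks" (all values made up):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* three data blocks and their XOR parity, one byte each */
	uint8_t d[3] = { 0x11, 0x22, 0x33 };
	uint8_t p = d[0] ^ d[1] ^ d[2];

	/* rewrite d[1] the way prexor + biodrain + postxor do */
	p ^= d[1];		/* prexor: remove the old data from parity  */
	d[1] = 0x7f;		/* biodrain: new data lands in the page     */
	p ^= d[1];		/* postxor/reconstruct: add the new data in */

	assert(p == (d[0] ^ d[1] ^ d[2]));	/* parity is consistent again */
	return 0;
}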
935static struct dma_async_tx_descriptor *
936ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
937{
938 int disks = sh->disks;

--- 23 unchanged lines hidden (view full) ---

962 wbi = r5_next_bio(wbi, dev->sector);
963 }
964 }
965 }
966
967 return tx;
968}
969
706
707 return tx;
708}
709
710static struct dma_async_tx_descriptor *
711ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
712{
713 int disks = sh->disks;

--- 23 unchanged lines hidden (view full) ---

737 wbi = r5_next_bio(wbi, dev->sector);
738 }
739 }
740 }
741
742 return tx;
743}
744
970static void ops_complete_reconstruct(void *stripe_head_ref)
745static void ops_complete_postxor(void *stripe_head_ref)
971{
972 struct stripe_head *sh = stripe_head_ref;
746{
747 struct stripe_head *sh = stripe_head_ref;
973 int disks = sh->disks;
974 int pd_idx = sh->pd_idx;
975 int qd_idx = sh->qd_idx;
976 int i;
748 int disks = sh->disks, i, pd_idx = sh->pd_idx;
977
978 pr_debug("%s: stripe %llu\n", __func__,
979 (unsigned long long)sh->sector);
980
981 for (i = disks; i--; ) {
982 struct r5dev *dev = &sh->dev[i];
749
750 pr_debug("%s: stripe %llu\n", __func__,
751 (unsigned long long)sh->sector);
752
753 for (i = disks; i--; ) {
754 struct r5dev *dev = &sh->dev[i];
983
984 if (dev->written || i == pd_idx || i == qd_idx)
755 if (dev->written || i == pd_idx)
985 set_bit(R5_UPTODATE, &dev->flags);
986 }
987
988 if (sh->reconstruct_state == reconstruct_state_drain_run)
989 sh->reconstruct_state = reconstruct_state_drain_result;
990 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
991 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
992 else {
993 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
994 sh->reconstruct_state = reconstruct_state_result;
995 }
996
997 set_bit(STRIPE_HANDLE, &sh->state);
998 release_stripe(sh);
999}
1000
1001static void
756 set_bit(R5_UPTODATE, &dev->flags);
757 }
758
759 if (sh->reconstruct_state == reconstruct_state_drain_run)
760 sh->reconstruct_state = reconstruct_state_drain_result;
761 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
762 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
763 else {
764 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
765 sh->reconstruct_state = reconstruct_state_result;
766 }
767
768 set_bit(STRIPE_HANDLE, &sh->state);
769 release_stripe(sh);
770}
771
772static void
1002ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1003 struct dma_async_tx_descriptor *tx)
773ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1004{
774{
775 /* kernel stack size limits the total number of disks */
1005 int disks = sh->disks;
776 int disks = sh->disks;
1006 struct page **xor_srcs = percpu->scribble;
1007 struct async_submit_ctl submit;
777 struct page *xor_srcs[disks];
778
1008 int count = 0, pd_idx = sh->pd_idx, i;
1009 struct page *xor_dest;
1010 int prexor = 0;
1011 unsigned long flags;
1012
1013 pr_debug("%s: stripe %llu\n", __func__,
1014 (unsigned long long)sh->sector);
1015

--- 17 unchanged lines hidden (view full) ---

1033 }
1034 }
1035
1036 /* 1/ if we prexor'd then the dest is reused as a source
1037 * 2/ if we did not prexor then we are redoing the parity
1038 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1039 * for the synchronous xor case
1040 */
779 int count = 0, pd_idx = sh->pd_idx, i;
780 struct page *xor_dest;
781 int prexor = 0;
782 unsigned long flags;
783
784 pr_debug("%s: stripe %llu\n", __func__,
785 (unsigned long long)sh->sector);
786

--- 17 unchanged lines hidden (view full) ---

804 }
805 }
806
807 /* 1/ if we prexor'd then the dest is reused as a source
808 * 2/ if we did not prexor then we are redoing the parity
809 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
810 * for the synchronous xor case
811 */
1041 flags = ASYNC_TX_ACK |
812 flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK |
1042 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1043
1044 atomic_inc(&sh->count);
1045
813 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
814
815 atomic_inc(&sh->count);
816
1046 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1047 to_addr_conv(sh, percpu));
1048 if (unlikely(count == 1))
1049 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1050 else
1051 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
817 if (unlikely(count == 1)) {
818 flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
819 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
820 flags, tx, ops_complete_postxor, sh);
821 } else
822 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
823 flags, tx, ops_complete_postxor, sh);
1052}
1053
824}
825
1054static void
1055ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1056 struct dma_async_tx_descriptor *tx)
1057{
1058 struct async_submit_ctl submit;
1059 struct page **blocks = percpu->scribble;
1060 int count;
1061
1062 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1063
1064 count = set_syndrome_sources(blocks, sh);
1065
1066 atomic_inc(&sh->count);
1067
1068 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1069 sh, to_addr_conv(sh, percpu));
1070 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1071}
1072
1073static void ops_complete_check(void *stripe_head_ref)
1074{
1075 struct stripe_head *sh = stripe_head_ref;
1076
1077 pr_debug("%s: stripe %llu\n", __func__,
1078 (unsigned long long)sh->sector);
1079
1080 sh->check_state = check_state_check_result;
1081 set_bit(STRIPE_HANDLE, &sh->state);
1082 release_stripe(sh);
1083}
1084
826static void ops_complete_check(void *stripe_head_ref)
827{
828 struct stripe_head *sh = stripe_head_ref;
829
830 pr_debug("%s: stripe %llu\n", __func__,
831 (unsigned long long)sh->sector);
832
833 sh->check_state = check_state_check_result;
834 set_bit(STRIPE_HANDLE, &sh->state);
835 release_stripe(sh);
836}
837
1085static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
838static void ops_run_check(struct stripe_head *sh)
1086{
839{
840 /* kernel stack size limits the total number of disks */
1087 int disks = sh->disks;
841 int disks = sh->disks;
1088 int pd_idx = sh->pd_idx;
1089 int qd_idx = sh->qd_idx;
1090 struct page *xor_dest;
1091 struct page **xor_srcs = percpu->scribble;
842 struct page *xor_srcs[disks];
1092 struct dma_async_tx_descriptor *tx;
843 struct dma_async_tx_descriptor *tx;
1093 struct async_submit_ctl submit;
1094 int count;
1095 int i;
1096
844
845 int count = 0, pd_idx = sh->pd_idx, i;
846 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
847
1097 pr_debug("%s: stripe %llu\n", __func__,
1098 (unsigned long long)sh->sector);
1099
848 pr_debug("%s: stripe %llu\n", __func__,
849 (unsigned long long)sh->sector);
850
1100 count = 0;
1101 xor_dest = sh->dev[pd_idx].page;
1102 xor_srcs[count++] = xor_dest;
1103 for (i = disks; i--; ) {
851 for (i = disks; i--; ) {
1104 if (i == pd_idx || i == qd_idx)
1105 continue;
1106 xor_srcs[count++] = sh->dev[i].page;
852 struct r5dev *dev = &sh->dev[i];
853 if (i != pd_idx)
854 xor_srcs[count++] = dev->page;
1107 }
1108
855 }
856
1109 init_async_submit(&submit, 0, NULL, NULL, NULL,
1110 to_addr_conv(sh, percpu));
1111 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1112 &sh->ops.zero_sum_result, &submit);
857 tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
858 &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
1113
1114 atomic_inc(&sh->count);
859
860 atomic_inc(&sh->count);
1115 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1116 tx = async_trigger_callback(&submit);
861 tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
862 ops_complete_check, sh);
1117}
1118
863}
864
1119static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
865static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
1120{
866{
1121 struct page **srcs = percpu->scribble;
1122 struct async_submit_ctl submit;
1123 int count;
1124
1125 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1126 (unsigned long long)sh->sector, checkp);
1127
1128 count = set_syndrome_sources(srcs, sh);
1129 if (!checkp)
1130 srcs[count] = NULL;
1131
1132 atomic_inc(&sh->count);
1133 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1134 sh, to_addr_conv(sh, percpu));
1135 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1136 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1137}
1138
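Both check routines above ask the engine only for a pass/fail summary in sh->ops.zero_sum_result rather than a full result block: a parity stripe is consistent exactly when P XORed with all of its data blocks cancels to zero everywhere. A toy, self-contained model of that zero-sum test (the helper name xor_is_zero and the sample data are made up):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* returns 1 if the byte-wise XOR of all blocks is zero everywhere */
static int xor_is_zero(const uint8_t *const blocks[], size_t nblocks, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t acc = 0;

		for (size_t b = 0; b < nblocks; b++)
			acc ^= blocks[b][i];
		if (acc)
			return 0;
	}
	return 1;
}

int main(void)
{
	uint8_t d0[4] = { 1, 2, 3, 4 }, d1[4] = { 5, 6, 7, 8 }, p[4];
	const uint8_t *const stripe[] = { p, d0, d1 };

	for (int i = 0; i < 4; i++)
		p[i] = d0[i] ^ d1[i];

	assert(xor_is_zero(stripe, 3, 4));	/* consistent stripe        */
	p[2] ^= 0x40;				/* corrupt the parity...    */
	assert(!xor_is_zero(stripe, 3, 4));	/* ...and the check sees it */
	return 0;
}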
1139static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1140{
1141 int overlap_clear = 0, i, disks = sh->disks;
1142 struct dma_async_tx_descriptor *tx = NULL;
867 int overlap_clear = 0, i, disks = sh->disks;
868 struct dma_async_tx_descriptor *tx = NULL;
1143 raid5_conf_t *conf = sh->raid_conf;
1144 int level = conf->level;
1145 struct raid5_percpu *percpu;
1146 unsigned long cpu;
1147
869
1148 cpu = get_cpu();
1149 percpu = per_cpu_ptr(conf->percpu, cpu);
1150 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1151 ops_run_biofill(sh);
1152 overlap_clear++;
1153 }
1154
1155 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
870 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
871 ops_run_biofill(sh);
872 overlap_clear++;
873 }
874
875 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1156 if (level < 6)
1157 tx = ops_run_compute5(sh, percpu);
1158 else {
1159 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1160 tx = ops_run_compute6_1(sh, percpu);
1161 else
1162 tx = ops_run_compute6_2(sh, percpu);
1163 }
1164 /* terminate the chain if reconstruct is not set to be run */
1165 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
876 tx = ops_run_compute5(sh);
877 /* terminate the chain if postxor is not set to be run */
878 if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
1166 async_tx_ack(tx);
1167 }
1168
1169 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
879 async_tx_ack(tx);
880 }
881
882 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1170 tx = ops_run_prexor(sh, percpu, tx);
883 tx = ops_run_prexor(sh, tx);
1171
1172 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1173 tx = ops_run_biodrain(sh, tx);
1174 overlap_clear++;
1175 }
1176
884
885 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
886 tx = ops_run_biodrain(sh, tx);
887 overlap_clear++;
888 }
889
1177 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1178 if (level < 6)
1179 ops_run_reconstruct5(sh, percpu, tx);
1180 else
1181 ops_run_reconstruct6(sh, percpu, tx);
1182 }
890 if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
891 ops_run_postxor(sh, tx);
1183
892
1184 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1185 if (sh->check_state == check_state_run)
1186 ops_run_check_p(sh, percpu);
1187 else if (sh->check_state == check_state_run_q)
1188 ops_run_check_pq(sh, percpu, 0);
1189 else if (sh->check_state == check_state_run_pq)
1190 ops_run_check_pq(sh, percpu, 1);
1191 else
1192 BUG();
1193 }
893 if (test_bit(STRIPE_OP_CHECK, &ops_request))
894 ops_run_check(sh);
1194
1195 if (overlap_clear)
1196 for (i = disks; i--; ) {
1197 struct r5dev *dev = &sh->dev[i];
1198 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1199 wake_up(&sh->raid_conf->wait_for_overlap);
1200 }
895
896 if (overlap_clear)
897 for (i = disks; i--; ) {
898 struct r5dev *dev = &sh->dev[i];
899 if (test_and_clear_bit(R5_Overlap, &dev->flags))
900 wake_up(&sh->raid_conf->wait_for_overlap);
901 }
1201 put_cpu();
1202}
1203
1204static int grow_one_stripe(raid5_conf_t *conf)
1205{
1206 struct stripe_head *sh;
1207 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
1208 if (!sh)
1209 return 0;

--- 33 unchanged lines hidden (view full) ---

1243 conf->slab_cache = sc;
1244 conf->pool_size = devs;
1245 while (num--)
1246 if (!grow_one_stripe(conf))
1247 return 1;
1248 return 0;
1249}
1250
902}
903
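The new raid_run_ops() above replaces the old raid5_run_ops() pattern of on-stack "struct page *xor_srcs[disks]" arrays (flagged in the old code with "kernel stack size limits the total number of disks") by borrowing a preallocated per-CPU scribble buffer for the duration of the call. A kernel-context fragment (not compilable on its own) isolating that access pattern, with conf as in the code above:

unsigned long cpu;
struct raid5_percpu *percpu;

cpu = get_cpu();			/* pin to this CPU; preemption off   */
percpu = per_cpu_ptr(conf->percpu, cpu);

/* ... percpu->scribble and percpu->spare_page are handed to the
 * ops_run_* helpers; with preemption disabled, no other stripe work
 * can race for this CPU's buffers ...
 */

put_cpu();				/* done with the per-CPU buffers */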
904static int grow_one_stripe(raid5_conf_t *conf)
905{
906 struct stripe_head *sh;
907 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
908 if (!sh)
909 return 0;

--- 33 unchanged lines hidden (view full) ---

943 conf->slab_cache = sc;
944 conf->pool_size = devs;
945 while (num--)
946 if (!grow_one_stripe(conf))
947 return 1;
948 return 0;
949}
950
1251/**
1252 * scribble_len - return the required size of the scribble region
1253 * @num - total number of disks in the array
1254 *
1255 * The size must be enough to contain:
1256 * 1/ a struct page pointer for each device in the array +2
1257 * 2/ room to convert each entry in (1) to its corresponding dma
1258 * (dma_map_page()) or page (page_address()) address.
1259 *
1260 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1261 * calculate over all devices (not just the data blocks), using zeros in place
1262 * of the P and Q blocks.
1263 */
1264static size_t scribble_len(int num)
1265{
1266 size_t len;
1267
1268 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1269
1270 return len;
1271}
1272
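Combined with to_addr_conv() earlier in this diff, which returns percpu->scribble + sizeof(struct page *) * (sh->disks + 2), this gives each per-CPU scribble buffer a two-part layout. Sketched here for a hypothetical num = 8 device array, with page pointers assumed 8 bytes on a 64-bit build and sizeof(addr_conv_t) left symbolic:

/*
 *   offset 0:
 *       struct page *[num + 2]               -> 10 pointers, 80 bytes
 *   offset sizeof(struct page *) * (num + 2):
 *       addr_conv_t[num + 2]                 -> 10 entries
 *
 * The first part is the source/destination page list handed to
 * async_xor()/async_gen_syndrome(); the second is the address-conversion
 * workspace those routines receive via the last init_async_submit()
 * argument.
 */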
1273static int resize_stripes(raid5_conf_t *conf, int newsize)
1274{
1275 /* Make all the stripes able to hold 'newsize' devices.
1276 * New slots in each stripe get 'page' set to a new page.
1277 *
1278 * This happens in stages:
1279 * 1/ create a new kmem_cache and allocate the required number of
1280 * stripe_heads.

--- 12 unchanged lines hidden (view full) ---

1293 * active service.
1294 *
1295 * Once step2 is started, we cannot afford to wait for a write,
1296 * so we use GFP_NOIO allocations.
1297 */
1298 struct stripe_head *osh, *nsh;
1299 LIST_HEAD(newstripes);
1300 struct disk_info *ndisks;
951static int resize_stripes(raid5_conf_t *conf, int newsize)
952{
953 /* Make all the stripes able to hold 'newsize' devices.
954 * New slots in each stripe get 'page' set to a new page.
955 *
956 * This happens in stages:
957 * 1/ create a new kmem_cache and allocate the required number of
958 * stripe_heads.

--- 12 unchanged lines hidden (view full) ---

971 * active service.
972 *
973 * Once step2 is started, we cannot afford to wait for a write,
974 * so we use GFP_NOIO allocations.
975 */
976 struct stripe_head *osh, *nsh;
977 LIST_HEAD(newstripes);
978 struct disk_info *ndisks;
1301 unsigned long cpu;
1302 int err;
1303 struct kmem_cache *sc;
1304 int i;
1305
1306 if (newsize <= conf->pool_size)
1307 return 0; /* never bother to shrink */
1308
1309 err = md_allow_write(conf->mddev);

--- 49 unchanged lines hidden (view full) ---

1359 nsh->dev[i].page = NULL;
1360 kmem_cache_free(conf->slab_cache, osh);
1361 }
1362 kmem_cache_destroy(conf->slab_cache);
1363
1364 /* Step 3.
1365 * At this point, we are holding all the stripes so the array
1366 * is completely stalled, so now is a good time to resize
979 int err;
980 struct kmem_cache *sc;
981 int i;
982
983 if (newsize <= conf->pool_size)
984 return 0; /* never bother to shrink */
985
986 err = md_allow_write(conf->mddev);

--- 49 unchanged lines hidden (view full) ---

1036 nsh->dev[i].page = NULL;
1037 kmem_cache_free(conf->slab_cache, osh);
1038 }
1039 kmem_cache_destroy(conf->slab_cache);
1040
1041 /* Step 3.
1042 * At this point, we are holding all the stripes so the array
1043 * is completely stalled, so now is a good time to resize
1367 * conf->disks and the scribble region
1044 * conf->disks.
1368 */
1369 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1370 if (ndisks) {
1371 for (i=0; i<conf->raid_disks; i++)
1372 ndisks[i] = conf->disks[i];
1373 kfree(conf->disks);
1374 conf->disks = ndisks;
1375 } else
1376 err = -ENOMEM;
1377
1045 */
1046 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1047 if (ndisks) {
1048 for (i=0; i<conf->raid_disks; i++)
1049 ndisks[i] = conf->disks[i];
1050 kfree(conf->disks);
1051 conf->disks = ndisks;
1052 } else
1053 err = -ENOMEM;
1054
1378 get_online_cpus();
1379 conf->scribble_len = scribble_len(newsize);
1380 for_each_present_cpu(cpu) {
1381 struct raid5_percpu *percpu;
1382 void *scribble;
1383
1384 percpu = per_cpu_ptr(conf->percpu, cpu);
1385 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1386
1387 if (scribble) {
1388 kfree(percpu->scribble);
1389 percpu->scribble = scribble;
1390 } else {
1391 err = -ENOMEM;
1392 break;
1393 }
1394 }
1395 put_online_cpus();
1396
1397 /* Step 4, return new stripes to service */
1398 while(!list_empty(&newstripes)) {
1399 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1400 list_del_init(&nsh->lru);
1055 /* Step 4, return new stripes to service */
1056 while(!list_empty(&newstripes)) {
1057 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1058 list_del_init(&nsh->lru);
1401
1402 for (i=conf->raid_disks; i < newsize; i++)
1403 if (nsh->dev[i].page == NULL) {
1404 struct page *p = alloc_page(GFP_NOIO);
1405 nsh->dev[i].page = p;
1406 if (!p)
1407 err = -ENOMEM;
1408 }
1409 release_stripe(nsh);

--- 522 unchanged lines hidden (view full) ---

1932 || sh2.qd_idx != sh->qd_idx) {
1933 printk(KERN_ERR "compute_blocknr: map not correct\n");
1934 return 0;
1935 }
1936 return r_sector;
1937}
1938
1939
1059 for (i=conf->raid_disks; i < newsize; i++)
1060 if (nsh->dev[i].page == NULL) {
1061 struct page *p = alloc_page(GFP_NOIO);
1062 nsh->dev[i].page = p;
1063 if (!p)
1064 err = -ENOMEM;
1065 }
1066 release_stripe(nsh);

--- 522 unchanged lines hidden (view full) ---

1589 || sh2.qd_idx != sh->qd_idx) {
1590 printk(KERN_ERR "compute_blocknr: map not correct\n");
1591 return 0;
1592 }
1593 return r_sector;
1594}
1595
1596
1597
1598/*
1599 * Copy data between a page in the stripe cache, and one or more bion
1600 * The page could align with the middle of the bio, or there could be
1601 * several bion, each with several bio_vecs, which cover part of the page
1602 * Multiple bion are linked together on bi_next. There may be extras
1603 * at the end of this list. We ignore them.
1604 */
1605static void copy_data(int frombio, struct bio *bio,
1606 struct page *page,
1607 sector_t sector)
1608{
1609 char *pa = page_address(page);
1610 struct bio_vec *bvl;
1611 int i;
1612 int page_offset;
1613
1614 if (bio->bi_sector >= sector)
1615 page_offset = (signed)(bio->bi_sector - sector) * 512;
1616 else
1617 page_offset = (signed)(sector - bio->bi_sector) * -512;
1618 bio_for_each_segment(bvl, bio, i) {
1619 int len = bio_iovec_idx(bio,i)->bv_len;
1620 int clen;
1621 int b_offset = 0;
1622
1623 if (page_offset < 0) {
1624 b_offset = -page_offset;
1625 page_offset += b_offset;
1626 len -= b_offset;
1627 }
1628
1629 if (len > 0 && page_offset + len > STRIPE_SIZE)
1630 clen = STRIPE_SIZE - page_offset;
1631 else clen = len;
1632
1633 if (clen > 0) {
1634 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
1635 if (frombio)
1636 memcpy(pa+page_offset, ba+b_offset, clen);
1637 else
1638 memcpy(ba+b_offset, pa+page_offset, clen);
1639 __bio_kunmap_atomic(ba, KM_USER0);
1640 }
1641 if (clen < len) /* hit end of page */
1642 break;
1643 page_offset += len;
1644 }
1645}
1646
1647#define check_xor() do { \
1648 if (count == MAX_XOR_BLOCKS) { \
1649 xor_blocks(count, STRIPE_SIZE, dest, ptr);\
1650 count = 0; \
1651 } \
1652 } while(0)
1653
1654static void compute_parity6(struct stripe_head *sh, int method)
1655{
1656 raid5_conf_t *conf = sh->raid_conf;
1657 int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
1658 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1659 struct bio *chosen;
1660 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1661 void *ptrs[syndrome_disks+2];
1662
1663 pd_idx = sh->pd_idx;
1664 qd_idx = sh->qd_idx;
1665 d0_idx = raid6_d0(sh);
1666
1667 pr_debug("compute_parity, stripe %llu, method %d\n",
1668 (unsigned long long)sh->sector, method);
1669
1670 switch(method) {
1671 case READ_MODIFY_WRITE:
1672 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
1673 case RECONSTRUCT_WRITE:
1674 for (i= disks; i-- ;)
1675 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
1676 chosen = sh->dev[i].towrite;
1677 sh->dev[i].towrite = NULL;
1678
1679 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1680 wake_up(&conf->wait_for_overlap);
1681
1682 BUG_ON(sh->dev[i].written);
1683 sh->dev[i].written = chosen;
1684 }
1685 break;
1686 case CHECK_PARITY:
1687 BUG(); /* Not implemented yet */
1688 }
1689
1690 for (i = disks; i--;)
1691 if (sh->dev[i].written) {
1692 sector_t sector = sh->dev[i].sector;
1693 struct bio *wbi = sh->dev[i].written;
1694 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
1695 copy_data(1, wbi, sh->dev[i].page, sector);
1696 wbi = r5_next_bio(wbi, sector);
1697 }
1698
1699 set_bit(R5_LOCKED, &sh->dev[i].flags);
1700 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1701 }
1702
1703 /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
1704
1705 for (i = 0; i < disks; i++)
1706 ptrs[i] = (void *)raid6_empty_zero_page;
1707
1708 count = 0;
1709 i = d0_idx;
1710 do {
1711 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1712
1713 ptrs[slot] = page_address(sh->dev[i].page);
1714 if (slot < syndrome_disks &&
1715 !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
1716 printk(KERN_ERR "block %d/%d not uptodate "
1717 "on parity calc\n", i, count);
1718 BUG();
1719 }
1720
1721 i = raid6_next_disk(i, disks);
1722 } while (i != d0_idx);
1723 BUG_ON(count != syndrome_disks);
1724
1725 raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
1726
1727 switch(method) {
1728 case RECONSTRUCT_WRITE:
1729 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1730 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1731 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1732 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
1733 break;
1734 case UPDATE_PARITY:
1735 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1736 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
1737 break;
1738 }
1739}
1740
1741
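For background (not stated in this file): raid6_call.gen_syndrome() above, like the async_gen_syndrome() calls elsewhere in this diff, computes the standard RAID-6 syndromes, which is also why the "ordering of the disks matters greatly" comment holds — Q depends on each block's position:

/*
 * P = D_0 xor D_1 xor ... xor D_{n-1}
 * Q = g^0*D_0 xor g^1*D_1 xor ... xor g^{n-1}*D_{n-1}
 *
 * with multiplication and powers taken in GF(2^8) and generator g = {02},
 * so each data page must be presented in its fixed syndrome slot.
 */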
1742/* Compute one missing block */
1743static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
1744{
1745 int i, count, disks = sh->disks;
1746 void *ptr[MAX_XOR_BLOCKS], *dest, *p;
1747 int qd_idx = sh->qd_idx;
1748
1749 pr_debug("compute_block_1, stripe %llu, idx %d\n",
1750 (unsigned long long)sh->sector, dd_idx);
1751
1752 if ( dd_idx == qd_idx ) {
1753 /* We're actually computing the Q drive */
1754 compute_parity6(sh, UPDATE_PARITY);
1755 } else {
1756 dest = page_address(sh->dev[dd_idx].page);
1757 if (!nozero) memset(dest, 0, STRIPE_SIZE);
1758 count = 0;
1759 for (i = disks ; i--; ) {
1760 if (i == dd_idx || i == qd_idx)
1761 continue;
1762 p = page_address(sh->dev[i].page);
1763 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
1764 ptr[count++] = p;
1765 else
1766 printk("compute_block() %d, stripe %llu, %d"
1767 " not present\n", dd_idx,
1768 (unsigned long long)sh->sector, i);
1769
1770 check_xor();
1771 }
1772 if (count)
1773 xor_blocks(count, STRIPE_SIZE, dest, ptr);
1774 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1775 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
1776 }
1777}
1778
1779/* Compute two missing blocks */
1780static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
1781{
1782 int i, count, disks = sh->disks;
1783 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1784 int d0_idx = raid6_d0(sh);
1785 int faila = -1, failb = -1;
1786 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
1787 void *ptrs[syndrome_disks+2];
1788
1789 for (i = 0; i < disks ; i++)
1790 ptrs[i] = (void *)raid6_empty_zero_page;
1791 count = 0;
1792 i = d0_idx;
1793 do {
1794 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1795
1796 ptrs[slot] = page_address(sh->dev[i].page);
1797
1798 if (i == dd_idx1)
1799 faila = slot;
1800 if (i == dd_idx2)
1801 failb = slot;
1802 i = raid6_next_disk(i, disks);
1803 } while (i != d0_idx);
1804 BUG_ON(count != syndrome_disks);
1805
1806 BUG_ON(faila == failb);
1807 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
1808
1809 pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
1810 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
1811 faila, failb);
1812
1813 if (failb == syndrome_disks+1) {
1814 /* Q disk is one of the missing disks */
1815 if (faila == syndrome_disks) {
1816 /* Missing P+Q, just recompute */
1817 compute_parity6(sh, UPDATE_PARITY);
1818 return;
1819 } else {
1820 /* We're missing D+Q; recompute D from P */
1821 compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
1822 dd_idx2 : dd_idx1),
1823 0);
1824 compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
1825 return;
1826 }
1827 }
1828
1829 /* We're missing D+P or D+D; */
1830 if (failb == syndrome_disks) {
1831 /* We're missing D+P. */
1832 raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
1833 } else {
1834 /* We're missing D+D. */
1835 raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
1836 ptrs);
1837 }
1838
1839 /* Both the above update both missing blocks */
1840 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
1841 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
1842}
1843
1940static void
1844static void
1941schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1845schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
1942 int rcw, int expand)
1943{
1944 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1846 int rcw, int expand)
1847{
1848 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1945 raid5_conf_t *conf = sh->raid_conf;
1946 int level = conf->level;
1947
1948 if (rcw) {
1949 /* if we are not expanding this is a proper write request, and
1950 * there will be bios with new data to be drained into the
1951 * stripe cache
1952 */
1953 if (!expand) {
1954 sh->reconstruct_state = reconstruct_state_drain_run;
1955 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1956 } else
1957 sh->reconstruct_state = reconstruct_state_run;
1958
1849
1850 if (rcw) {
1851 /* if we are not expanding this is a proper write request, and
1852 * there will be bios with new data to be drained into the
1853 * stripe cache
1854 */
1855 if (!expand) {
1856 sh->reconstruct_state = reconstruct_state_drain_run;
1857 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1858 } else
1859 sh->reconstruct_state = reconstruct_state_run;
1860
1959 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1861 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1960
1961 for (i = disks; i--; ) {
1962 struct r5dev *dev = &sh->dev[i];
1963
1964 if (dev->towrite) {
1965 set_bit(R5_LOCKED, &dev->flags);
1966 set_bit(R5_Wantdrain, &dev->flags);
1967 if (!expand)
1968 clear_bit(R5_UPTODATE, &dev->flags);
1969 s->locked++;
1970 }
1971 }
1862
1863 for (i = disks; i--; ) {
1864 struct r5dev *dev = &sh->dev[i];
1865
1866 if (dev->towrite) {
1867 set_bit(R5_LOCKED, &dev->flags);
1868 set_bit(R5_Wantdrain, &dev->flags);
1869 if (!expand)
1870 clear_bit(R5_UPTODATE, &dev->flags);
1871 s->locked++;
1872 }
1873 }
1972 if (s->locked + conf->max_degraded == disks)
1874 if (s->locked + 1 == disks)
1973 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1875 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1974 atomic_inc(&conf->pending_full_writes);
1876 atomic_inc(&sh->raid_conf->pending_full_writes);
1975 } else {
1877 } else {
1976 BUG_ON(level == 6);
1977 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1978 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1979
1980 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1981 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1982 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1878 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1879 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
1880
1881 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
1882 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
1883 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
1983 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
1884 set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
1984
1985 for (i = disks; i--; ) {
1986 struct r5dev *dev = &sh->dev[i];
1987 if (i == pd_idx)
1988 continue;
1989
1990 if (dev->towrite &&
1991 (test_bit(R5_UPTODATE, &dev->flags) ||
1992 test_bit(R5_Wantcompute, &dev->flags))) {
1993 set_bit(R5_Wantdrain, &dev->flags);
1994 set_bit(R5_LOCKED, &dev->flags);
1995 clear_bit(R5_UPTODATE, &dev->flags);
1996 s->locked++;
1997 }
1998 }
1999 }
2000
1885
1886 for (i = disks; i--; ) {
1887 struct r5dev *dev = &sh->dev[i];
1888 if (i == pd_idx)
1889 continue;
1890
1891 if (dev->towrite &&
1892 (test_bit(R5_UPTODATE, &dev->flags) ||
1893 test_bit(R5_Wantcompute, &dev->flags))) {
1894 set_bit(R5_Wantdrain, &dev->flags);
1895 set_bit(R5_LOCKED, &dev->flags);
1896 clear_bit(R5_UPTODATE, &dev->flags);
1897 s->locked++;
1898 }
1899 }
1900 }
1901
2001 /* keep the parity disk(s) locked while asynchronous operations
1902 /* keep the parity disk locked while asynchronous operations
2002 * are in flight
2003 */
2004 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2005 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2006 s->locked++;
2007
1903 * are in flight
1904 */
1905 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
1906 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
1907 s->locked++;
1908
2008 if (level == 6) {
2009 int qd_idx = sh->qd_idx;
2010 struct r5dev *dev = &sh->dev[qd_idx];
2011
2012 set_bit(R5_LOCKED, &dev->flags);
2013 clear_bit(R5_UPTODATE, &dev->flags);
2014 s->locked++;
2015 }
2016
2017 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2018 __func__, (unsigned long long)sh->sector,
2019 s->locked, s->ops_request);
2020}
2021
2022/*
2023 * Each stripe/dev can have one or more bion attached.
2024 * toread/towrite point to the first in a chain.

--- 64 unchanged lines hidden (view full) ---

2089 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2090 spin_unlock_irq(&conf->device_lock);
2091 spin_unlock(&sh->lock);
2092 return 0;
2093}
2094
2095static void end_reshape(raid5_conf_t *conf);
2096
1909 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
1910 __func__, (unsigned long long)sh->sector,
1911 s->locked, s->ops_request);
1912}
1913
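The full-stripe-write test in the hunk above changes from "s->locked + 1 == disks" to "s->locked + conf->max_degraded == disks" so one routine covers both parity layouts: the drain loop locks every data block being written, and the parity blocks — one for RAID-4/5, two for RAID-6, which is what max_degraded counts in this driver — are locked separately just below.

/*
 * Worked case (hypothetical 8-device RAID-6, full-stripe write):
 *   data blocks locked by the drain loop:  6
 *   conf->max_degraded (P and Q):          2
 *   6 + 2 == 8 == disks  ->  STRIPE_FULL_WRITE is set
 *
 * With the old RAID-5-only form, 7 locked data blocks + 1 == 8 covered
 * the same situation for a single parity disk.
 */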
1914/*
1915 * Each stripe/dev can have one or more bion attached.
1916 * toread/towrite point to the first in a chain.

--- 64 unchanged lines hidden (view full) ---

1981 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
1982 spin_unlock_irq(&conf->device_lock);
1983 spin_unlock(&sh->lock);
1984 return 0;
1985}
1986
1987static void end_reshape(raid5_conf_t *conf);
1988
1989static int page_is_zero(struct page *p)
1990{
1991 char *a = page_address(p);
1992 return ((*(u32*)a) == 0 &&
1993 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1994}
1995
2097static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2098 struct stripe_head *sh)
2099{
2100 int sectors_per_chunk =
2101 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2102 int dd_idx;
2103 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2104 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;

--- 123 unchanged lines hidden (view full) ---

2228 * otherwise read it if the backing disk is insync
2229 */
2230 if ((s->uptodate == disks - 1) &&
2231 (s->failed && disk_idx == s->failed_num)) {
2232 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2233 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2234 set_bit(R5_Wantcompute, &dev->flags);
2235 sh->ops.target = disk_idx;
1996static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
1997 struct stripe_head *sh)
1998{
1999 int sectors_per_chunk =
2000 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2001 int dd_idx;
2002 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2003 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;

--- 123 unchanged lines hidden (view full) ---

2127 * otherwise read it if the backing disk is insync
2128 */
2129 if ((s->uptodate == disks - 1) &&
2130 (s->failed && disk_idx == s->failed_num)) {
2131 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2132 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2133 set_bit(R5_Wantcompute, &dev->flags);
2134 sh->ops.target = disk_idx;
2236 sh->ops.target2 = -1;
2237 s->req_compute = 1;
2238 /* Careful: from this point on 'uptodate' is in the eye
2135 s->req_compute = 1;
2136 /* Careful: from this point on 'uptodate' is in the eye
2239 * of raid_run_ops which services 'compute' operations
2137 * of raid5_run_ops which services 'compute' operations
2240 * before writes. R5_Wantcompute flags a block that will
2241 * be R5_UPTODATE by the time it is needed for a
2242 * subsequent operation.
2243 */
2244 s->uptodate++;
2245 return 1; /* uptodate + compute == disks */
2246 } else if (test_bit(R5_Insync, &dev->flags)) {
2247 set_bit(R5_LOCKED, &dev->flags);

--- 22 unchanged lines hidden (view full) ---

2270 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2271 !sh->reconstruct_state)
2272 for (i = disks; i--; )
2273 if (fetch_block5(sh, s, i, disks))
2274 break;
2275 set_bit(STRIPE_HANDLE, &sh->state);
2276}
2277
2138 * before writes. R5_Wantcompute flags a block that will
2139 * be R5_UPTODATE by the time it is needed for a
2140 * subsequent operation.
2141 */
2142 s->uptodate++;
2143 return 1; /* uptodate + compute == disks */
2144 } else if (test_bit(R5_Insync, &dev->flags)) {
2145 set_bit(R5_LOCKED, &dev->flags);

--- 22 unchanged lines hidden (view full) ---

2168 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2169 !sh->reconstruct_state)
2170 for (i = disks; i--; )
2171 if (fetch_block5(sh, s, i, disks))
2172 break;
2173 set_bit(STRIPE_HANDLE, &sh->state);
2174}
2175
2278/* fetch_block6 - checks the given member device to see if its data needs
2279 * to be read or computed to satisfy a request.
2280 *
2281 * Returns 1 when no more member devices need to be checked, otherwise returns
2282 * 0 to tell the loop in handle_stripe_fill6 to continue
2283 */
2284static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2285 struct r6_state *r6s, int disk_idx, int disks)
2286{
2287 struct r5dev *dev = &sh->dev[disk_idx];
2288 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2289 &sh->dev[r6s->failed_num[1]] };
2290
2291 if (!test_bit(R5_LOCKED, &dev->flags) &&
2292 !test_bit(R5_UPTODATE, &dev->flags) &&
2293 (dev->toread ||
2294 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2295 s->syncing || s->expanding ||
2296 (s->failed >= 1 &&
2297 (fdev[0]->toread || s->to_write)) ||
2298 (s->failed >= 2 &&
2299 (fdev[1]->toread || s->to_write)))) {
2300 /* we would like to get this block, possibly by computing it,
2301 * otherwise read it if the backing disk is insync
2302 */
2303 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2304 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2305 if ((s->uptodate == disks - 1) &&
2306 (s->failed && (disk_idx == r6s->failed_num[0] ||
2307 disk_idx == r6s->failed_num[1]))) {
2308 /* the disk has failed and we've been asked to fetch its block;
2309 * so compute it
2310 */
2311 pr_debug("Computing stripe %llu block %d\n",
2312 (unsigned long long)sh->sector, disk_idx);
2313 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2314 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2315 set_bit(R5_Wantcompute, &dev->flags);
2316 sh->ops.target = disk_idx;
2317 sh->ops.target2 = -1; /* no 2nd target */
2318 s->req_compute = 1;
2319 s->uptodate++;
2320 return 1;
2321 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2322 /* Computing 2-failure is *very* expensive; only
2323 * do it if failed >= 2
2324 */
2325 int other;
2326 for (other = disks; other--; ) {
2327 if (other == disk_idx)
2328 continue;
2329 if (!test_bit(R5_UPTODATE,
2330 &sh->dev[other].flags))
2331 break;
2332 }
2333 BUG_ON(other < 0);
2334 pr_debug("Computing stripe %llu blocks %d,%d\n",
2335 (unsigned long long)sh->sector,
2336 disk_idx, other);
2337 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2338 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2339 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2340 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2341 sh->ops.target = disk_idx;
2342 sh->ops.target2 = other;
2343 s->uptodate += 2;
2344 s->req_compute = 1;
2345 return 1;
2346 } else if (test_bit(R5_Insync, &dev->flags)) {
2347 set_bit(R5_LOCKED, &dev->flags);
2348 set_bit(R5_Wantread, &dev->flags);
2349 s->locked++;
2350 pr_debug("Reading block %d (sync=%d)\n",
2351 disk_idx, s->syncing);
2352 }
2353 }
2354
2355 return 0;
2356}
2357
2358/**
2359 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2360 */
2361static void handle_stripe_fill6(struct stripe_head *sh,
2362 struct stripe_head_state *s, struct r6_state *r6s,
2363 int disks)
2364{
2365 int i;
2176static void handle_stripe_fill6(struct stripe_head *sh,
2177 struct stripe_head_state *s, struct r6_state *r6s,
2178 int disks)
2179{
2180 int i;
2366
2367 /* look for blocks to read/compute, skip this if a compute
2368 * is already in flight, or if the stripe contents are in the
2369 * midst of changing due to a write
2370 */
2371 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2372 !sh->reconstruct_state)
2373 for (i = disks; i--; )
2374 if (fetch_block6(sh, s, r6s, i, disks))
2375 break;
2181 for (i = disks; i--; ) {
2182 struct r5dev *dev = &sh->dev[i];
2183 if (!test_bit(R5_LOCKED, &dev->flags) &&
2184 !test_bit(R5_UPTODATE, &dev->flags) &&
2185 (dev->toread || (dev->towrite &&
2186 !test_bit(R5_OVERWRITE, &dev->flags)) ||
2187 s->syncing || s->expanding ||
2188 (s->failed >= 1 &&
2189 (sh->dev[r6s->failed_num[0]].toread ||
2190 s->to_write)) ||
2191 (s->failed >= 2 &&
2192 (sh->dev[r6s->failed_num[1]].toread ||
2193 s->to_write)))) {
2194 /* we would like to get this block, possibly
2195 * by computing it, but we might not be able to
2196 */
2197 if ((s->uptodate == disks - 1) &&
2198 (s->failed && (i == r6s->failed_num[0] ||
2199 i == r6s->failed_num[1]))) {
2200 pr_debug("Computing stripe %llu block %d\n",
2201 (unsigned long long)sh->sector, i);
2202 compute_block_1(sh, i, 0);
2203 s->uptodate++;
2204 } else if ( s->uptodate == disks-2 && s->failed >= 2 ) {
2205 /* Computing 2-failure is *very* expensive; only
2206 * do it if failed >= 2
2207 */
2208 int other;
2209 for (other = disks; other--; ) {
2210 if (other == i)
2211 continue;
2212 if (!test_bit(R5_UPTODATE,
2213 &sh->dev[other].flags))
2214 break;
2215 }
2216 BUG_ON(other < 0);
2217 pr_debug("Computing stripe %llu blocks %d,%d\n",
2218 (unsigned long long)sh->sector,
2219 i, other);
2220 compute_block_2(sh, i, other);
2221 s->uptodate += 2;
2222 } else if (test_bit(R5_Insync, &dev->flags)) {
2223 set_bit(R5_LOCKED, &dev->flags);
2224 set_bit(R5_Wantread, &dev->flags);
2225 s->locked++;
2226 pr_debug("Reading block %d (sync=%d)\n",
2227 i, s->syncing);
2228 }
2229 }
2230 }
2376 set_bit(STRIPE_HANDLE, &sh->state);
2377}
2378
2379
2380/* handle_stripe_clean_event
2381 * any written block on an uptodate or failed drive can be returned.
2382 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2383 * never LOCKED, so we don't need to test 'failed' directly.

--- 117 unchanged lines hidden ---

2501 }
2502 }
2503 }
2504 /* now if nothing is locked, and if we have enough data,
2505 * we can start a write request
2506 */
2507 /* since handle_stripe can be called at any time we need to handle the
2508 * case where a compute block operation has been submitted and then a
2231 set_bit(STRIPE_HANDLE, &sh->state);
2232}
2233
2234
2235/* handle_stripe_clean_event
2236 * any written block on an uptodate or failed drive can be returned.
2237 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2238 * never LOCKED, so we don't need to test 'failed' directly.

--- 117 unchanged lines hidden ---

2356 }
2357 }
2358 }
2359 /* now if nothing is locked, and if we have enough data,
2360 * we can start a write request
2361 */
2362 /* since handle_stripe can be called at any time we need to handle the
2363 * case where a compute block operation has been submitted and then a
2509 * subsequent call wants to start a write request. raid_run_ops only
2510 * handles the case where compute block and reconstruct are requested
2364 * subsequent call wants to start a write request. raid5_run_ops only
2365 * handles the case where compute block and postxor are requested
2511 * simultaneously. If this is not the case then new writes need to be
2512 * held off until the compute completes.
2513 */
2514 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2515 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2516 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2366 * simultaneously. If this is not the case then new writes need to be
2367 * held off until the compute completes.
2368 */
2369 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2370 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2371 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2517 schedule_reconstruction(sh, s, rcw == 0, 0);
2372 schedule_reconstruction5(sh, s, rcw == 0, 0);
2518}
2519
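Editorial aside (illustrative only): the rmw/rcw counters tested just above track how many blocks each write strategy would still have to read in, and the reconstruction is scheduled once one of the counts drops to zero. The helpers below are hypothetical (not driver code) and merely spell out the classic read-cost comparison behind keeping both read-modify-write and reconstruct-write as options.

/*
 * Rough read cost for a RAID-5 stripe of 'disks' members (one of them
 * parity) when 'new_blocks' data blocks are being fully overwritten.
 */

/* read-modify-write: read the old copies being replaced, plus old parity */
static inline int rmw_read_cost(int disks, int new_blocks)
{
	return new_blocks + 1;
}

/* reconstruct-write: read every data block that is NOT being overwritten */
static inline int rcw_read_cost(int disks, int new_blocks)
{
	return (disks - 1) - new_blocks;
}

/*
 * e.g. disks = 6: overwriting 1 block  -> rmw reads 2, rcw reads 4;
 *                 overwriting 4 blocks -> rmw reads 5, rcw reads 1.
 */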
2520static void handle_stripe_dirtying6(raid5_conf_t *conf,
2521 struct stripe_head *sh, struct stripe_head_state *s,
2522 struct r6_state *r6s, int disks)
2523{
2373}
2374
2375static void handle_stripe_dirtying6(raid5_conf_t *conf,
2376 struct stripe_head *sh, struct stripe_head_state *s,
2377 struct r6_state *r6s, int disks)
2378{
2524 int rcw = 0, pd_idx = sh->pd_idx, i;
2379 int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
2525 int qd_idx = sh->qd_idx;
2380 int qd_idx = sh->qd_idx;
2526
2527 set_bit(STRIPE_HANDLE, &sh->state);
2528 for (i = disks; i--; ) {
2529 struct r5dev *dev = &sh->dev[i];
2381 for (i = disks; i--; ) {
2382 struct r5dev *dev = &sh->dev[i];
2530 /* check if we haven't enough data */
2531 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2532 i != pd_idx && i != qd_idx &&
2533 !test_bit(R5_LOCKED, &dev->flags) &&
2534 !(test_bit(R5_UPTODATE, &dev->flags) ||
2535 test_bit(R5_Wantcompute, &dev->flags))) {
2536 rcw++;
2537 if (!test_bit(R5_Insync, &dev->flags))
2538 continue; /* it's a failed drive */
2539
2540 if (
2541 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2542 pr_debug("Read_old stripe %llu "
2543 "block %d for Reconstruct\n",
2544 (unsigned long long)sh->sector, i);
2545 set_bit(R5_LOCKED, &dev->flags);
2546 set_bit(R5_Wantread, &dev->flags);
2547 s->locked++;
2548 } else {
2549 pr_debug("Request delayed stripe %llu "
2550 "block %d for Reconstruct\n",
2551 (unsigned long long)sh->sector, i);
2552 set_bit(STRIPE_DELAYED, &sh->state);
2553 set_bit(STRIPE_HANDLE, &sh->state);
2383 /* Would I have to read this buffer for reconstruct_write */
2384 if (!test_bit(R5_OVERWRITE, &dev->flags)
2385 && i != pd_idx && i != qd_idx
2386 && (!test_bit(R5_LOCKED, &dev->flags)
2387 ) &&
2388 !test_bit(R5_UPTODATE, &dev->flags)) {
2389 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2390 else {
2391 pr_debug("raid6: must_compute: "
2392 "disk %d flags=%#lx\n", i, dev->flags);
2393 must_compute++;
2554 }
2555 }
2556 }
2394 }
2395 }
2396 }
2397 pr_debug("for sector %llu, rcw=%d, must_compute=%d\n",
2398 (unsigned long long)sh->sector, rcw, must_compute);
2399 set_bit(STRIPE_HANDLE, &sh->state);
2400
2401 if (rcw > 0)
2402 /* want reconstruct write, but need to get some data */
2403 for (i = disks; i--; ) {
2404 struct r5dev *dev = &sh->dev[i];
2405 if (!test_bit(R5_OVERWRITE, &dev->flags)
2406 && !(s->failed == 0 && (i == pd_idx || i == qd_idx))
2407 && !test_bit(R5_LOCKED, &dev->flags) &&
2408 !test_bit(R5_UPTODATE, &dev->flags) &&
2409 test_bit(R5_Insync, &dev->flags)) {
2410 if (
2411 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2412 pr_debug("Read_old stripe %llu "
2413 "block %d for Reconstruct\n",
2414 (unsigned long long)sh->sector, i);
2415 set_bit(R5_LOCKED, &dev->flags);
2416 set_bit(R5_Wantread, &dev->flags);
2417 s->locked++;
2418 } else {
2419 pr_debug("Request delayed stripe %llu "
2420 "block %d for Reconstruct\n",
2421 (unsigned long long)sh->sector, i);
2422 set_bit(STRIPE_DELAYED, &sh->state);
2423 set_bit(STRIPE_HANDLE, &sh->state);
2424 }
2425 }
2426 }
2557 /* now if nothing is locked, and if we have enough data, we can start a
2558 * write request
2559 */
2427 /* now if nothing is locked, and if we have enough data, we can start a
2428 * write request
2429 */
2560 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2561 s->locked == 0 && rcw == 0 &&
2430 if (s->locked == 0 && rcw == 0 &&
2562 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2431 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2563 schedule_reconstruction(sh, s, 1, 0);
2432 if (must_compute > 0) {
2433 /* We have failed blocks and need to compute them */
2434 switch (s->failed) {
2435 case 0:
2436 BUG();
2437 case 1:
2438 compute_block_1(sh, r6s->failed_num[0], 0);
2439 break;
2440 case 2:
2441 compute_block_2(sh, r6s->failed_num[0],
2442 r6s->failed_num[1]);
2443 break;
2444 default: /* This request should have been failed? */
2445 BUG();
2446 }
2447 }
2448
2449 pr_debug("Computing parity for stripe %llu\n",
2450 (unsigned long long)sh->sector);
2451 compute_parity6(sh, RECONSTRUCT_WRITE);
2452 /* now every locked buffer is ready to be written */
2453 for (i = disks; i--; )
2454 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
2455 pr_debug("Writing stripe %llu block %d\n",
2456 (unsigned long long)sh->sector, i);
2457 s->locked++;
2458 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2459 }
2460 if (s->locked == disks)
2461 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2462 atomic_inc(&conf->pending_full_writes);
2463 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2464 set_bit(STRIPE_INSYNC, &sh->state);
2465
2466 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2467 atomic_dec(&conf->preread_active_stripes);
2468 if (atomic_read(&conf->preread_active_stripes) <
2469 IO_THRESHOLD)
2470 md_wakeup_thread(conf->mddev->thread);
2471 }
2564 }
2565}
2566
2567static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2568 struct stripe_head_state *s, int disks)
2569{
2570 struct r5dev *dev = NULL;
2571

--- 42 unchanged lines hidden ---

2614 */
2615 if (s->failed)
2616 break;
2617
2618 /* handle a successful check operation, if parity is correct
2619 * we are done. Otherwise update the mismatch count and repair
2620 * parity if !MD_RECOVERY_CHECK
2621 */
2472 }
2473}
2474
2475static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2476 struct stripe_head_state *s, int disks)
2477{
2478 struct r5dev *dev = NULL;
2479

--- 42 unchanged lines hidden ---

2522 */
2523 if (s->failed)
2524 break;
2525
2526 /* handle a successful check operation, if parity is correct
2527 * we are done. Otherwise update the mismatch count and repair
2528 * parity if !MD_RECOVERY_CHECK
2529 */
2622 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2530 if (sh->ops.zero_sum_result == 0)
2623 /* parity is correct (on disc,
2624 * not in buffer any more)
2625 */
2626 set_bit(STRIPE_INSYNC, &sh->state);
2627 else {
2628 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2629 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2630 /* don't try to repair!! */
2631 set_bit(STRIPE_INSYNC, &sh->state);
2632 else {
2633 sh->check_state = check_state_compute_run;
2634 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2635 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2636 set_bit(R5_Wantcompute,
2637 &sh->dev[sh->pd_idx].flags);
2638 sh->ops.target = sh->pd_idx;
2531 /* parity is correct (on disc,
2532 * not in buffer any more)
2533 */
2534 set_bit(STRIPE_INSYNC, &sh->state);
2535 else {
2536 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2537 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2538 /* don't try to repair!! */
2539 set_bit(STRIPE_INSYNC, &sh->state);
2540 else {
2541 sh->check_state = check_state_compute_run;
2542 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2543 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2544 set_bit(R5_Wantcompute,
2545 &sh->dev[sh->pd_idx].flags);
2546 sh->ops.target = sh->pd_idx;
2639 sh->ops.target2 = -1;
2640 s->uptodate++;
2641 }
2642 }
2643 break;
2644 case check_state_compute_run:
2645 break;
2646 default:
2647 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2648 __func__, sh->check_state,
2649 (unsigned long long) sh->sector);
2650 BUG();
2651 }
2652}
2653
2654
2655static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2547 s->uptodate++;
2548 }
2549 }
2550 break;
2551 case check_state_compute_run:
2552 break;
2553 default:
2554 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2555 __func__, sh->check_state,
2556 (unsigned long long) sh->sector);
2557 BUG();
2558 }
2559}
2560
2561
2562static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2656 struct stripe_head_state *s,
2657 struct r6_state *r6s, int disks)
2563 struct stripe_head_state *s,
2564 struct r6_state *r6s, struct page *tmp_page,
2565 int disks)
2658{
2566{
2567 int update_p = 0, update_q = 0;
2568 struct r5dev *dev;
2659 int pd_idx = sh->pd_idx;
2660 int qd_idx = sh->qd_idx;
2569 int pd_idx = sh->pd_idx;
2570 int qd_idx = sh->qd_idx;
2661 struct r5dev *dev;
2662
2663 set_bit(STRIPE_HANDLE, &sh->state);
2664
2665 BUG_ON(s->failed > 2);
2571
2572 set_bit(STRIPE_HANDLE, &sh->state);
2573
2574 BUG_ON(s->failed > 2);
2666
2575 BUG_ON(s->uptodate < disks);
2667 /* Want to check and possibly repair P and Q.
2668 * However there could be one 'failed' device, in which
2669 * case we can only check one of them, possibly using the
2670 * other to generate missing data
2671 */
2672
2576 /* Want to check and possibly repair P and Q.
2577 * However there could be one 'failed' device, in which
2578 * case we can only check one of them, possibly using the
2579 * other to generate missing data
2580 */
2581
2673 switch (sh->check_state) {
2674 case check_state_idle:
2675 /* start a new check operation if there are < 2 failures */
2582 /* If !tmp_page, we cannot do the calculations,
2583 * but as we have set STRIPE_HANDLE, we will soon be called
2584 * by handle_stripe with a tmp_page - just wait until then.
2585 */
2586 if (tmp_page) {
2676 if (s->failed == r6s->q_failed) {
2587 if (s->failed == r6s->q_failed) {
2677 /* The only possible failed device holds Q, so it
2588 /* The only possible failed device holds 'Q', so it
2678 * makes sense to check P (If anything else were failed,
2679 * we would have used P to recreate it).
2680 */
2589 * makes sense to check P (If anything else were failed,
2590 * we would have used P to recreate it).
2591 */
2681 sh->check_state = check_state_run;
2592 compute_block_1(sh, pd_idx, 1);
2593 if (!page_is_zero(sh->dev[pd_idx].page)) {
2594 compute_block_1(sh, pd_idx, 0);
2595 update_p = 1;
2596 }
2682 }
2683 if (!r6s->q_failed && s->failed < 2) {
2597 }
2598 if (!r6s->q_failed && s->failed < 2) {
2684 /* Q is not failed, and we didn't use it to generate
2599 /* q is not failed, and we didn't use it to generate
2685 * anything, so it makes sense to check it
2686 */
2600 * anything, so it makes sense to check it
2601 */
2687 if (sh->check_state == check_state_run)
2688 sh->check_state = check_state_run_pq;
2689 else
2690 sh->check_state = check_state_run_q;
2602 memcpy(page_address(tmp_page),
2603 page_address(sh->dev[qd_idx].page),
2604 STRIPE_SIZE);
2605 compute_parity6(sh, UPDATE_PARITY);
2606 if (memcmp(page_address(tmp_page),
2607 page_address(sh->dev[qd_idx].page),
2608 STRIPE_SIZE) != 0) {
2609 clear_bit(STRIPE_INSYNC, &sh->state);
2610 update_q = 1;
2611 }
2691 }
2612 }
2692
2693 /* discard potentially stale zero_sum_result */
2694 sh->ops.zero_sum_result = 0;
2695
2696 if (sh->check_state == check_state_run) {
2697 /* async_xor_zero_sum destroys the contents of P */
2698 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2699 s->uptodate--;
2613 if (update_p || update_q) {
2614 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2615 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2616 /* don't try to repair!! */
2617 update_p = update_q = 0;
2700 }
2618 }
2701 if (sh->check_state >= check_state_run &&
2702 sh->check_state <= check_state_run_pq) {
2703 /* async_syndrome_zero_sum preserves P and Q, so
2704 * no need to mark them !uptodate here
2705 */
2706 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2707 break;
2708 }
2709
2619
2710 /* we have 2-disk failure */
2711 BUG_ON(s->failed != 2);
2712 /* fall through */
2713 case check_state_compute_result:
2714 sh->check_state = check_state_idle;
2715
2716 /* check that a write has not made the stripe insync */
2717 if (test_bit(STRIPE_INSYNC, &sh->state))
2718 break;
2719
2720 /* now write out any block on a failed drive,
2620 /* now write out any block on a failed drive,
2721 * or P or Q if they were recomputed
2621 * or P or Q if they need it
2722 */
2622 */
2723 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2623
2724 if (s->failed == 2) {
2725 dev = &sh->dev[r6s->failed_num[1]];
2726 s->locked++;
2727 set_bit(R5_LOCKED, &dev->flags);
2728 set_bit(R5_Wantwrite, &dev->flags);
2729 }
2730 if (s->failed >= 1) {
2731 dev = &sh->dev[r6s->failed_num[0]];
2732 s->locked++;
2733 set_bit(R5_LOCKED, &dev->flags);
2734 set_bit(R5_Wantwrite, &dev->flags);
2735 }
2624 if (s->failed == 2) {
2625 dev = &sh->dev[r6s->failed_num[1]];
2626 s->locked++;
2627 set_bit(R5_LOCKED, &dev->flags);
2628 set_bit(R5_Wantwrite, &dev->flags);
2629 }
2630 if (s->failed >= 1) {
2631 dev = &sh->dev[r6s->failed_num[0]];
2632 s->locked++;
2633 set_bit(R5_LOCKED, &dev->flags);
2634 set_bit(R5_Wantwrite, &dev->flags);
2635 }
2736 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2636
2637 if (update_p) {
2737 dev = &sh->dev[pd_idx];
2738 s->locked++;
2739 set_bit(R5_LOCKED, &dev->flags);
2740 set_bit(R5_Wantwrite, &dev->flags);
2741 }
2638 dev = &sh->dev[pd_idx];
2639 s->locked++;
2640 set_bit(R5_LOCKED, &dev->flags);
2641 set_bit(R5_Wantwrite, &dev->flags);
2642 }
2742 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2643 if (update_q) {
2743 dev = &sh->dev[qd_idx];
2744 s->locked++;
2745 set_bit(R5_LOCKED, &dev->flags);
2746 set_bit(R5_Wantwrite, &dev->flags);
2747 }
2748 clear_bit(STRIPE_DEGRADED, &sh->state);
2749
2750 set_bit(STRIPE_INSYNC, &sh->state);
2644 dev = &sh->dev[qd_idx];
2645 s->locked++;
2646 set_bit(R5_LOCKED, &dev->flags);
2647 set_bit(R5_Wantwrite, &dev->flags);
2648 }
2649 clear_bit(STRIPE_DEGRADED, &sh->state);
2650
2651 set_bit(STRIPE_INSYNC, &sh->state);
2751 break;
2752 case check_state_run:
2753 case check_state_run_q:
2754 case check_state_run_pq:
2755 break; /* we will be called again upon completion */
2756 case check_state_check_result:
2757 sh->check_state = check_state_idle;
2758
2759 /* handle a successful check operation, if parity is correct
2760 * we are done. Otherwise update the mismatch count and repair
2761 * parity if !MD_RECOVERY_CHECK
2762 */
2763 if (sh->ops.zero_sum_result == 0) {
2764 /* both parities are correct */
2765 if (!s->failed)
2766 set_bit(STRIPE_INSYNC, &sh->state);
2767 else {
2768 /* in contrast to the raid5 case we can validate
2769 * parity, but still have a failure to write
2770 * back
2771 */
2772 sh->check_state = check_state_compute_result;
2773 /* Returning at this point means that we may go
2774 * off and bring p and/or q uptodate again so
2775 * we make sure to check zero_sum_result again
2776 * to verify if p or q need writeback
2777 */
2778 }
2779 } else {
2780 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2781 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2782 /* don't try to repair!! */
2783 set_bit(STRIPE_INSYNC, &sh->state);
2784 else {
2785 int *target = &sh->ops.target;
2786
2787 sh->ops.target = -1;
2788 sh->ops.target2 = -1;
2789 sh->check_state = check_state_compute_run;
2790 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2791 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2792 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2793 set_bit(R5_Wantcompute,
2794 &sh->dev[pd_idx].flags);
2795 *target = pd_idx;
2796 target = &sh->ops.target2;
2797 s->uptodate++;
2798 }
2799 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2800 set_bit(R5_Wantcompute,
2801 &sh->dev[qd_idx].flags);
2802 *target = qd_idx;
2803 s->uptodate++;
2804 }
2805 }
2806 }
2807 break;
2808 case check_state_compute_run:
2809 break;
2810 default:
2811 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2812 __func__, sh->check_state,
2813 (unsigned long long) sh->sector);
2814 BUG();
2815 }
2816}
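Editorial summary (not present in the source): the RAID-6 check/repair code above is driven by a small state machine; the states are the ones named in the switch statement.

/*
 *   check_state_idle
 *      -> check_state_run     zero-sum check of P only (the one failed
 *                             device holds Q)
 *      -> check_state_run_q   zero-sum check of Q only
 *      -> check_state_run_pq  zero-sum check of both P and Q
 *      (with two failed devices it falls straight through to the
 *       compute_result handling instead)
 *   check_state_check_result
 *      -> STRIPE_INSYNC when sh->ops.zero_sum_result is clean and nothing
 *         failed, or via check_state_compute_result when a failed block
 *         still has to be written back
 *      -> check_state_compute_run on a mismatch (unless MD_RECOVERY_CHECK
 *         forbids repair), recomputing whichever of P/Q the
 *         SUM_CHECK_P_RESULT / SUM_CHECK_Q_RESULT bits flag as bad
 *   check_state_compute_result
 *      -> write out blocks on failed drives plus any recomputed P/Q and
 *         mark the stripe STRIPE_INSYNC
 */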
2817
2818static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2819 struct r6_state *r6s)
2820{
2821 int i;
2822
2823 /* We have read all the blocks in this stripe and now we need to
2824 * copy some of them into a target stripe for expand.
2825 */
2826 struct dma_async_tx_descriptor *tx = NULL;
2827 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2828 for (i = 0; i < sh->disks; i++)
2829 if (i != sh->pd_idx && i != sh->qd_idx) {
2830 int dd_idx, j;
2831 struct stripe_head *sh2;
2652 }
2653}
2654
2655static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2656 struct r6_state *r6s)
2657{
2658 int i;
2659
2660 /* We have read all the blocks in this stripe and now we need to
2661 * copy some of them into a target stripe for expand.
2662 */
2663 struct dma_async_tx_descriptor *tx = NULL;
2664 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2665 for (i = 0; i < sh->disks; i++)
2666 if (i != sh->pd_idx && i != sh->qd_idx) {
2667 int dd_idx, j;
2668 struct stripe_head *sh2;
2832 struct async_submit_ctl submit;
2833
2834 sector_t bn = compute_blocknr(sh, i, 1);
2835 sector_t s = raid5_compute_sector(conf, bn, 0,
2836 &dd_idx, NULL);
2837 sh2 = get_active_stripe(conf, s, 0, 1, 1);
2838 if (sh2 == NULL)
2839 /* so far only the early blocks of this stripe
2840 * have been requested. When later blocks
2841 * get requested, we will try again
2842 */
2843 continue;
2844 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2845 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2846 /* must have already done this block */
2847 release_stripe(sh2);
2848 continue;
2849 }
2850
2851 /* place all the copies on one channel */
2669
2670 sector_t bn = compute_blocknr(sh, i, 1);
2671 sector_t s = raid5_compute_sector(conf, bn, 0,
2672 &dd_idx, NULL);
2673 sh2 = get_active_stripe(conf, s, 0, 1, 1);
2674 if (sh2 == NULL)
2675 /* so far only the early blocks of this stripe
2676 * have been requested. When later blocks
2677 * get requested, we will try again
2678 */
2679 continue;
2680 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2681 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2682 /* must have already done this block */
2683 release_stripe(sh2);
2684 continue;
2685 }
2686
2687 /* place all the copies on one channel */
2852 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2853 tx = async_memcpy(sh2->dev[dd_idx].page,
2688 tx = async_memcpy(sh2->dev[dd_idx].page,
2854 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2855 &submit);
2689 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2690 ASYNC_TX_DEP_ACK, tx, NULL, NULL);
2856
2857 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2858 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2859 for (j = 0; j < conf->raid_disks; j++)
2860 if (j != sh2->pd_idx &&
2861 (!r6s || j != sh2->qd_idx) &&
2862 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2863 break;

--- 269 unchanged lines hidden ---

3133 }
3134 }
3135
3136 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3137 !sh->reconstruct_state) {
3138 /* Need to write out all blocks after computing parity */
3139 sh->disks = conf->raid_disks;
3140 stripe_set_idx(sh->sector, conf, 0, sh);
2691
2692 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2693 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2694 for (j = 0; j < conf->raid_disks; j++)
2695 if (j != sh2->pd_idx &&
2696 (!r6s || j != sh2->qd_idx) &&
2697 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2698 break;

--- 269 unchanged lines hidden ---

2968 }
2969 }
2970
2971 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
2972 !sh->reconstruct_state) {
2973 /* Need to write out all blocks after computing parity */
2974 sh->disks = conf->raid_disks;
2975 stripe_set_idx(sh->sector, conf, 0, sh);
3141 schedule_reconstruction(sh, &s, 1, 1);
2976 schedule_reconstruction5(sh, &s, 1, 1);
3142 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3143 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3144 atomic_dec(&conf->reshape_stripes);
3145 wake_up(&conf->wait_for_overlap);
3146 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3147 }
3148
3149 if (s.expanding && s.locked == 0 &&
3150 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3151 handle_stripe_expansion(conf, sh, NULL);
3152
3153 unlock:
3154 spin_unlock(&sh->lock);
3155
3156 /* wait for this device to become unblocked */
3157 if (unlikely(blocked_rdev))
3158 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3159
3160 if (s.ops_request)
2977 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
2978 clear_bit(STRIPE_EXPAND_READY, &sh->state);
2979 atomic_dec(&conf->reshape_stripes);
2980 wake_up(&conf->wait_for_overlap);
2981 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
2982 }
2983
2984 if (s.expanding && s.locked == 0 &&
2985 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
2986 handle_stripe_expansion(conf, sh, NULL);
2987
2988 unlock:
2989 spin_unlock(&sh->lock);
2990
2991 /* wait for this device to become unblocked */
2992 if (unlikely(blocked_rdev))
2993 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2994
2995 if (s.ops_request)
3161 raid_run_ops(sh, s.ops_request);
2996 raid5_run_ops(sh, s.ops_request);
3162
3163 ops_run_io(sh, &s);
3164
3165 return_io(return_bi);
3166
3167 return blocked_rdev == NULL;
3168}
3169
2997
2998 ops_run_io(sh, &s);
2999
3000 return_io(return_bi);
3001
3002 return blocked_rdev == NULL;
3003}
3004
3170static bool handle_stripe6(struct stripe_head *sh)
3005static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3171{
3172 raid5_conf_t *conf = sh->raid_conf;
3173 int disks = sh->disks;
3174 struct bio *return_bi = NULL;
3175 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
3176 struct stripe_head_state s;
3177 struct r6_state r6s;
3178 struct r5dev *dev, *pdev, *qdev;
3179 mdk_rdev_t *blocked_rdev = NULL;
3180
3181 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3006{
3007 raid5_conf_t *conf = sh->raid_conf;
3008 int disks = sh->disks;
3009 struct bio *return_bi = NULL;
3010 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
3011 struct stripe_head_state s;
3012 struct r6_state r6s;
3013 struct r5dev *dev, *pdev, *qdev;
3014 mdk_rdev_t *blocked_rdev = NULL;
3015
3016 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3182 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3017 "pd_idx=%d, qd_idx=%d\n",
3183 (unsigned long long)sh->sector, sh->state,
3018 (unsigned long long)sh->sector, sh->state,
3184 atomic_read(&sh->count), pd_idx, qd_idx,
3185 sh->check_state, sh->reconstruct_state);
3019 atomic_read(&sh->count), pd_idx, qd_idx);
3186 memset(&s, 0, sizeof(s));
3187
3188 spin_lock(&sh->lock);
3189 clear_bit(STRIPE_HANDLE, &sh->state);
3190 clear_bit(STRIPE_DELAYED, &sh->state);
3191
3192 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3193 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3194 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3195 /* Now to look around and see what can be done */
3196
3197 rcu_read_lock();
3198 for (i=disks; i--; ) {
3199 mdk_rdev_t *rdev;
3200 dev = &sh->dev[i];
3201 clear_bit(R5_Insync, &dev->flags);
3202
3203 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3204 i, dev->flags, dev->toread, dev->towrite, dev->written);
3020 memset(&s, 0, sizeof(s));
3021
3022 spin_lock(&sh->lock);
3023 clear_bit(STRIPE_HANDLE, &sh->state);
3024 clear_bit(STRIPE_DELAYED, &sh->state);
3025
3026 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3027 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3028 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3029 /* Now to look around and see what can be done */
3030
3031 rcu_read_lock();
3032 for (i=disks; i--; ) {
3033 mdk_rdev_t *rdev;
3034 dev = &sh->dev[i];
3035 clear_bit(R5_Insync, &dev->flags);
3036
3037 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3038 i, dev->flags, dev->toread, dev->towrite, dev->written);
3205 /* maybe we can reply to a read
3206 *
3207 * new wantfill requests are only permitted while
3208 * ops_complete_biofill is guaranteed to be inactive
3209 */
3210 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3211 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3212 set_bit(R5_Wantfill, &dev->flags);
3039 /* maybe we can reply to a read */
3040 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
3041 struct bio *rbi, *rbi2;
3042 pr_debug("Return read for disc %d\n", i);
3043 spin_lock_irq(&conf->device_lock);
3044 rbi = dev->toread;
3045 dev->toread = NULL;
3046 if (test_and_clear_bit(R5_Overlap, &dev->flags))
3047 wake_up(&conf->wait_for_overlap);
3048 spin_unlock_irq(&conf->device_lock);
3049 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
3050 copy_data(0, rbi, dev->page, dev->sector);
3051 rbi2 = r5_next_bio(rbi, dev->sector);
3052 spin_lock_irq(&conf->device_lock);
3053 if (!raid5_dec_bi_phys_segments(rbi)) {
3054 rbi->bi_next = return_bi;
3055 return_bi = rbi;
3056 }
3057 spin_unlock_irq(&conf->device_lock);
3058 rbi = rbi2;
3059 }
3060 }
3213
3214 /* now count some things */
3215 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3216 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3061
3062 /* now count some things */
3063 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3064 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3217 if (test_bit(R5_Wantcompute, &dev->flags))
3218 BUG_ON(++s.compute > 2);
3219
3065
3220 if (test_bit(R5_Wantfill, &dev->flags)) {
3221 s.to_fill++;
3222 } else if (dev->toread)
3066
3067 if (dev->toread)
3223 s.to_read++;
3224 if (dev->towrite) {
3225 s.to_write++;
3226 if (!test_bit(R5_OVERWRITE, &dev->flags))
3227 s.non_overwrite++;
3228 }
3229 if (dev->written)
3230 s.written++;

--- 24 unchanged lines hidden ---

3255 set_bit(STRIPE_HANDLE, &sh->state);
3256 goto unlock;
3257 }
3258 /* There is nothing for the blocked_rdev to block */
3259 rdev_dec_pending(blocked_rdev, conf->mddev);
3260 blocked_rdev = NULL;
3261 }
3262
3068 s.to_read++;
3069 if (dev->towrite) {
3070 s.to_write++;
3071 if (!test_bit(R5_OVERWRITE, &dev->flags))
3072 s.non_overwrite++;
3073 }
3074 if (dev->written)
3075 s.written++;

--- 24 unchanged lines hidden ---

3100 set_bit(STRIPE_HANDLE, &sh->state);
3101 goto unlock;
3102 }
3103 /* There is nothing for the blocked_rdev to block */
3104 rdev_dec_pending(blocked_rdev, conf->mddev);
3105 blocked_rdev = NULL;
3106 }
3107
3263 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3264 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3265 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3266 }
3267
3268 pr_debug("locked=%d uptodate=%d to_read=%d"
3269 " to_write=%d failed=%d failed_num=%d,%d\n",
3270 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3271 r6s.failed_num[0], r6s.failed_num[1]);
3272 /* check if the array has lost >2 devices and, if so, some requests
3273 * might need to be failed
3274 */
3275 if (s.failed > 2 && s.to_read+s.to_write+s.written)

--- 24 unchanged lines hidden ---

3300 && test_bit(R5_UPTODATE, &qdev->flags)))))
3301 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3302
3303 /* Now we might consider reading some blocks, either to check/generate
3304 * parity, or to satisfy requests
3305 * or to load a block that is being partially written.
3306 */
3307 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3108 pr_debug("locked=%d uptodate=%d to_read=%d"
3109 " to_write=%d failed=%d failed_num=%d,%d\n",
3110 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3111 r6s.failed_num[0], r6s.failed_num[1]);
3112 /* check if the array has lost >2 devices and, if so, some requests
3113 * might need to be failed
3114 */
3115 if (s.failed > 2 && s.to_read+s.to_write+s.written)

--- 24 unchanged lines hidden ---

3140 && test_bit(R5_UPTODATE, &qdev->flags)))))
3141 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3142
3143 /* Now we might consider reading some blocks, either to check/generate
3144 * parity, or to satisfy requests
3145 * or to load a block that is being partially written.
3146 */
3147 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3308 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3148 (s.syncing && (s.uptodate < disks)) || s.expanding)
3309 handle_stripe_fill6(sh, &s, &r6s, disks);
3310
3149 handle_stripe_fill6(sh, &s, &r6s, disks);
3150
3311 /* Now we check to see if any write operations have recently
3312 * completed
3313 */
3314 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3315 int qd_idx = sh->qd_idx;
3316
3317 sh->reconstruct_state = reconstruct_state_idle;
3318 /* All the 'written' buffers and the parity blocks are ready to
3319 * be written back to disk
3320 */
3321 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3322 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3323 for (i = disks; i--; ) {
3324 dev = &sh->dev[i];
3325 if (test_bit(R5_LOCKED, &dev->flags) &&
3326 (i == sh->pd_idx || i == qd_idx ||
3327 dev->written)) {
3328 pr_debug("Writing block %d\n", i);
3329 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3330 set_bit(R5_Wantwrite, &dev->flags);
3331 if (!test_bit(R5_Insync, &dev->flags) ||
3332 ((i == sh->pd_idx || i == qd_idx) &&
3333 s.failed == 0))
3334 set_bit(STRIPE_INSYNC, &sh->state);
3335 }
3336 }
3337 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3338 atomic_dec(&conf->preread_active_stripes);
3339 if (atomic_read(&conf->preread_active_stripes) <
3340 IO_THRESHOLD)
3341 md_wakeup_thread(conf->mddev->thread);
3342 }
3343 }
3344
3345 /* Now to consider new write requests and what else, if anything
3346 * should be read. We do not handle new writes when:
3347 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3348 * 2/ A 'check' operation is in flight, as it may clobber the parity
3349 * block.
3350 */
3351 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3151 /* now to consider writing and what else, if anything should be read */
3152 if (s.to_write)
3352 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3353
3354 /* maybe we need to check and possibly fix the parity for this stripe
3355 * Any reads will already have been scheduled, so we just see if enough
3153 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3154
3155 /* maybe we need to check and possibly fix the parity for this stripe
3156 * Any reads will already have been scheduled, so we just see if enough
3356 * data is available. The parity check is held off while parity
3357 * dependent operations are in flight.
3157 * data is available
3358 */
3158 */
3359 if (sh->check_state ||
3360 (s.syncing && s.locked == 0 &&
3361 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3362 !test_bit(STRIPE_INSYNC, &sh->state)))
3363 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3159 if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
3160 handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
3364
3365 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3366 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3367 clear_bit(STRIPE_SYNCING, &sh->state);
3368 }
3369
3370 /* If the failed drives are just a ReadError, then we might need
3371 * to progress the repair/check process

--- 4 unchanged lines hidden ---

3376 if (test_bit(R5_ReadError, &dev->flags)
3377 && !test_bit(R5_LOCKED, &dev->flags)
3378 && test_bit(R5_UPTODATE, &dev->flags)
3379 ) {
3380 if (!test_bit(R5_ReWrite, &dev->flags)) {
3381 set_bit(R5_Wantwrite, &dev->flags);
3382 set_bit(R5_ReWrite, &dev->flags);
3383 set_bit(R5_LOCKED, &dev->flags);
3161
3162 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3163 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3164 clear_bit(STRIPE_SYNCING, &sh->state);
3165 }
3166
3167 /* If the failed drives are just a ReadError, then we might need
3168 * to progress the repair/check process

--- 4 unchanged lines hidden ---

3173 if (test_bit(R5_ReadError, &dev->flags)
3174 && !test_bit(R5_LOCKED, &dev->flags)
3175 && test_bit(R5_UPTODATE, &dev->flags)
3176 ) {
3177 if (!test_bit(R5_ReWrite, &dev->flags)) {
3178 set_bit(R5_Wantwrite, &dev->flags);
3179 set_bit(R5_ReWrite, &dev->flags);
3180 set_bit(R5_LOCKED, &dev->flags);
3384 s.locked++;
3385 } else {
3386 /* let's read it back */
3387 set_bit(R5_Wantread, &dev->flags);
3388 set_bit(R5_LOCKED, &dev->flags);
3181 } else {
3182 /* let's read it back */
3183 set_bit(R5_Wantread, &dev->flags);
3184 set_bit(R5_LOCKED, &dev->flags);
3389 s.locked++;
3390 }
3391 }
3392 }
3393
3185 }
3186 }
3187 }
3188
3394 /* Finish reconstruct operations initiated by the expansion process */
3395 if (sh->reconstruct_state == reconstruct_state_result) {
3396 sh->reconstruct_state = reconstruct_state_idle;
3397 clear_bit(STRIPE_EXPANDING, &sh->state);
3398 for (i = conf->raid_disks; i--; ) {
3399 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3400 set_bit(R5_LOCKED, &sh->dev[i].flags);
3401 s.locked++;
3402 }
3403 }
3404
3405 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3406 !sh->reconstruct_state) {
3189 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
3407 struct stripe_head *sh2
3408 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3409 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3410 /* sh cannot be written until sh2 has been read.
3411 * so arrange for sh to be delayed a little
3412 */
3413 set_bit(STRIPE_DELAYED, &sh->state);
3414 set_bit(STRIPE_HANDLE, &sh->state);

--- 4 unchanged lines hidden ---

3419 goto unlock;
3420 }
3421 if (sh2)
3422 release_stripe(sh2);
3423
3424 /* Need to write out all blocks after computing P&Q */
3425 sh->disks = conf->raid_disks;
3426 stripe_set_idx(sh->sector, conf, 0, sh);
3190 struct stripe_head *sh2
3191 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3192 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3193 /* sh cannot be written until sh2 has been read.
3194 * so arrange for sh to be delayed a little
3195 */
3196 set_bit(STRIPE_DELAYED, &sh->state);
3197 set_bit(STRIPE_HANDLE, &sh->state);

--- 4 unchanged lines hidden ---

3202 goto unlock;
3203 }
3204 if (sh2)
3205 release_stripe(sh2);
3206
3207 /* Need to write out all blocks after computing P&Q */
3208 sh->disks = conf->raid_disks;
3209 stripe_set_idx(sh->sector, conf, 0, sh);
3427 schedule_reconstruction(sh, &s, 1, 1);
3428 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3210 compute_parity6(sh, RECONSTRUCT_WRITE);
3211 for (i = conf->raid_disks ; i-- ; ) {
3212 set_bit(R5_LOCKED, &sh->dev[i].flags);
3213 s.locked++;
3214 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3215 }
3216 clear_bit(STRIPE_EXPANDING, &sh->state);
3217 } else if (s.expanded) {
3429 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3430 atomic_dec(&conf->reshape_stripes);
3431 wake_up(&conf->wait_for_overlap);
3432 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3433 }
3434
3435 if (s.expanding && s.locked == 0 &&
3436 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3437 handle_stripe_expansion(conf, sh, &r6s);
3438
3439 unlock:
3440 spin_unlock(&sh->lock);
3441
3442 /* wait for this device to become unblocked */
3443 if (unlikely(blocked_rdev))
3444 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3445
3218 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3219 atomic_dec(&conf->reshape_stripes);
3220 wake_up(&conf->wait_for_overlap);
3221 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3222 }
3223
3224 if (s.expanding && s.locked == 0 &&
3225 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3226 handle_stripe_expansion(conf, sh, &r6s);
3227
3228 unlock:
3229 spin_unlock(&sh->lock);
3230
3231 /* wait for this device to become unblocked */
3232 if (unlikely(blocked_rdev))
3233 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3234
3446 if (s.ops_request)
3447 raid_run_ops(sh, s.ops_request);
3448
3449 ops_run_io(sh, &s);
3450
3451 return_io(return_bi);
3452
3453 return blocked_rdev == NULL;
3454}
3455
3456/* returns true if the stripe was handled */
3235 ops_run_io(sh, &s);
3236
3237 return_io(return_bi);
3238
3239 return blocked_rdev == NULL;
3240}
3241
3242/* returns true if the stripe was handled */
3457static bool handle_stripe(struct stripe_head *sh)
3243static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
3458{
3459 if (sh->raid_conf->level == 6)
3244{
3245 if (sh->raid_conf->level == 6)
3460 return handle_stripe6(sh);
3246 return handle_stripe6(sh, tmp_page);
3461 else
3462 return handle_stripe5(sh);
3463}
3464
3247 else
3248 return handle_stripe5(sh);
3249}
3250
3251
3252
3465static void raid5_activate_delayed(raid5_conf_t *conf)
3466{
3467 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3468 while (!list_empty(&conf->delayed_list)) {
3469 struct list_head *l = conf->delayed_list.next;
3470 struct stripe_head *sh;
3471 sh = list_entry(l, struct stripe_head, lru);
3472 list_del_init(l);

--- 340 unchanged lines hidden ---

3813 raid5_conf_t *conf = mddev->private;
3814 int dd_idx;
3815 sector_t new_sector;
3816 sector_t logical_sector, last_sector;
3817 struct stripe_head *sh;
3818 const int rw = bio_data_dir(bi);
3819 int cpu, remaining;
3820
3253static void raid5_activate_delayed(raid5_conf_t *conf)
3254{
3255 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3256 while (!list_empty(&conf->delayed_list)) {
3257 struct list_head *l = conf->delayed_list.next;
3258 struct stripe_head *sh;
3259 sh = list_entry(l, struct stripe_head, lru);
3260 list_del_init(l);

--- 340 unchanged lines hidden ---

3601 raid5_conf_t *conf = mddev->private;
3602 int dd_idx;
3603 sector_t new_sector;
3604 sector_t logical_sector, last_sector;
3605 struct stripe_head *sh;
3606 const int rw = bio_data_dir(bi);
3607 int cpu, remaining;
3608
3821 if (unlikely(bio_barrier(bi))) {
3609 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3822 bio_endio(bi, -EOPNOTSUPP);
3823 return 0;
3824 }
3825
3826 md_write_start(mddev, bi);
3827
3828 cpu = part_stat_lock();
3829 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);

--- 434 unchanged lines hidden ---

4264 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4265
4266 spin_lock(&sh->lock);
4267 set_bit(STRIPE_SYNCING, &sh->state);
4268 clear_bit(STRIPE_INSYNC, &sh->state);
4269 spin_unlock(&sh->lock);
4270
4271 /* wait for any blocked device to be handled */
3610 bio_endio(bi, -EOPNOTSUPP);
3611 return 0;
3612 }
3613
3614 md_write_start(mddev, bi);
3615
3616 cpu = part_stat_lock();
3617 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);

--- 434 unchanged lines hidden ---

4052 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4053
4054 spin_lock(&sh->lock);
4055 set_bit(STRIPE_SYNCING, &sh->state);
4056 clear_bit(STRIPE_INSYNC, &sh->state);
4057 spin_unlock(&sh->lock);
4058
4059 /* wait for any blocked device to be handled */
4272 while (unlikely(!handle_stripe(sh)))
4060 while(unlikely(!handle_stripe(sh, NULL)))
4273 ;
4274 release_stripe(sh);
4275
4276 return STRIPE_SECTORS;
4277}
4278
4279static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4280{

--- 40 unchanged lines hidden ---

4321 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4322 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4323 release_stripe(sh);
4324 raid5_set_bi_hw_segments(raid_bio, scnt);
4325 conf->retry_read_aligned = raid_bio;
4326 return handled;
4327 }
4328
4061 ;
4062 release_stripe(sh);
4063
4064 return STRIPE_SECTORS;
4065}
4066
4067static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4068{

--- 40 unchanged lines hidden ---

4109 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4110 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4111 release_stripe(sh);
4112 raid5_set_bi_hw_segments(raid_bio, scnt);
4113 conf->retry_read_aligned = raid_bio;
4114 return handled;
4115 }
4116
4329 handle_stripe(sh);
4117 handle_stripe(sh, NULL);
4330 release_stripe(sh);
4331 handled++;
4332 }
4333 spin_lock_irq(&conf->device_lock);
4334 remaining = raid5_dec_bi_phys_segments(raid_bio);
4335 spin_unlock_irq(&conf->device_lock);
4336 if (remaining == 0)
4337 bio_endio(raid_bio, 0);
4338 if (atomic_dec_and_test(&conf->active_aligned_reads))
4339 wake_up(&conf->wait_for_stripe);
4340 return handled;
4341}
4342
4118 release_stripe(sh);
4119 handled++;
4120 }
4121 spin_lock_irq(&conf->device_lock);
4122 remaining = raid5_dec_bi_phys_segments(raid_bio);
4123 spin_unlock_irq(&conf->device_lock);
4124 if (remaining == 0)
4125 bio_endio(raid_bio, 0);
4126 if (atomic_dec_and_test(&conf->active_aligned_reads))
4127 wake_up(&conf->wait_for_stripe);
4128 return handled;
4129}
4130
4343#ifdef CONFIG_MULTICORE_RAID456
4344static void __process_stripe(void *param, async_cookie_t cookie)
4345{
4346 struct stripe_head *sh = param;
4347
4131
4348 handle_stripe(sh);
4349 release_stripe(sh);
4350}
4351
4132
4352static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4353{
4354 async_schedule_domain(__process_stripe, sh, domain);
4355}
4356
4357static void synchronize_stripe_processing(struct list_head *domain)
4358{
4359 async_synchronize_full_domain(domain);
4360}
4361#else
4362static void process_stripe(struct stripe_head *sh, struct list_head *domain)
4363{
4364 handle_stripe(sh);
4365 release_stripe(sh);
4366 cond_resched();
4367}
4368
4369static void synchronize_stripe_processing(struct list_head *domain)
4370{
4371}
4372#endif
4373
4374
4375/*
4376 * This is our raid5 kernel thread.
4377 *
4378 * We scan the hash table for stripes which can be handled now.
4379 * During the scan, completed stripes are saved for us by the interrupt
4380 * handler, so that they will not have to wait for our next wakeup.
4381 */
4382static void raid5d(mddev_t *mddev)
4383{
4384 struct stripe_head *sh;
4385 raid5_conf_t *conf = mddev->private;
4386 int handled;
4133/*
4134 * This is our raid5 kernel thread.
4135 *
4136 * We scan the hash table for stripes which can be handled now.
4137 * During the scan, completed stripes are saved for us by the interrupt
4138 * handler, so that they will not have to wait for our next wakeup.
4139 */
4140static void raid5d(mddev_t *mddev)
4141{
4142 struct stripe_head *sh;
4143 raid5_conf_t *conf = mddev->private;
4144 int handled;
4387 LIST_HEAD(raid_domain);
4388
4389 pr_debug("+++ raid5d active\n");
4390
4391 md_check_recovery(mddev);
4392
4393 handled = 0;
4394 spin_lock_irq(&conf->device_lock);
4395 while (1) {

--- 20 unchanged lines hidden ---

4416
4417 sh = __get_priority_stripe(conf);
4418
4419 if (!sh)
4420 break;
4421 spin_unlock_irq(&conf->device_lock);
4422
4423 handled++;
4145
4146 pr_debug("+++ raid5d active\n");
4147
4148 md_check_recovery(mddev);
4149
4150 handled = 0;
4151 spin_lock_irq(&conf->device_lock);
4152 while (1) {

--- 20 unchanged lines hidden ---

4173
4174 sh = __get_priority_stripe(conf);
4175
4176 if (!sh)
4177 break;
4178 spin_unlock_irq(&conf->device_lock);
4179
4180 handled++;
4424 process_stripe(sh, &raid_domain);
4181 handle_stripe(sh, conf->spare_page);
4182 release_stripe(sh);
4425
4426 spin_lock_irq(&conf->device_lock);
4427 }
4428 pr_debug("%d stripes handled\n", handled);
4429
4430 spin_unlock_irq(&conf->device_lock);
4431
4183
4184 spin_lock_irq(&conf->device_lock);
4185 }
4186 pr_debug("%d stripes handled\n", handled);
4187
4188 spin_unlock_irq(&conf->device_lock);
4189
4432 synchronize_stripe_processing(&raid_domain);
4433 async_tx_issue_pending_all();
4434 unplug_slaves(mddev);
4435
4436 pr_debug("--- raid5d inactive\n");
4437}
4438
4439static ssize_t
4440raid5_show_stripe_cache_size(mddev_t *mddev, char *page)

--- 116 unchanged lines hidden ---

4557 raid_disks = conf->previous_raid_disks;
4558 }
4559
4560 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4561 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4562 return sectors * (raid_disks - conf->max_degraded);
4563}
4564
4190 async_tx_issue_pending_all();
4191 unplug_slaves(mddev);
4192
4193 pr_debug("--- raid5d inactive\n");
4194}
4195
4196static ssize_t
4197raid5_show_stripe_cache_size(mddev_t *mddev, char *page)

--- 116 unchanged lines hidden ---

4314 raid_disks = conf->previous_raid_disks;
4315 }
4316
4317 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4318 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4319 return sectors * (raid_disks - conf->max_degraded);
4320}
4321
4565static void raid5_free_percpu(raid5_conf_t *conf)
4566{
4567 struct raid5_percpu *percpu;
4568 unsigned long cpu;
4569
4570 if (!conf->percpu)
4571 return;
4572
4573 get_online_cpus();
4574 for_each_possible_cpu(cpu) {
4575 percpu = per_cpu_ptr(conf->percpu, cpu);
4576 safe_put_page(percpu->spare_page);
4577 kfree(percpu->scribble);
4578 }
4579#ifdef CONFIG_HOTPLUG_CPU
4580 unregister_cpu_notifier(&conf->cpu_notify);
4581#endif
4582 put_online_cpus();
4583
4584 free_percpu(conf->percpu);
4585}
4586
4587static void free_conf(raid5_conf_t *conf)
4588{
4589 shrink_stripes(conf);
4322static void free_conf(raid5_conf_t *conf)
4323{
4324 shrink_stripes(conf);
4590 raid5_free_percpu(conf);
4325 safe_put_page(conf->spare_page);
4591 kfree(conf->disks);
4592 kfree(conf->stripe_hashtbl);
4593 kfree(conf);
4594}
4595
4326 kfree(conf->disks);
4327 kfree(conf->stripe_hashtbl);
4328 kfree(conf);
4329}
4330
4596#ifdef CONFIG_HOTPLUG_CPU
4597static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4598 void *hcpu)
4599{
4600 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4601 long cpu = (long)hcpu;
4602 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4603
4604 switch (action) {
4605 case CPU_UP_PREPARE:
4606 case CPU_UP_PREPARE_FROZEN:
4607 if (conf->level == 6 && !percpu->spare_page)
4608 percpu->spare_page = alloc_page(GFP_KERNEL);
4609 if (!percpu->scribble)
4610 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4611
4612 if (!percpu->scribble ||
4613 (conf->level == 6 && !percpu->spare_page)) {
4614 safe_put_page(percpu->spare_page);
4615 kfree(percpu->scribble);
4616 pr_err("%s: failed memory allocation for cpu%ld\n",
4617 __func__, cpu);
4618 return NOTIFY_BAD;
4619 }
4620 break;
4621 case CPU_DEAD:
4622 case CPU_DEAD_FROZEN:
4623 safe_put_page(percpu->spare_page);
4624 kfree(percpu->scribble);
4625 percpu->spare_page = NULL;
4626 percpu->scribble = NULL;
4627 break;
4628 default:
4629 break;
4630 }
4631 return NOTIFY_OK;
4632}
4633#endif
4634
4635static int raid5_alloc_percpu(raid5_conf_t *conf)
4636{
4637 unsigned long cpu;
4638 struct page *spare_page;
4639 struct raid5_percpu *allcpus;
4640 void *scribble;
4641 int err;
4642
4643 allcpus = alloc_percpu(struct raid5_percpu);
4644 if (!allcpus)
4645 return -ENOMEM;
4646 conf->percpu = allcpus;
4647
4648 get_online_cpus();
4649 err = 0;
4650 for_each_present_cpu(cpu) {
4651 if (conf->level == 6) {
4652 spare_page = alloc_page(GFP_KERNEL);
4653 if (!spare_page) {
4654 err = -ENOMEM;
4655 break;
4656 }
4657 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4658 }
4659 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL);
4660 if (!scribble) {
4661 err = -ENOMEM;
4662 break;
4663 }
4664 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4665 }
4666#ifdef CONFIG_HOTPLUG_CPU
4667 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4668 conf->cpu_notify.priority = 0;
4669 if (err == 0)
4670 err = register_cpu_notifier(&conf->cpu_notify);
4671#endif
4672 put_online_cpus();
4673
4674 return err;
4675}
4676
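Editorial note on this hunk and the setup_conf()/free_conf() hunks nearby (not part of the source):

/*
 * The old code kept a single conf->spare_page, allocated in setup_conf()
 * and passed down as the tmp_page argument of handle_stripe(); RAID-6
 * parity checks were therefore serialized on that one scratch page, and
 * callers without it (the resync path passes NULL) simply set
 * STRIPE_HANDLE and left the check to raid5d, which supplied
 * conf->spare_page.  The new code drops the tmp_page argument and keeps
 * per-CPU state instead - a spare_page for the RAID-6 check plus a
 * 'scribble' buffer for the async paths - allocated in
 * raid5_alloc_percpu() and kept in step with CPU hotplug by
 * raid456_cpu_notify(), which is part of what allows
 * CONFIG_MULTICORE_RAID456 to hand stripes to several CPUs at once.
 */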
4677static raid5_conf_t *setup_conf(mddev_t *mddev)
4678{
4679 raid5_conf_t *conf;
4680 int raid_disk, memory;
4681 mdk_rdev_t *rdev;
4682 struct disk_info *disk;
4683
4684 if (mddev->new_level != 5

--- 25 unchanged lines hidden ---

4710 return ERR_PTR(-EINVAL);
4711 }
4712
4713 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4714 if (conf == NULL)
4715 goto abort;
4716
4717 conf->raid_disks = mddev->raid_disks;
4331static raid5_conf_t *setup_conf(mddev_t *mddev)
4332{
4333 raid5_conf_t *conf;
4334 int raid_disk, memory;
4335 mdk_rdev_t *rdev;
4336 struct disk_info *disk;
4337
4338 if (mddev->new_level != 5

--- 25 unchanged lines hidden ---

4364 return ERR_PTR(-EINVAL);
4365 }
4366
4367 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4368 if (conf == NULL)
4369 goto abort;
4370
4371 conf->raid_disks = mddev->raid_disks;
4718 conf->scribble_len = scribble_len(conf->raid_disks);
4719 if (mddev->reshape_position == MaxSector)
4720 conf->previous_raid_disks = mddev->raid_disks;
4721 else
4722 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4723
4724 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4725 GFP_KERNEL);
4726 if (!conf->disks)
4727 goto abort;
4728
4729 conf->mddev = mddev;
4730
4731 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4732 goto abort;
4733
4372 if (mddev->reshape_position == MaxSector)
4373 conf->previous_raid_disks = mddev->raid_disks;
4374 else
4375 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4376
4377 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
4378 GFP_KERNEL);
4379 if (!conf->disks)
4380 goto abort;
4381
4382 conf->mddev = mddev;
4383
4384 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4385 goto abort;
4386
4734 conf->level = mddev->new_level;
4735 if (raid5_alloc_percpu(conf) != 0)
4736 goto abort;
4737
4387 if (mddev->new_level == 6) {
4388 conf->spare_page = alloc_page(GFP_KERNEL);
4389 if (!conf->spare_page)
4390 goto abort;
4391 }
4738 spin_lock_init(&conf->device_lock);
4739 init_waitqueue_head(&conf->wait_for_stripe);
4740 init_waitqueue_head(&conf->wait_for_overlap);
4741 INIT_LIST_HEAD(&conf->handle_list);
4742 INIT_LIST_HEAD(&conf->hold_list);
4743 INIT_LIST_HEAD(&conf->delayed_list);
4744 INIT_LIST_HEAD(&conf->bitmap_list);
4745 INIT_LIST_HEAD(&conf->inactive_list);

--- 1043 unchanged lines hidden ---
4392 spin_lock_init(&conf->device_lock);
4393 init_waitqueue_head(&conf->wait_for_stripe);
4394 init_waitqueue_head(&conf->wait_for_overlap);
4395 INIT_LIST_HEAD(&conf->handle_list);
4396 INIT_LIST_HEAD(&conf->hold_list);
4397 INIT_LIST_HEAD(&conf->delayed_list);
4398 INIT_LIST_HEAD(&conf->bitmap_list);
4399 INIT_LIST_HEAD(&conf->inactive_list);

--- 1043 unchanged lines hidden ---