raid5.c — diff between commits 3bddb7f8f264ec58dc86e11ca97341c24f9d38f6 and 3a83f4677539bce8eaa2bca9ee9c20e172d7ab04
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible

--- 56 unchanged lines hidden (view full) ---

65#define cpu_to_group(cpu) cpu_to_node(cpu)
66#define ANY_GROUP NUMA_NO_NODE
67
68static bool devices_handle_discard_safely = false;
69module_param(devices_handle_discard_safely, bool, 0644);
70MODULE_PARM_DESC(devices_handle_discard_safely,
71 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
72static struct workqueue_struct *raid5_wq;
73/*
74 * Stripe cache
75 */
73
76
77#define NR_STRIPES 256
78#define STRIPE_SIZE PAGE_SIZE
79#define STRIPE_SHIFT (PAGE_SHIFT - 9)
80#define STRIPE_SECTORS (STRIPE_SIZE>>9)
81#define IO_THRESHOLD 1
82#define BYPASS_THRESHOLD 1
83#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
84#define HASH_MASK (NR_HASH - 1)
85#define MAX_STRIPE_BATCH 8
86
74static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
75{
76 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
77 return &conf->stripe_hashtbl[hash];
78}
79
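For orientation, here is a minimal user-space sketch of the bucket calculation stripe_hash() performs above. The constants and names are invented for the example and assume 4 KiB pages with 8-byte hlist heads (so STRIPE_SHIFT is 3 and the table has 512 buckets); the real values come from the STRIPE_SHIFT/NR_HASH/HASH_MASK definitions shown earlier.

/* Editorial sketch, not kernel code. */
#include <stdio.h>

#define SKETCH_STRIPE_SHIFT 3           /* PAGE_SHIFT(12) - 9 */
#define SKETCH_HASH_MASK    (512 - 1)   /* NR_HASH - 1 */

int main(void)
{
    unsigned long long sect = 0x12345;  /* arbitrary example sector */
    unsigned int hash = (sect >> SKETCH_STRIPE_SHIFT) & SKETCH_HASH_MASK;

    printf("sector %#llx -> hash bucket %u\n", sect, hash); /* bucket 104 */
    return 0;
}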
80static inline int stripe_hash_locks_hash(sector_t sect)
81{

--- 26 unchanged lines hidden (view full) ---

108{
109 int i;
110 spin_unlock(&conf->device_lock);
111 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
112 spin_unlock(conf->hash_locks + i - 1);
113 local_irq_enable();
114}
115
87static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
88{
89 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
90 return &conf->stripe_hashtbl[hash];
91}
92
93static inline int stripe_hash_locks_hash(sector_t sect)
94{

--- 26 unchanged lines hidden (view full) ---

121{
122 int i;
123 spin_unlock(&conf->device_lock);
124 for (i = NR_STRIPE_HASH_LOCKS; i; i--)
125 spin_unlock(conf->hash_locks + i - 1);
126 local_irq_enable();
127}
128
129/* bio's attached to a stripe+device for I/O are linked together in bi_sector
130 * order without overlap. There may be several bio's per stripe+device, and
131 * a bio could span several devices.
132 * When walking this list for a particular stripe+device, we must never proceed
133 * beyond a bio that extends past this device, as the next bio might no longer
134 * be valid.
135 * This function is used to determine the 'next' bio in the list, given the sector
136 * of the current stripe+device
137 */
138static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
139{
140 int sectors = bio_sectors(bio);
141 if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
142 return bio->bi_next;
143 else
144 return NULL;
145}
146
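As a rough illustration of the rule r5_next_bio() encodes above (walk the chain only while the current bio ends inside this stripe+device's STRIPE_SECTORS window), here is a hedged user-space sketch; the simplified structure and names are invented for the example, and STRIPE_SECTORS is assumed to be 8 (4 KiB stripe, 512-byte sectors).

/* Editorial sketch, not kernel code. */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_STRIPE_SECTORS 8

struct sketch_bio {
    unsigned long long sector;    /* first sector covered (bi_sector) */
    unsigned int nr_sectors;      /* length in sectors */
    struct sketch_bio *next;      /* bi_next */
};

/* Mirror of the r5_next_bio() test: stop once a bio extends past the
 * current stripe+device window. */
static struct sketch_bio *next_bio(struct sketch_bio *bio,
                                   unsigned long long dev_sector)
{
    if (bio->sector + bio->nr_sectors < dev_sector + SKETCH_STRIPE_SECTORS)
        return bio->next;
    return NULL;
}

int main(void)
{
    struct sketch_bio b2 = { 12, 16, NULL };   /* extends past the window */
    struct sketch_bio b1 = {  8,  4, &b2 };    /* ends inside the window */

    /* window for this stripe+device starts at sector 8 */
    printf("after b1: %s\n", next_bio(&b1, 8) ? "b2"   : "stop");
    printf("after b2: %s\n", next_bio(&b2, 8) ? "more" : "stop");
    return 0;
}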
147/*
148 * We maintain a biased count of active stripes in the bottom 16 bits of
149 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
150 */
151static inline int raid5_bi_processed_stripes(struct bio *bio)
152{
153 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
154 return (atomic_read(segments) >> 16) & 0xffff;
155}
156
157static inline int raid5_dec_bi_active_stripes(struct bio *bio)
158{
159 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
160 return atomic_sub_return(1, segments) & 0xffff;
161}
162
163static inline void raid5_inc_bi_active_stripes(struct bio *bio)
164{
165 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
166 atomic_inc(segments);
167}
168
169static inline void raid5_set_bi_processed_stripes(struct bio *bio,
170 unsigned int cnt)
171{
172 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
173 int old, new;
174
175 do {
176 old = atomic_read(segments);
177 new = (old & 0xffff) | (cnt << 16);
178 } while (atomic_cmpxchg(segments, old, new) != old);
179}
180
181static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
182{
183 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
184 atomic_set(segments, cnt);
185}
186
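A hedged user-space sketch of the 16/16-bit packing the helpers above rely on, using a plain unsigned int in place of the atomic bi_phys_segments field; the function and variable names are invented for the example.

/* Editorial sketch, not kernel code: bottom 16 bits hold the active-stripe
 * count, top 16 bits hold the processed-stripe count. */
#include <stdio.h>

static unsigned int set_processed(unsigned int v, unsigned int cnt)
{
    return (v & 0xffff) | (cnt << 16);   /* like raid5_set_bi_processed_stripes() */
}

int main(void)
{
    unsigned int v = 0;

    v += 3;                   /* three raid5_inc_bi_active_stripes() calls */
    v = set_processed(v, 2);  /* record two processed stripes */

    printf("active    = %u\n", v & 0xffff);           /* 3 */
    printf("processed = %u\n", (v >> 16) & 0xffff);   /* 2 */
    v -= 1;                   /* raid5_dec_bi_active_stripes() */
    printf("active after dec = %u\n", v & 0xffff);    /* 2 */
    return 0;
}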
116/* Find first data disk in a raid6 stripe */
117static inline int raid6_d0(struct stripe_head *sh)
118{
119 if (sh->ddf_layout)
 120 /* ddf always starts from the first device */
121 return 0;
122 /* md starts just after Q block */
123 if (sh->qd_idx == sh->disks - 1)

--- 89 unchanged lines hidden (view full) ---

213 thread_cnt--;
214 }
215 }
216}
217
218static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
219 struct list_head *temp_inactive_list)
220{
187/* Find first data disk in a raid6 stripe */
188static inline int raid6_d0(struct stripe_head *sh)
189{
190 if (sh->ddf_layout)
 191 /* ddf always starts from the first device */
192 return 0;
193 /* md starts just after Q block */
194 if (sh->qd_idx == sh->disks - 1)

--- 89 unchanged lines hidden (view full) ---

284 thread_cnt--;
285 }
286 }
287}
288
289static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
290 struct list_head *temp_inactive_list)
291{
221 int i;
 222 int injournal = 0; /* number of data pages with R5_InJournal */
223
224 BUG_ON(!list_empty(&sh->lru));
225 BUG_ON(atomic_read(&conf->active_stripes)==0);
292 BUG_ON(!list_empty(&sh->lru));
293 BUG_ON(atomic_read(&conf->active_stripes)==0);
226
227 if (r5c_is_writeback(conf->log))
228 for (i = sh->disks; i--; )
229 if (test_bit(R5_InJournal, &sh->dev[i].flags))
230 injournal++;
231 /*
 232 * When quiescing in r5c write-back mode, set STRIPE_HANDLE for stripes with
 233 * data in the journal, so they are not released to the cached lists
234 */
235 if (conf->quiesce && r5c_is_writeback(conf->log) &&
236 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
237 if (test_bit(STRIPE_R5C_CACHING, &sh->state))
238 r5c_make_stripe_write_out(sh);
239 set_bit(STRIPE_HANDLE, &sh->state);
240 }
241
242 if (test_bit(STRIPE_HANDLE, &sh->state)) {
243 if (test_bit(STRIPE_DELAYED, &sh->state) &&
244 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
245 list_add_tail(&sh->lru, &conf->delayed_list);
246 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
247 sh->bm_seq - conf->seq_write > 0)
248 list_add_tail(&sh->lru, &conf->bitmap_list);
249 else {

--- 9 unchanged lines hidden (view full) ---

259 md_wakeup_thread(conf->mddev->thread);
260 } else {
261 BUG_ON(stripe_operations_active(sh));
262 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
263 if (atomic_dec_return(&conf->preread_active_stripes)
264 < IO_THRESHOLD)
265 md_wakeup_thread(conf->mddev->thread);
266 atomic_dec(&conf->active_stripes);
294 if (test_bit(STRIPE_HANDLE, &sh->state)) {
295 if (test_bit(STRIPE_DELAYED, &sh->state) &&
296 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
297 list_add_tail(&sh->lru, &conf->delayed_list);
298 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
299 sh->bm_seq - conf->seq_write > 0)
300 list_add_tail(&sh->lru, &conf->bitmap_list);
301 else {

--- 9 unchanged lines hidden (view full) ---

311 md_wakeup_thread(conf->mddev->thread);
312 } else {
313 BUG_ON(stripe_operations_active(sh));
314 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
315 if (atomic_dec_return(&conf->preread_active_stripes)
316 < IO_THRESHOLD)
317 md_wakeup_thread(conf->mddev->thread);
318 atomic_dec(&conf->active_stripes);
267 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
268 if (!r5c_is_writeback(conf->log))
269 list_add_tail(&sh->lru, temp_inactive_list);
270 else {
271 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
272 if (injournal == 0)
273 list_add_tail(&sh->lru, temp_inactive_list);
274 else if (injournal == conf->raid_disks - conf->max_degraded) {
275 /* full stripe */
276 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
277 atomic_inc(&conf->r5c_cached_full_stripes);
278 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
279 atomic_dec(&conf->r5c_cached_partial_stripes);
280 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
281 r5c_check_cached_full_stripe(conf);
282 } else {
283 /* partial stripe */
284 if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
285 &sh->state))
286 atomic_inc(&conf->r5c_cached_partial_stripes);
287 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
288 }
289 }
290 }
319 if (!test_bit(STRIPE_EXPANDING, &sh->state))
320 list_add_tail(&sh->lru, temp_inactive_list);
291 }
292}
293
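To make the full/partial classification above concrete, a small editorial sketch with invented numbers: in a 5-device RAID-5 (max_degraded = 1) each stripe has 4 data blocks, so a stripe moves to the full-stripe list only once all 4 of them are R5_InJournal.

/* Editorial sketch, not kernel code. */
#include <stdio.h>

int main(void)
{
    int raid_disks = 5, max_degraded = 1;
    int data_blocks = raid_disks - max_degraded;   /* 4 data blocks per stripe */
    int injournal;

    for (injournal = 0; injournal <= data_blocks; injournal++)
        printf("injournal=%d -> %s\n", injournal,
               injournal == 0           ? "inactive list" :
               injournal == data_blocks ? "r5c_full_stripe_list" :
                                          "r5c_partial_stripe_list");
    return 0;
}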
294static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
295 struct list_head *temp_inactive_list)
296{
297 if (atomic_dec_and_test(&sh->count))
298 do_release_stripe(conf, sh, temp_inactive_list);

--- 207 unchanged lines hidden (view full) ---

506 stripe_set_idx(sector, conf, previous, sh);
507 sh->state = 0;
508
509 for (i = sh->disks; i--; ) {
510 struct r5dev *dev = &sh->dev[i];
511
512 if (dev->toread || dev->read || dev->towrite || dev->written ||
513 test_bit(R5_LOCKED, &dev->flags)) {
321 }
322}
323
324static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
325 struct list_head *temp_inactive_list)
326{
327 if (atomic_dec_and_test(&sh->count))
328 do_release_stripe(conf, sh, temp_inactive_list);

--- 207 unchanged lines hidden (view full) ---

536 stripe_set_idx(sector, conf, previous, sh);
537 sh->state = 0;
538
539 for (i = sh->disks; i--; ) {
540 struct r5dev *dev = &sh->dev[i];
541
542 if (dev->toread || dev->read || dev->towrite || dev->written ||
543 test_bit(R5_LOCKED, &dev->flags)) {
514 pr_err("sector=%llx i=%d %p %p %p %p %d\n",
544 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
515 (unsigned long long)sh->sector, i, dev->toread,
516 dev->read, dev->towrite, dev->written,
517 test_bit(R5_LOCKED, &dev->flags));
518 WARN_ON(1);
519 }
520 dev->flags = 0;
521 raid5_build_block(sh, i, previous);
522 }

--- 122 unchanged lines hidden (view full) ---

645 sh = get_free_stripe(conf, hash);
646 if (!sh && !test_bit(R5_DID_ALLOC,
647 &conf->cache_state))
648 set_bit(R5_ALLOC_MORE,
649 &conf->cache_state);
650 }
651 if (noblock && sh == NULL)
652 break;
545 (unsigned long long)sh->sector, i, dev->toread,
546 dev->read, dev->towrite, dev->written,
547 test_bit(R5_LOCKED, &dev->flags));
548 WARN_ON(1);
549 }
550 dev->flags = 0;
551 raid5_build_block(sh, i, previous);
552 }

--- 122 unchanged lines hidden (view full) ---

675 sh = get_free_stripe(conf, hash);
676 if (!sh && !test_bit(R5_DID_ALLOC,
677 &conf->cache_state))
678 set_bit(R5_ALLOC_MORE,
679 &conf->cache_state);
680 }
681 if (noblock && sh == NULL)
682 break;
653
654 r5c_check_stripe_cache_usage(conf);
655 if (!sh) {
656 set_bit(R5_INACTIVE_BLOCKED,
657 &conf->cache_state);
683 if (!sh) {
684 set_bit(R5_INACTIVE_BLOCKED,
685 &conf->cache_state);
658 r5l_wake_reclaim(conf->log, 0);
659 wait_event_lock_irq(
660 conf->wait_for_stripe,
661 !list_empty(conf->inactive_list + hash) &&
662 (atomic_read(&conf->active_stripes)
663 < (conf->max_nr_stripes * 3 / 4)
664 || !test_bit(R5_INACTIVE_BLOCKED,
665 &conf->cache_state)),
666 *(conf->hash_locks + hash));

--- 202 unchanged lines hidden (view full) ---

869static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
870{
871 struct r5conf *conf = sh->raid_conf;
872 int i, disks = sh->disks;
873 struct stripe_head *head_sh = sh;
874
875 might_sleep();
876
686 wait_event_lock_irq(
687 conf->wait_for_stripe,
688 !list_empty(conf->inactive_list + hash) &&
689 (atomic_read(&conf->active_stripes)
690 < (conf->max_nr_stripes * 3 / 4)
691 || !test_bit(R5_INACTIVE_BLOCKED,
692 &conf->cache_state)),
693 *(conf->hash_locks + hash));

--- 202 unchanged lines hidden (view full) ---

896static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
897{
898 struct r5conf *conf = sh->raid_conf;
899 int i, disks = sh->disks;
900 struct stripe_head *head_sh = sh;
901
902 might_sleep();
903
877 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
878 /* writing out phase */
879 if (r5l_write_stripe(conf->log, sh) == 0)
880 return;
881 } else { /* caching phase */
882 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
883 r5c_cache_data(conf->log, sh, s);
884 return;
885 }
886 }
887
904 if (r5l_write_stripe(conf->log, sh) == 0)
905 return;
888 for (i = disks; i--; ) {
889 int op, op_flags = 0;
890 int replace_only = 0;
891 struct bio *bi, *rbi;
892 struct md_rdev *rdev, *rrdev = NULL;
893
894 sh = head_sh;
895 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
896 op = REQ_OP_WRITE;
897 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
906 for (i = disks; i--; ) {
907 int op, op_flags = 0;
908 int replace_only = 0;
909 struct bio *bi, *rbi;
910 struct md_rdev *rdev, *rrdev = NULL;
911
912 sh = head_sh;
913 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
914 op = REQ_OP_WRITE;
915 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
898 op_flags = WRITE_FUA;
916 op_flags = REQ_FUA;
899 if (test_bit(R5_Discard, &sh->dev[i].flags))
900 op = REQ_OP_DISCARD;
901 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
902 op = REQ_OP_READ;
903 else if (test_and_clear_bit(R5_WantReplace,
904 &sh->dev[i].flags)) {
905 op = REQ_OP_WRITE;
906 replace_only = 1;

--- 185 unchanged lines hidden (view full) ---

1092 if (sh != head_sh)
1093 goto again;
1094 }
1095}
1096
1097static struct dma_async_tx_descriptor *
1098async_copy_data(int frombio, struct bio *bio, struct page **page,
1099 sector_t sector, struct dma_async_tx_descriptor *tx,
917 if (test_bit(R5_Discard, &sh->dev[i].flags))
918 op = REQ_OP_DISCARD;
919 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
920 op = REQ_OP_READ;
921 else if (test_and_clear_bit(R5_WantReplace,
922 &sh->dev[i].flags)) {
923 op = REQ_OP_WRITE;
924 replace_only = 1;

--- 185 unchanged lines hidden (view full) ---

1110 if (sh != head_sh)
1111 goto again;
1112 }
1113}
1114
1115static struct dma_async_tx_descriptor *
1116async_copy_data(int frombio, struct bio *bio, struct page **page,
1117 sector_t sector, struct dma_async_tx_descriptor *tx,
1100 struct stripe_head *sh, int no_skipcopy)
1118 struct stripe_head *sh)
1101{
1102 struct bio_vec bvl;
1103 struct bvec_iter iter;
1104 struct page *bio_page;
1105 int page_offset;
1106 struct async_submit_ctl submit;
1107 enum async_tx_flags flags = 0;
1108

--- 23 unchanged lines hidden (view full) ---

1132 clen = len;
1133
1134 if (clen > 0) {
1135 b_offset += bvl.bv_offset;
1136 bio_page = bvl.bv_page;
1137 if (frombio) {
1138 if (sh->raid_conf->skip_copy &&
1139 b_offset == 0 && page_offset == 0 &&
1119{
1120 struct bio_vec bvl;
1121 struct bvec_iter iter;
1122 struct page *bio_page;
1123 int page_offset;
1124 struct async_submit_ctl submit;
1125 enum async_tx_flags flags = 0;
1126

--- 23 unchanged lines hidden (view full) ---

1150 clen = len;
1151
1152 if (clen > 0) {
1153 b_offset += bvl.bv_offset;
1154 bio_page = bvl.bv_page;
1155 if (frombio) {
1156 if (sh->raid_conf->skip_copy &&
1157 b_offset == 0 && page_offset == 0 &&
1140 clen == STRIPE_SIZE &&
1141 !no_skipcopy)
1158 clen == STRIPE_SIZE)
1142 *page = bio_page;
1143 else
1144 tx = async_memcpy(*page, bio_page, page_offset,
1145 b_offset, clen, &submit);
1146 } else
1147 tx = async_memcpy(bio_page, *page, b_offset,
1148 page_offset, clen, &submit);
1149 }

--- 65 unchanged lines hidden (view full) ---

1215 struct bio *rbi;
1216 spin_lock_irq(&sh->stripe_lock);
1217 dev->read = rbi = dev->toread;
1218 dev->toread = NULL;
1219 spin_unlock_irq(&sh->stripe_lock);
1220 while (rbi && rbi->bi_iter.bi_sector <
1221 dev->sector + STRIPE_SECTORS) {
1222 tx = async_copy_data(0, rbi, &dev->page,
1159 *page = bio_page;
1160 else
1161 tx = async_memcpy(*page, bio_page, page_offset,
1162 b_offset, clen, &submit);
1163 } else
1164 tx = async_memcpy(bio_page, *page, b_offset,
1165 page_offset, clen, &submit);
1166 }

--- 65 unchanged lines hidden (view full) ---

1232 struct bio *rbi;
1233 spin_lock_irq(&sh->stripe_lock);
1234 dev->read = rbi = dev->toread;
1235 dev->toread = NULL;
1236 spin_unlock_irq(&sh->stripe_lock);
1237 while (rbi && rbi->bi_iter.bi_sector <
1238 dev->sector + STRIPE_SECTORS) {
1239 tx = async_copy_data(0, rbi, &dev->page,
1223 dev->sector, tx, sh, 0);
1240 dev->sector, tx, sh);
1224 rbi = r5_next_bio(rbi, dev->sector);
1225 }
1226 }
1227 }
1228
1229 atomic_inc(&sh->count);
1230 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1231 async_trigger_callback(&submit);

--- 110 unchanged lines hidden (view full) ---

1342 i = d0_idx;
1343 do {
1344 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1345 struct r5dev *dev = &sh->dev[i];
1346
1347 if (i == sh->qd_idx || i == sh->pd_idx ||
1348 (srctype == SYNDROME_SRC_ALL) ||
1349 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1241 rbi = r5_next_bio(rbi, dev->sector);
1242 }
1243 }
1244 }
1245
1246 atomic_inc(&sh->count);
1247 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1248 async_trigger_callback(&submit);

--- 110 unchanged lines hidden (view full) ---

1359 i = d0_idx;
1360 do {
1361 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1362 struct r5dev *dev = &sh->dev[i];
1363
1364 if (i == sh->qd_idx || i == sh->pd_idx ||
1365 (srctype == SYNDROME_SRC_ALL) ||
1366 (srctype == SYNDROME_SRC_WANT_DRAIN &&
1350 (test_bit(R5_Wantdrain, &dev->flags) ||
1351 test_bit(R5_InJournal, &dev->flags))) ||
1367 test_bit(R5_Wantdrain, &dev->flags)) ||
1352 (srctype == SYNDROME_SRC_WRITTEN &&
1368 (srctype == SYNDROME_SRC_WRITTEN &&
1353 dev->written)) {
1354 if (test_bit(R5_InJournal, &dev->flags))
1355 srcs[slot] = sh->dev[i].orig_page;
1356 else
1357 srcs[slot] = sh->dev[i].page;
1358 }
1369 dev->written))
1370 srcs[slot] = sh->dev[i].page;
1359 i = raid6_next_disk(i, disks);
1360 } while (i != d0_idx);
1361
1362 return syndrome_disks;
1363}
1364
1365static struct dma_async_tx_descriptor *
1366ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)

--- 162 unchanged lines hidden (view full) ---

1529}
1530
1531static void ops_complete_prexor(void *stripe_head_ref)
1532{
1533 struct stripe_head *sh = stripe_head_ref;
1534
1535 pr_debug("%s: stripe %llu\n", __func__,
1536 (unsigned long long)sh->sector);
1371 i = raid6_next_disk(i, disks);
1372 } while (i != d0_idx);
1373
1374 return syndrome_disks;
1375}
1376
1377static struct dma_async_tx_descriptor *
1378ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)

--- 162 unchanged lines hidden (view full) ---

1541}
1542
1543static void ops_complete_prexor(void *stripe_head_ref)
1544{
1545 struct stripe_head *sh = stripe_head_ref;
1546
1547 pr_debug("%s: stripe %llu\n", __func__,
1548 (unsigned long long)sh->sector);
1537
1538 if (r5c_is_writeback(sh->raid_conf->log))
1539 /*
1540 * raid5-cache write back uses orig_page during prexor.
1541 * After prexor, it is time to free orig_page
1542 */
1543 r5c_release_extra_page(sh);
1544}
1545
1546static struct dma_async_tx_descriptor *
1547ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1548 struct dma_async_tx_descriptor *tx)
1549{
1550 int disks = sh->disks;
1551 struct page **xor_srcs = to_addr_page(percpu, 0);

--- 5 unchanged lines hidden (view full) ---

1557
1558 BUG_ON(sh->batch_head);
1559 pr_debug("%s: stripe %llu\n", __func__,
1560 (unsigned long long)sh->sector);
1561
1562 for (i = disks; i--; ) {
1563 struct r5dev *dev = &sh->dev[i];
1564 /* Only process blocks that are known to be uptodate */
1549}
1550
1551static struct dma_async_tx_descriptor *
1552ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1553 struct dma_async_tx_descriptor *tx)
1554{
1555 int disks = sh->disks;
1556 struct page **xor_srcs = to_addr_page(percpu, 0);

--- 5 unchanged lines hidden (view full) ---

1562
1563 BUG_ON(sh->batch_head);
1564 pr_debug("%s: stripe %llu\n", __func__,
1565 (unsigned long long)sh->sector);
1566
1567 for (i = disks; i--; ) {
1568 struct r5dev *dev = &sh->dev[i];
1569 /* Only process blocks that are known to be uptodate */
1565 if (test_bit(R5_InJournal, &dev->flags))
1566 xor_srcs[count++] = dev->orig_page;
1567 else if (test_bit(R5_Wantdrain, &dev->flags))
1570 if (test_bit(R5_Wantdrain, &dev->flags))
1568 xor_srcs[count++] = dev->page;
1569 }
1570
1571 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1572 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1573 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1574
1575 return tx;

--- 17 unchanged lines hidden (view full) ---

1593 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1594
1595 return tx;
1596}
1597
1598static struct dma_async_tx_descriptor *
1599ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1600{
1571 xor_srcs[count++] = dev->page;
1572 }
1573
1574 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1575 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1576 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1577
1578 return tx;

--- 17 unchanged lines hidden (view full) ---

1596 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1597
1598 return tx;
1599}
1600
1601static struct dma_async_tx_descriptor *
1602ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1603{
1601 struct r5conf *conf = sh->raid_conf;
1602 int disks = sh->disks;
1603 int i;
1604 struct stripe_head *head_sh = sh;
1605
1606 pr_debug("%s: stripe %llu\n", __func__,
1607 (unsigned long long)sh->sector);
1608
1609 for (i = disks; i--; ) {
1610 struct r5dev *dev;
1611 struct bio *chosen;
1612
1613 sh = head_sh;
1614 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1615 struct bio *wbi;
1616
1617again:
1618 dev = &sh->dev[i];
1604 int disks = sh->disks;
1605 int i;
1606 struct stripe_head *head_sh = sh;
1607
1608 pr_debug("%s: stripe %llu\n", __func__,
1609 (unsigned long long)sh->sector);
1610
1611 for (i = disks; i--; ) {
1612 struct r5dev *dev;
1613 struct bio *chosen;
1614
1615 sh = head_sh;
1616 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1617 struct bio *wbi;
1618
1619again:
1620 dev = &sh->dev[i];
1619 /*
 1620 * clear R5_InJournal, so that when rewriting a page in the
 1621 * journal it is not skipped by r5l_log_stripe()
1622 */
1623 clear_bit(R5_InJournal, &dev->flags);
1624 spin_lock_irq(&sh->stripe_lock);
1625 chosen = dev->towrite;
1626 dev->towrite = NULL;
1627 sh->overwrite_disks = 0;
1628 BUG_ON(dev->written);
1629 wbi = dev->written = chosen;
1630 spin_unlock_irq(&sh->stripe_lock);
1631 WARN_ON(dev->page != dev->orig_page);
1632
1633 while (wbi && wbi->bi_iter.bi_sector <
1634 dev->sector + STRIPE_SECTORS) {
1635 if (wbi->bi_opf & REQ_FUA)
1636 set_bit(R5_WantFUA, &dev->flags);
1637 if (wbi->bi_opf & REQ_SYNC)
1638 set_bit(R5_SyncIO, &dev->flags);
1639 if (bio_op(wbi) == REQ_OP_DISCARD)
1640 set_bit(R5_Discard, &dev->flags);
1641 else {
1642 tx = async_copy_data(1, wbi, &dev->page,
1621 spin_lock_irq(&sh->stripe_lock);
1622 chosen = dev->towrite;
1623 dev->towrite = NULL;
1624 sh->overwrite_disks = 0;
1625 BUG_ON(dev->written);
1626 wbi = dev->written = chosen;
1627 spin_unlock_irq(&sh->stripe_lock);
1628 WARN_ON(dev->page != dev->orig_page);
1629
1630 while (wbi && wbi->bi_iter.bi_sector <
1631 dev->sector + STRIPE_SECTORS) {
1632 if (wbi->bi_opf & REQ_FUA)
1633 set_bit(R5_WantFUA, &dev->flags);
1634 if (wbi->bi_opf & REQ_SYNC)
1635 set_bit(R5_SyncIO, &dev->flags);
1636 if (bio_op(wbi) == REQ_OP_DISCARD)
1637 set_bit(R5_Discard, &dev->flags);
1638 else {
1639 tx = async_copy_data(1, wbi, &dev->page,
1643 dev->sector, tx, sh,
1644 r5c_is_writeback(conf->log));
1645 if (dev->page != dev->orig_page &&
1646 !r5c_is_writeback(conf->log)) {
1640 dev->sector, tx, sh);
1641 if (dev->page != dev->orig_page) {
1647 set_bit(R5_SkipCopy, &dev->flags);
1648 clear_bit(R5_UPTODATE, &dev->flags);
1649 clear_bit(R5_OVERWRITE, &dev->flags);
1650 }
1651 }
1652 wbi = r5_next_bio(wbi, dev->sector);
1653 }
1654

--- 91 unchanged lines hidden (view full) ---

1746 /* check if prexor is active which means only process blocks
1747 * that are part of a read-modify-write (written)
1748 */
1749 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1750 prexor = 1;
1751 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1752 for (i = disks; i--; ) {
1753 struct r5dev *dev = &sh->dev[i];
1642 set_bit(R5_SkipCopy, &dev->flags);
1643 clear_bit(R5_UPTODATE, &dev->flags);
1644 clear_bit(R5_OVERWRITE, &dev->flags);
1645 }
1646 }
1647 wbi = r5_next_bio(wbi, dev->sector);
1648 }
1649

--- 91 unchanged lines hidden (view full) ---

1741 /* check if prexor is active which means only process blocks
1742 * that are part of a read-modify-write (written)
1743 */
1744 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1745 prexor = 1;
1746 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1747 for (i = disks; i--; ) {
1748 struct r5dev *dev = &sh->dev[i];
1754 if (head_sh->dev[i].written ||
1755 test_bit(R5_InJournal, &head_sh->dev[i].flags))
1749 if (head_sh->dev[i].written)
1756 xor_srcs[count++] = dev->page;
1757 }
1758 } else {
1759 xor_dest = sh->dev[pd_idx].page;
1760 for (i = disks; i--; ) {
1761 struct r5dev *dev = &sh->dev[i];
1762 if (i != pd_idx)
1763 xor_srcs[count++] = dev->page;

--- 237 unchanged lines hidden (view full) ---

2001 int i;
2002
2003 sh = kmem_cache_zalloc(sc, gfp);
2004 if (sh) {
2005 spin_lock_init(&sh->stripe_lock);
2006 spin_lock_init(&sh->batch_lock);
2007 INIT_LIST_HEAD(&sh->batch_list);
2008 INIT_LIST_HEAD(&sh->lru);
1750 xor_srcs[count++] = dev->page;
1751 }
1752 } else {
1753 xor_dest = sh->dev[pd_idx].page;
1754 for (i = disks; i--; ) {
1755 struct r5dev *dev = &sh->dev[i];
1756 if (i != pd_idx)
1757 xor_srcs[count++] = dev->page;

--- 237 unchanged lines hidden (view full) ---

1995 int i;
1996
1997 sh = kmem_cache_zalloc(sc, gfp);
1998 if (sh) {
1999 spin_lock_init(&sh->stripe_lock);
2000 spin_lock_init(&sh->batch_lock);
2001 INIT_LIST_HEAD(&sh->batch_list);
2002 INIT_LIST_HEAD(&sh->lru);
2009 INIT_LIST_HEAD(&sh->r5c);
2010 atomic_set(&sh->count, 1);
2003 atomic_set(&sh->count, 1);
2011 sh->log_start = MaxSector;
2012 for (i = 0; i < disks; i++) {
2013 struct r5dev *dev = &sh->dev[i];
2014
2004 for (i = 0; i < disks; i++) {
2005 struct r5dev *dev = &sh->dev[i];
2006
2015 bio_init(&dev->req);
2016 dev->req.bi_io_vec = &dev->vec;
2017 dev->req.bi_max_vecs = 1;
2018
2019 bio_init(&dev->rreq);
2020 dev->rreq.bi_io_vec = &dev->rvec;
2021 dev->rreq.bi_max_vecs = 1;
2007 bio_init(&dev->req, &dev->vec, 1);
2008 bio_init(&dev->rreq, &dev->rvec, 1);
2022 }
2023 }
2024 return sh;
2025}
2026static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2027{
2028 struct stripe_head *sh;
2029

--- 320 unchanged lines hidden (view full) ---

2350 s = sh->sector + rdev->data_offset;
2351 if (!bi->bi_error) {
2352 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2353 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2354 /* Note that this cannot happen on a
2355 * replacement device. We just fail those on
2356 * any error
2357 */
2009 }
2010 }
2011 return sh;
2012}
2013static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2014{
2015 struct stripe_head *sh;
2016

--- 320 unchanged lines hidden (view full) ---

2337 s = sh->sector + rdev->data_offset;
2338 if (!bi->bi_error) {
2339 set_bit(R5_UPTODATE, &sh->dev[i].flags);
2340 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2341 /* Note that this cannot happen on a
2342 * replacement device. We just fail those on
2343 * any error
2344 */
2358 pr_info_ratelimited(
2359 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2345 printk_ratelimited(
2346 KERN_INFO
2347 "md/raid:%s: read error corrected"
2348 " (%lu sectors at %llu on %s)\n",
2360 mdname(conf->mddev), STRIPE_SECTORS,
2361 (unsigned long long)s,
2362 bdevname(rdev->bdev, b));
2363 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2364 clear_bit(R5_ReadError, &sh->dev[i].flags);
2365 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2366 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2367 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2368
2369 if (atomic_read(&rdev->read_errors))
2370 atomic_set(&rdev->read_errors, 0);
2371 } else {
2372 const char *bdn = bdevname(rdev->bdev, b);
2373 int retry = 0;
2374 int set_bad = 0;
2375
2376 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2377 atomic_inc(&rdev->read_errors);
2378 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2349 mdname(conf->mddev), STRIPE_SECTORS,
2350 (unsigned long long)s,
2351 bdevname(rdev->bdev, b));
2352 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2353 clear_bit(R5_ReadError, &sh->dev[i].flags);
2354 clear_bit(R5_ReWrite, &sh->dev[i].flags);
2355 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2356 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2357
2358 if (atomic_read(&rdev->read_errors))
2359 atomic_set(&rdev->read_errors, 0);
2360 } else {
2361 const char *bdn = bdevname(rdev->bdev, b);
2362 int retry = 0;
2363 int set_bad = 0;
2364
2365 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2366 atomic_inc(&rdev->read_errors);
2367 if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2379 pr_warn_ratelimited(
2380 "md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2368 printk_ratelimited(
2369 KERN_WARNING
2370 "md/raid:%s: read error on replacement device "
2371 "(sector %llu on %s).\n",
2381 mdname(conf->mddev),
2382 (unsigned long long)s,
2383 bdn);
2384 else if (conf->mddev->degraded >= conf->max_degraded) {
2385 set_bad = 1;
2372 mdname(conf->mddev),
2373 (unsigned long long)s,
2374 bdn);
2375 else if (conf->mddev->degraded >= conf->max_degraded) {
2376 set_bad = 1;
2386 pr_warn_ratelimited(
2387 "md/raid:%s: read error not correctable (sector %llu on %s).\n",
2377 printk_ratelimited(
2378 KERN_WARNING
2379 "md/raid:%s: read error not correctable "
2380 "(sector %llu on %s).\n",
2388 mdname(conf->mddev),
2389 (unsigned long long)s,
2390 bdn);
2391 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2392 /* Oh, no!!! */
2393 set_bad = 1;
2381 mdname(conf->mddev),
2382 (unsigned long long)s,
2383 bdn);
2384 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2385 /* Oh, no!!! */
2386 set_bad = 1;
2394 pr_warn_ratelimited(
2395 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2387 printk_ratelimited(
2388 KERN_WARNING
2389 "md/raid:%s: read error NOT corrected!! "
2390 "(sector %llu on %s).\n",
2396 mdname(conf->mddev),
2397 (unsigned long long)s,
2398 bdn);
2399 } else if (atomic_read(&rdev->read_errors)
2400 > conf->max_nr_stripes)
2391 mdname(conf->mddev),
2392 (unsigned long long)s,
2393 bdn);
2394 } else if (atomic_read(&rdev->read_errors)
2395 > conf->max_nr_stripes)
2401 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2396 printk(KERN_WARNING
2397 "md/raid:%s: Too many read errors, failing device %s.\n",
2402 mdname(conf->mddev), bdn);
2403 else
2404 retry = 1;
2405 if (set_bad && test_bit(In_sync, &rdev->flags)
2406 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2407 retry = 1;
2408 if (retry)
2409 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {

--- 117 unchanged lines hidden (view full) ---

2527 mddev->degraded = calc_degraded(conf);
2528 spin_unlock_irqrestore(&conf->device_lock, flags);
2529 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2530
2531 set_bit(Blocked, &rdev->flags);
2532 set_bit(Faulty, &rdev->flags);
2533 set_mask_bits(&mddev->flags, 0,
2534 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
2398 mdname(conf->mddev), bdn);
2399 else
2400 retry = 1;
2401 if (set_bad && test_bit(In_sync, &rdev->flags)
2402 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2403 retry = 1;
2404 if (retry)
2405 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {

--- 117 unchanged lines hidden (view full) ---

2523 mddev->degraded = calc_degraded(conf);
2524 spin_unlock_irqrestore(&conf->device_lock, flags);
2525 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2526
2527 set_bit(Blocked, &rdev->flags);
2528 set_bit(Faulty, &rdev->flags);
2529 set_mask_bits(&mddev->flags, 0,
2530 BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
2535 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2536 "md/raid:%s: Operation continuing on %d devices.\n",
2537 mdname(mddev),
2538 bdevname(rdev->bdev, b),
2539 mdname(mddev),
2540 conf->raid_disks - mddev->degraded);
2531 printk(KERN_ALERT
2532 "md/raid:%s: Disk failure on %s, disabling device.\n"
2533 "md/raid:%s: Operation continuing on %d devices.\n",
2534 mdname(mddev),
2535 bdevname(rdev->bdev, b),
2536 mdname(mddev),
2537 conf->raid_disks - mddev->degraded);
2541}
2542
2543/*
2544 * Input: a 'big' sector number,
2545 * Output: index of the data and parity disk, and the sector # in them.
2546 */
2547sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2548 int previous, int *dd_idx,

--- 305 unchanged lines hidden (view full) ---

2854
2855 chunk_number = stripe * data_disks + i;
2856 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2857
2858 check = raid5_compute_sector(conf, r_sector,
2859 previous, &dummy1, &sh2);
2860 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2861 || sh2.qd_idx != sh->qd_idx) {
2538}
2539
2540/*
2541 * Input: a 'big' sector number,
2542 * Output: index of the data and parity disk, and the sector # in them.
2543 */
2544sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2545 int previous, int *dd_idx,

--- 305 unchanged lines hidden (view full) ---

2851
2852 chunk_number = stripe * data_disks + i;
2853 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2854
2855 check = raid5_compute_sector(conf, r_sector,
2856 previous, &dummy1, &sh2);
2857 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2858 || sh2.qd_idx != sh->qd_idx) {
2862 pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
2863 mdname(conf->mddev));
2859 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2860 mdname(conf->mddev));
2864 return 0;
2865 }
2866 return r_sector;
2867}
2868
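For orientation, a hedged user-space sketch of the forward mapping that raid5_compute_sector() performs and that the consistency check above inverts: an array-relative sector is split into (stripe, data-disk index, offset) with parity rotating across devices. The disk count, chunk size and parity-rotation rule here are invented for the example; the real code supports many layouts and both RAID-5 and RAID-6.

/* Editorial sketch, not the exact md algorithm. */
#include <stdio.h>

int main(void)
{
    const unsigned int raid_disks = 5, data_disks = 4;
    const unsigned int sectors_per_chunk = 64;
    unsigned long long r_sector = 1000;             /* array-relative sector */

    unsigned long long chunk_number = r_sector / sectors_per_chunk;
    unsigned int chunk_offset = r_sector % sectors_per_chunk;
    unsigned long long stripe = chunk_number / data_disks;
    unsigned int i = chunk_number % data_disks;     /* data chunk within stripe */

    /* simple rotating parity, chosen only for illustration */
    unsigned int pd_idx = (raid_disks - 1) - (unsigned int)(stripe % raid_disks);
    unsigned int dd_idx = (pd_idx + 1 + i) % raid_disks;
    unsigned long long new_sector = stripe * sectors_per_chunk + chunk_offset;

    /* inverse, as in the check above: chunk_number = stripe * data_disks + i */
    printf("r_sector %llu -> stripe %llu, dd_idx %u, pd_idx %u, dev sector %llu\n",
           r_sector, stripe, dd_idx, pd_idx, new_sector);
    return 0;
}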
2869static void
2870schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2871 int rcw, int expand)
2872{
2873 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
2874 struct r5conf *conf = sh->raid_conf;
2875 int level = conf->level;
2876
2877 if (rcw) {
2861 return 0;
2862 }
2863 return r_sector;
2864}
2865
2866static void
2867schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2868 int rcw, int expand)
2869{
2870 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
2871 struct r5conf *conf = sh->raid_conf;
2872 int level = conf->level;
2873
2874 if (rcw) {
2878 /*
 2879 * In some cases, handle_stripe_dirtying initially decides to
 2880 * run rmw and allocates an extra page for prexor. However, rcw turns
 2881 * out to be cheaper later on. We need to free the extra page now,
2882 * because we won't be able to do that in ops_complete_prexor().
2883 */
2884 r5c_release_extra_page(sh);
2885
2886 for (i = disks; i--; ) {
2887 struct r5dev *dev = &sh->dev[i];
2888
2889 if (dev->towrite) {
2890 set_bit(R5_LOCKED, &dev->flags);
2891 set_bit(R5_Wantdrain, &dev->flags);
2892 if (!expand)
2893 clear_bit(R5_UPTODATE, &dev->flags);
2894 s->locked++;
2875
2876 for (i = disks; i--; ) {
2877 struct r5dev *dev = &sh->dev[i];
2878
2879 if (dev->towrite) {
2880 set_bit(R5_LOCKED, &dev->flags);
2881 set_bit(R5_Wantdrain, &dev->flags);
2882 if (!expand)
2883 clear_bit(R5_UPTODATE, &dev->flags);
2884 s->locked++;
2895 } else if (test_bit(R5_InJournal, &dev->flags)) {
2896 set_bit(R5_LOCKED, &dev->flags);
2897 s->locked++;
2898 }
2899 }
2900 /* if we are not expanding this is a proper write request, and
2901 * there will be bios with new data to be drained into the
2902 * stripe cache
2903 */
2904 if (!expand) {
2905 if (!s->locked)

--- 23 unchanged lines hidden (view full) ---

2929
2930 if (dev->towrite &&
2931 (test_bit(R5_UPTODATE, &dev->flags) ||
2932 test_bit(R5_Wantcompute, &dev->flags))) {
2933 set_bit(R5_Wantdrain, &dev->flags);
2934 set_bit(R5_LOCKED, &dev->flags);
2935 clear_bit(R5_UPTODATE, &dev->flags);
2936 s->locked++;
2885 }
2886 }
2887 /* if we are not expanding this is a proper write request, and
2888 * there will be bios with new data to be drained into the
2889 * stripe cache
2890 */
2891 if (!expand) {
2892 if (!s->locked)

--- 23 unchanged lines hidden (view full) ---

2916
2917 if (dev->towrite &&
2918 (test_bit(R5_UPTODATE, &dev->flags) ||
2919 test_bit(R5_Wantcompute, &dev->flags))) {
2920 set_bit(R5_Wantdrain, &dev->flags);
2921 set_bit(R5_LOCKED, &dev->flags);
2922 clear_bit(R5_UPTODATE, &dev->flags);
2923 s->locked++;
2937 } else if (test_bit(R5_InJournal, &dev->flags)) {
2938 set_bit(R5_LOCKED, &dev->flags);
2939 s->locked++;
2940 }
2941 }
2942 if (!s->locked)
2943 /* False alarm - nothing to do */
2944 return;
2945 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2946 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2947 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);

--- 655 unchanged lines hidden (view full) ---

3603 */
3604 rcw = 1; rmw = 2;
3605 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3606 conf->rmw_level, (unsigned long long)recovery_cp,
3607 (unsigned long long)sh->sector);
3608 } else for (i = disks; i--; ) {
3609 /* would I have to read this buffer for read_modify_write */
3610 struct r5dev *dev = &sh->dev[i];
2924 }
2925 }
2926 if (!s->locked)
2927 /* False alarm - nothing to do */
2928 return;
2929 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2930 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2931 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);

--- 655 unchanged lines hidden (view full) ---

3587 */
3588 rcw = 1; rmw = 2;
3589 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3590 conf->rmw_level, (unsigned long long)recovery_cp,
3591 (unsigned long long)sh->sector);
3592 } else for (i = disks; i--; ) {
3593 /* would I have to read this buffer for read_modify_write */
3594 struct r5dev *dev = &sh->dev[i];
3611 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
3612 test_bit(R5_InJournal, &dev->flags)) &&
3595 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
3613 !test_bit(R5_LOCKED, &dev->flags) &&
3596 !test_bit(R5_LOCKED, &dev->flags) &&
3614 !((test_bit(R5_UPTODATE, &dev->flags) &&
3615 (!test_bit(R5_InJournal, &dev->flags) ||
3616 dev->page != dev->orig_page)) ||
3597 !(test_bit(R5_UPTODATE, &dev->flags) ||
3617 test_bit(R5_Wantcompute, &dev->flags))) {
3618 if (test_bit(R5_Insync, &dev->flags))
3619 rmw++;
3620 else
3621 rmw += 2*disks; /* cannot read it */
3622 }
3623 /* Would I have to read this buffer for reconstruct_write */
3624 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3625 i != sh->pd_idx && i != sh->qd_idx &&
3626 !test_bit(R5_LOCKED, &dev->flags) &&
3627 !(test_bit(R5_UPTODATE, &dev->flags) ||
3598 test_bit(R5_Wantcompute, &dev->flags))) {
3599 if (test_bit(R5_Insync, &dev->flags))
3600 rmw++;
3601 else
3602 rmw += 2*disks; /* cannot read it */
3603 }
3604 /* Would I have to read this buffer for reconstruct_write */
3605 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3606 i != sh->pd_idx && i != sh->qd_idx &&
3607 !test_bit(R5_LOCKED, &dev->flags) &&
3608 !(test_bit(R5_UPTODATE, &dev->flags) ||
3628 test_bit(R5_InJournal, &dev->flags) ||
3629 test_bit(R5_Wantcompute, &dev->flags))) {
3609 test_bit(R5_Wantcompute, &dev->flags))) {
3630 if (test_bit(R5_Insync, &dev->flags))
3631 rcw++;
3632 else
3633 rcw += 2*disks;
3634 }
3635 }
3610 if (test_bit(R5_Insync, &dev->flags))
3611 rcw++;
3612 else
3613 rcw += 2*disks;
3614 }
3615 }
3636
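As a rough worked example of the rmw/rcw cost comparison counted above and applied just below (illustrative only; the real counters also penalise blocks that cannot be read by 2*disks): for a 5-disk RAID-5 stripe with a single data block being rewritten, read-modify-write needs the old data block plus the old parity, while reconstruct-write needs the three untouched data blocks.

/* Editorial sketch, not kernel code. */
#include <stdio.h>

int main(void)
{
    int disks = 5, blocks_written = 1;
    int rmw = blocks_written + 1;            /* old data block(s) + old parity */
    int rcw = (disks - 1) - blocks_written;  /* untouched data blocks */

    printf("rmw=%d rcw=%d -> prefer %s\n", rmw, rcw, rmw < rcw ? "rmw" : "rcw");
    return 0;
}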
3637 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3638 (unsigned long long)sh->sector, rmw, rcw);
3639 set_bit(STRIPE_HANDLE, &sh->state);
3640 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3641 /* prefer read-modify-write, but need to get some data */
3642 if (conf->mddev->queue)
3643 blk_add_trace_msg(conf->mddev->queue,
3644 "raid5 rmw %llu %d",
3645 (unsigned long long)sh->sector, rmw);
3646 for (i = disks; i--; ) {
3647 struct r5dev *dev = &sh->dev[i];
3616 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3617 (unsigned long long)sh->sector, rmw, rcw);
3618 set_bit(STRIPE_HANDLE, &sh->state);
3619 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3620 /* prefer read-modify-write, but need to get some data */
3621 if (conf->mddev->queue)
3622 blk_add_trace_msg(conf->mddev->queue,
3623 "raid5 rmw %llu %d",
3624 (unsigned long long)sh->sector, rmw);
3625 for (i = disks; i--; ) {
3626 struct r5dev *dev = &sh->dev[i];
3648 if (test_bit(R5_InJournal, &dev->flags) &&
3649 dev->page == dev->orig_page &&
3650 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
3651 /* alloc page for prexor */
3652 dev->orig_page = alloc_page(GFP_NOIO);
3653
 3654 /* will handle failure in a later patch */
3655 BUG_ON(!dev->orig_page);
3656 }
3657
3658 if ((dev->towrite ||
3659 i == sh->pd_idx || i == sh->qd_idx ||
3660 test_bit(R5_InJournal, &dev->flags)) &&
3627 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
3661 !test_bit(R5_LOCKED, &dev->flags) &&
3628 !test_bit(R5_LOCKED, &dev->flags) &&
3662 !((test_bit(R5_UPTODATE, &dev->flags) &&
3663 (!test_bit(R5_InJournal, &dev->flags) ||
3664 dev->page != dev->orig_page)) ||
3665 test_bit(R5_Wantcompute, &dev->flags)) &&
3629 !(test_bit(R5_UPTODATE, &dev->flags) ||
3630 test_bit(R5_Wantcompute, &dev->flags)) &&
3666 test_bit(R5_Insync, &dev->flags)) {
3667 if (test_bit(STRIPE_PREREAD_ACTIVE,
3668 &sh->state)) {
3669 pr_debug("Read_old block %d for r-m-w\n",
3670 i);
3671 set_bit(R5_LOCKED, &dev->flags);
3672 set_bit(R5_Wantread, &dev->flags);
3673 s->locked++;

--- 9 unchanged lines hidden (view full) ---

3683 int qread =0;
3684 rcw = 0;
3685 for (i = disks; i--; ) {
3686 struct r5dev *dev = &sh->dev[i];
3687 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3688 i != sh->pd_idx && i != sh->qd_idx &&
3689 !test_bit(R5_LOCKED, &dev->flags) &&
3690 !(test_bit(R5_UPTODATE, &dev->flags) ||
3631 test_bit(R5_Insync, &dev->flags)) {
3632 if (test_bit(STRIPE_PREREAD_ACTIVE,
3633 &sh->state)) {
3634 pr_debug("Read_old block %d for r-m-w\n",
3635 i);
3636 set_bit(R5_LOCKED, &dev->flags);
3637 set_bit(R5_Wantread, &dev->flags);
3638 s->locked++;

--- 9 unchanged lines hidden (view full) ---

3648 int qread =0;
3649 rcw = 0;
3650 for (i = disks; i--; ) {
3651 struct r5dev *dev = &sh->dev[i];
3652 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3653 i != sh->pd_idx && i != sh->qd_idx &&
3654 !test_bit(R5_LOCKED, &dev->flags) &&
3655 !(test_bit(R5_UPTODATE, &dev->flags) ||
3691 test_bit(R5_InJournal, &dev->flags) ||
3692 test_bit(R5_Wantcompute, &dev->flags))) {
3693 rcw++;
3694 if (test_bit(R5_Insync, &dev->flags) &&
3695 test_bit(STRIPE_PREREAD_ACTIVE,
3696 &sh->state)) {
3697 pr_debug("Read_old block "
3698 "%d for Reconstruct\n", i);
3699 set_bit(R5_LOCKED, &dev->flags);

--- 23 unchanged lines hidden (view full) ---

3723 * case where a compute block operation has been submitted and then a
3724 * subsequent call wants to start a write request. raid_run_ops only
3725 * handles the case where compute block and reconstruct are requested
3726 * simultaneously. If this is not the case then new writes need to be
3727 * held off until the compute completes.
3728 */
3729 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
3730 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
3656 test_bit(R5_Wantcompute, &dev->flags))) {
3657 rcw++;
3658 if (test_bit(R5_Insync, &dev->flags) &&
3659 test_bit(STRIPE_PREREAD_ACTIVE,
3660 &sh->state)) {
3661 pr_debug("Read_old block "
3662 "%d for Reconstruct\n", i);
3663 set_bit(R5_LOCKED, &dev->flags);

--- 23 unchanged lines hidden (view full) ---

3687 * case where a compute block operation has been submitted and then a
3688 * subsequent call wants to start a write request. raid_run_ops only
3689 * handles the case where compute block and reconstruct are requested
3690 * simultaneously. If this is not the case then new writes need to be
3691 * held off until the compute completes.
3692 */
3693 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
3694 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
3731 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
3695 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
3732 schedule_reconstruction(sh, s, rcw == 0, 0);
3733}
3734
3735static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3736 struct stripe_head_state *s, int disks)
3737{
3738 struct r5dev *dev = NULL;
3739

--- 68 unchanged lines hidden (view full) ---

3808 sh->ops.target2 = -1;
3809 s->uptodate++;
3810 }
3811 }
3812 break;
3813 case check_state_compute_run:
3814 break;
3815 default:
3696 schedule_reconstruction(sh, s, rcw == 0, 0);
3697}
3698
3699static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3700 struct stripe_head_state *s, int disks)
3701{
3702 struct r5dev *dev = NULL;
3703

--- 68 unchanged lines hidden (view full) ---

3772 sh->ops.target2 = -1;
3773 s->uptodate++;
3774 }
3775 }
3776 break;
3777 case check_state_compute_run:
3778 break;
3779 default:
3816 pr_err("%s: unknown check_state: %d sector: %llu\n",
3780 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3817 __func__, sh->check_state,
3818 (unsigned long long) sh->sector);
3819 BUG();
3820 }
3821}
3822
3823static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3824 struct stripe_head_state *s,

--- 147 unchanged lines hidden (view full) ---

3972 s->uptodate++;
3973 }
3974 }
3975 }
3976 break;
3977 case check_state_compute_run:
3978 break;
3979 default:
3781 __func__, sh->check_state,
3782 (unsigned long long) sh->sector);
3783 BUG();
3784 }
3785}
3786
3787static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3788 struct stripe_head_state *s,

--- 147 unchanged lines hidden (view full) ---

3936 s->uptodate++;
3937 }
3938 }
3939 }
3940 break;
3941 case check_state_compute_run:
3942 break;
3943 default:
3980 pr_warn("%s: unknown check_state: %d sector: %llu\n",
3981 __func__, sh->check_state,
3982 (unsigned long long) sh->sector);
3944 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3945 __func__, sh->check_state,
3946 (unsigned long long) sh->sector);
3983 BUG();
3984 }
3985}
3986
3987static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3988{
3989 int i;
3990

--- 223 unchanged lines hidden (view full) ---

4214 clear_bit(R5_Insync, &dev->flags);
4215 if (!test_bit(R5_Insync, &dev->flags)) {
4216 if (s->failed < 2)
4217 s->failed_num[s->failed] = i;
4218 s->failed++;
4219 if (rdev && !test_bit(Faulty, &rdev->flags))
4220 do_recovery = 1;
4221 }
3947 BUG();
3948 }
3949}
3950
3951static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3952{
3953 int i;
3954

--- 223 unchanged lines hidden (view full) ---

4178 clear_bit(R5_Insync, &dev->flags);
4179 if (!test_bit(R5_Insync, &dev->flags)) {
4180 if (s->failed < 2)
4181 s->failed_num[s->failed] = i;
4182 s->failed++;
4183 if (rdev && !test_bit(Faulty, &rdev->flags))
4184 do_recovery = 1;
4185 }
4222
4223 if (test_bit(R5_InJournal, &dev->flags))
4224 s->injournal++;
4225 if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4226 s->just_cached++;
4227 }
4228 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4229 /* If there is a failed device being replaced,
4230 * we must be recovering.
4231 * else if we are after recovery_cp, we must be syncing
4232 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4233 * else we can only be replacing
4234 * sync and recovery both need to read all devices, and so

--- 212 unchanged lines hidden (view full) ---

4447 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4448 BUG_ON(sh->qd_idx >= 0 &&
4449 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4450 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4451 for (i = disks; i--; ) {
4452 struct r5dev *dev = &sh->dev[i];
4453 if (test_bit(R5_LOCKED, &dev->flags) &&
4454 (i == sh->pd_idx || i == sh->qd_idx ||
4186 }
4187 if (test_bit(STRIPE_SYNCING, &sh->state)) {
4188 /* If there is a failed device being replaced,
4189 * we must be recovering.
4190 * else if we are after recovery_cp, we must be syncing
4191 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4192 * else we can only be replacing
4193 * sync and recovery both need to read all devices, and so

--- 212 unchanged lines hidden (view full) ---

4406 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4407 BUG_ON(sh->qd_idx >= 0 &&
4408 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4409 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4410 for (i = disks; i--; ) {
4411 struct r5dev *dev = &sh->dev[i];
4412 if (test_bit(R5_LOCKED, &dev->flags) &&
4413 (i == sh->pd_idx || i == sh->qd_idx ||
4455 dev->written || test_bit(R5_InJournal,
4456 &dev->flags))) {
4414 dev->written)) {
4457 pr_debug("Writing block %d\n", i);
4458 set_bit(R5_Wantwrite, &dev->flags);
4459 if (prexor)
4460 continue;
4461 if (s.failed > 1)
4462 continue;
4463 if (!test_bit(R5_Insync, &dev->flags) ||
4464 ((i == sh->pd_idx || i == sh->qd_idx) &&

--- 23 unchanged lines hidden (view full) ---

4488 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4489 test_bit(R5_Discard, &pdev->flags))))) &&
4490 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4491 && !test_bit(R5_LOCKED, &qdev->flags)
4492 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4493 test_bit(R5_Discard, &qdev->flags))))))
4494 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
4495
4415 pr_debug("Writing block %d\n", i);
4416 set_bit(R5_Wantwrite, &dev->flags);
4417 if (prexor)
4418 continue;
4419 if (s.failed > 1)
4420 continue;
4421 if (!test_bit(R5_Insync, &dev->flags) ||
4422 ((i == sh->pd_idx || i == sh->qd_idx) &&

--- 23 unchanged lines hidden (view full) ---

4446 && (test_bit(R5_UPTODATE, &pdev->flags) ||
4447 test_bit(R5_Discard, &pdev->flags))))) &&
4448 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4449 && !test_bit(R5_LOCKED, &qdev->flags)
4450 && (test_bit(R5_UPTODATE, &qdev->flags) ||
4451 test_bit(R5_Discard, &qdev->flags))))))
4452 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
4453
4496 if (s.just_cached)
4497 r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
4498 r5l_stripe_write_finished(sh);
4499
4500 /* Now we might consider reading some blocks, either to check/generate
4501 * parity, or to satisfy requests
4502 * or to load a block that is being partially written.
4503 */
4504 if (s.to_read || s.non_overwrite
4505 || (conf->level == 6 && s.to_write && s.failed)
4506 || (s.syncing && (s.uptodate + s.compute < disks))
4507 || s.replacing
4508 || s.expanding)
4509 handle_stripe_fill(sh, &s, disks);
4510
4454 /* Now we might consider reading some blocks, either to check/generate
4455 * parity, or to satisfy requests
4456 * or to load a block that is being partially written.
4457 */
4458 if (s.to_read || s.non_overwrite
4459 || (conf->level == 6 && s.to_write && s.failed)
4460 || (s.syncing && (s.uptodate + s.compute < disks))
4461 || s.replacing
4462 || s.expanding)
4463 handle_stripe_fill(sh, &s, disks);
4464
4511 /*
4512 * When the stripe finishes full journal write cycle (write to journal
4513 * and raid disk), this is the clean up procedure so it is ready for
4514 * next operation.
4515 */
4516 r5c_finish_stripe_write_out(conf, sh, &s);
4517
4518 /*
4519 * Now to consider new write requests, cache write back and what else,
4520 * if anything should be read. We do not handle new writes when:
4465 /* Now to consider new write requests and what else, if anything
4466 * should be read. We do not handle new writes when:
4521 * 1/ A 'write' operation (copy+xor) is already in flight.
4522 * 2/ A 'check' operation is in flight, as it may clobber the parity
4523 * block.
4467 * 1/ A 'write' operation (copy+xor) is already in flight.
4468 * 2/ A 'check' operation is in flight, as it may clobber the parity
4469 * block.
4524 * 3/ A r5c cache log write is in flight.
4525 */
4470 */
4471 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
4472 handle_stripe_dirtying(conf, sh, &s, disks);
4526
4473
4527 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4528 if (!r5c_is_writeback(conf->log)) {
4529 if (s.to_write)
4530 handle_stripe_dirtying(conf, sh, &s, disks);
4531 } else { /* write back cache */
4532 int ret = 0;
4533
4534 /* First, try handle writes in caching phase */
4535 if (s.to_write)
4536 ret = r5c_try_caching_write(conf, sh, &s,
4537 disks);
4538 /*
4539 * If caching phase failed: ret == -EAGAIN
4540 * OR
4541 * stripe under reclaim: !caching && injournal
4542 *
4543 * fall back to handle_stripe_dirtying()
4544 */
4545 if (ret == -EAGAIN ||
4546 /* stripe under reclaim: !caching && injournal */
4547 (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4548 s.injournal > 0))
4549 handle_stripe_dirtying(conf, sh, &s, disks);
4550 }
4551 }
4552
4553 /* maybe we need to check and possibly fix the parity for this stripe
4554 * Any reads will already have been scheduled, so we just see if enough
4555 * data is available. The parity check is held off while parity
4556 * dependent operations are in flight.
4557 */
4558 if (sh->check_state ||
4559 (s.syncing && s.locked == 0 &&
4560 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&

--- 153 unchanged lines hidden (view full) ---

4714 */
4715 atomic_dec(&conf->preread_active_stripes);
4716 if (atomic_read(&conf->preread_active_stripes) <
4717 IO_THRESHOLD)
4718 md_wakeup_thread(conf->mddev->thread);
4719 }
4720
4721 if (!bio_list_empty(&s.return_bi)) {
4474 /* maybe we need to check and possibly fix the parity for this stripe
4475 * Any reads will already have been scheduled, so we just see if enough
4476 * data is available. The parity check is held off while parity
4477 * dependent operations are in flight.
4478 */
4479 if (sh->check_state ||
4480 (s.syncing && s.locked == 0 &&
4481 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&

--- 153 unchanged lines hidden (view full) ---

4635 */
4636 atomic_dec(&conf->preread_active_stripes);
4637 if (atomic_read(&conf->preread_active_stripes) <
4638 IO_THRESHOLD)
4639 md_wakeup_thread(conf->mddev->thread);
4640 }
4641
4642 if (!bio_list_empty(&s.return_bi)) {
4722 if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
4643 if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
4644 (s.failed <= conf->max_degraded ||
4645 conf->mddev->external == 0)) {
4723 spin_lock_irq(&conf->device_lock);
4724 bio_list_merge(&conf->return_bi, &s.return_bi);
4725 spin_unlock_irq(&conf->device_lock);
4726 md_wakeup_thread(conf->mddev->thread);
4727 } else
4728 return_io(&s.return_bi);
4729 }
4730

--- 39 unchanged lines hidden (view full) ---

4770 struct r5conf *conf = mddev->private;
4771
4772 /* No difference between reads and writes. Just check
4773 * how busy the stripe_cache is
4774 */
4775
4776 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
4777 return 1;
4646 spin_lock_irq(&conf->device_lock);
4647 bio_list_merge(&conf->return_bi, &s.return_bi);
4648 spin_unlock_irq(&conf->device_lock);
4649 md_wakeup_thread(conf->mddev->thread);
4650 } else
4651 return_io(&s.return_bi);
4652 }
4653

--- 39 unchanged lines hidden (view full) ---

4693 struct r5conf *conf = mddev->private;
4694
4695 /* No difference between reads and writes. Just check
4696 * how busy the stripe_cache is
4697 */
4698
4699 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
4700 return 1;
4778
4779 /* Also checks whether there is pressure on r5cache log space */
4780 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
4781 return 1;
4782 if (conf->quiesce)
4783 return 1;
4784 if (atomic_read(&conf->empty_inactive_list_nr))
4785 return 1;
4786
4787 return 0;
4788}
4789

--- 453 unchanged lines hidden (view full) ---

5243 int dd_idx;
5244 sector_t new_sector;
5245 sector_t logical_sector, last_sector;
5246 struct stripe_head *sh;
5247 const int rw = bio_data_dir(bi);
5248 int remaining;
5249 DEFINE_WAIT(w);
5250 bool do_prepare;
5251 bool do_flush = false;
5252
5253 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5254 int ret = r5l_handle_flush_request(conf->log, bi);
5255
5256 if (ret == 0)
5257 return;
5258 if (ret == -ENODEV) {
5259 md_flush_request(mddev, bi);
5260 return;
5261 }
5262 /* ret == -EAGAIN, fallback */
5263 /*
5264 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
5265 * we need to flush journal device
5266 */
5267 do_flush = bi->bi_opf & REQ_PREFLUSH;
5268 }
5269
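The three outcomes of r5l_handle_flush_request() above select three paths: 0 means the journal already completed the flush, -ENODEV means there is no journal so the request falls back to md_flush_request(), and -EAGAIN means the bio continues down the stripe path with REQ_PREFLUSH still set so that a single stripe later flushes the journal (the do_flush flag). A small sketch of that dispatch, with a stand-in handler in place of the real log code:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum flush_path { FLUSH_DONE, FLUSH_GENERIC, FLUSH_VIA_STRIPE };

/* Stand-in for r5l_handle_flush_request(); the real one talks to the journal. */
static int handle_flush_request(bool have_journal, bool journal_can_complete)
{
	if (!have_journal)
		return -ENODEV;
	return journal_can_complete ? 0 : -EAGAIN;
}

static enum flush_path route_flush(bool have_journal, bool journal_can_complete,
				   bool *do_flush)
{
	int ret = handle_flush_request(have_journal, journal_can_complete);

	*do_flush = false;
	if (ret == 0)
		return FLUSH_DONE;          /* journal flushed, bio completed   */
	if (ret == -ENODEV)
		return FLUSH_GENERIC;       /* no journal: md_flush_request()   */
	/* -EAGAIN: keep REQ_PREFLUSH and let one stripe flush the journal */
	*do_flush = true;
	return FLUSH_VIA_STRIPE;
}

int main(void)
{
	bool do_flush;

	printf("no journal     -> path %d\n", route_flush(false, false, &do_flush));
	printf("journal, defer -> path %d, do_flush=%d\n",
	       route_flush(true, false, &do_flush), do_flush);
	return 0;
}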
5270 md_write_start(mddev, bi);
5271
5272 /*
5273 * If array is degraded, better not do chunk aligned read because
5274 * later we might have to read it again in order to reconstruct
5275 * data on failed drives.
5276 */
5277 if (rw == READ && mddev->degraded == 0 &&
5278 !r5c_is_writeback(conf->log) &&
5279 mddev->reshape_position == MaxSector) {
5280 bi = chunk_aligned_read(mddev, bi);
5281 if (!bi)
5282 return;
5283 }
5284
5285 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5286 make_discard_request(mddev, bi);

--- 112 unchanged lines hidden (view full) ---

5399 * and wait a while
5400 */
5401 md_wakeup_thread(mddev->thread);
5402 raid5_release_stripe(sh);
5403 schedule();
5404 do_prepare = true;
5405 goto retry;
5406 }
5407 if (do_flush) {
5408 set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5409 /* we only need flush for one stripe */
5410 do_flush = false;
5411 }
5412
5413 set_bit(STRIPE_HANDLE, &sh->state);
5414 clear_bit(STRIPE_DELAYED, &sh->state);
5415 if ((!sh->batch_head || sh == sh->batch_head) &&
5416 (bi->bi_opf & REQ_SYNC) &&
5417 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5418 atomic_inc(&conf->preread_active_stripes);
5419 release_stripe_plug(mddev, sh);
5420 } else {

--- 905 unchanged lines hidden (view full) ---

6326
6327static struct attribute *raid5_attrs[] = {
6328 &raid5_stripecache_size.attr,
6329 &raid5_stripecache_active.attr,
6330 &raid5_preread_bypass_threshold.attr,
6331 &raid5_group_thread_cnt.attr,
6332 &raid5_skip_copy.attr,
6333 &raid5_rmw_level.attr,
6334 &r5c_journal_mode.attr,
6335 NULL,
6336};
6337static struct attribute_group raid5_attrs_group = {
6338 .name = NULL,
6339 .attrs = raid5_attrs,
6340};
6341
6342static int alloc_thread_groups(struct r5conf *conf, int cnt,

--- 129 unchanged lines hidden (view full) ---

6472}
6473
6474static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6475{
6476 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6477 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6478
6479 if (alloc_scratch_buffer(conf, percpu)) {
6480 pr_warn("%s: failed memory allocation for cpu%u\n",
6481 __func__, cpu);
6385 pr_err("%s: failed memory allocation for cpu%u\n",
6386 __func__, cpu);
6482 return -ENOMEM;
6483 }
6484 return 0;
6485}
6486
6487static int raid5_alloc_percpu(struct r5conf *conf)
6488{
6489 int err = 0;

--- 53 unchanged lines hidden (view full) ---

6543 char pers_name[6];
6544 int i;
6545 int group_cnt, worker_cnt_per_group;
6546 struct r5worker_group *new_group;
6547
6548 if (mddev->new_level != 5
6549 && mddev->new_level != 4
6550 && mddev->new_level != 6) {
6551 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6552 mdname(mddev), mddev->new_level);
6456 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6457 mdname(mddev), mddev->new_level);
6553 return ERR_PTR(-EIO);
6554 }
6555 if ((mddev->new_level == 5
6556 && !algorithm_valid_raid5(mddev->new_layout)) ||
6557 (mddev->new_level == 6
6558 && !algorithm_valid_raid6(mddev->new_layout))) {
6559 pr_warn("md/raid:%s: layout %d not supported\n",
6560 mdname(mddev), mddev->new_layout);
6464 printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
6465 mdname(mddev), mddev->new_layout);
6561 return ERR_PTR(-EIO);
6562 }
6563 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6564 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6565 mdname(mddev), mddev->raid_disks);
6469 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6470 mdname(mddev), mddev->raid_disks);
6566 return ERR_PTR(-EINVAL);
6567 }
6568
6569 if (!mddev->new_chunk_sectors ||
6570 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6571 !is_power_of_2(mddev->new_chunk_sectors)) {
6572 pr_warn("md/raid:%s: invalid chunk size %d\n",
6573 mdname(mddev), mddev->new_chunk_sectors << 9);
6477 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
6478 mdname(mddev), mddev->new_chunk_sectors << 9);
6574 return ERR_PTR(-EINVAL);
6575 }
6576
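The chunk-size test above rejects a zero chunk, a chunk whose byte size is not a whole number of pages, and a chunk that is not a power of two. The same arithmetic as a stand-alone check (PAGE_SIZE assumed to be 4096 here; in the kernel it is architecture-defined):

#include <stdbool.h>
#include <stdio.h>

#define FAKE_PAGE_SIZE 4096u   /* assumption: 4 KiB pages */

static bool chunk_sectors_valid(unsigned int chunk_sectors)
{
	unsigned long bytes = (unsigned long)chunk_sectors << 9; /* 512-byte sectors */

	if (!chunk_sectors)
		return false;
	if (bytes % FAKE_PAGE_SIZE)          /* must be a whole number of pages */
		return false;
	/* power-of-two test, the user-space equivalent of is_power_of_2() */
	return (chunk_sectors & (chunk_sectors - 1)) == 0;
}

int main(void)
{
	printf("128 sectors (64KiB): %d\n", chunk_sectors_valid(128)); /* valid    */
	printf("96 sectors  (48KiB): %d\n", chunk_sectors_valid(96));  /* not 2^n  */
	printf("4 sectors   (2KiB):  %d\n", chunk_sectors_valid(4));   /* < a page */
	return 0;
}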
6577 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6578 if (conf == NULL)
6579 goto abort;
6580 /* Don't enable multi-threading by default*/
6581 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,

--- 48 unchanged lines hidden (view full) ---

6630 spin_lock_init(conf->hash_locks + i);
6631
6632 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6633 INIT_LIST_HEAD(conf->inactive_list + i);
6634
6635 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6636 INIT_LIST_HEAD(conf->temp_inactive_list + i);
6637
6638 atomic_set(&conf->r5c_cached_full_stripes, 0);
6639 INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
6640 atomic_set(&conf->r5c_cached_partial_stripes, 0);
6641 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
6642
6643 conf->level = mddev->new_level;
6644 conf->chunk_sectors = mddev->new_chunk_sectors;
6645 if (raid5_alloc_percpu(conf) != 0)
6646 goto abort;
6647
6648 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
6649
6650 rdev_for_each(rdev, mddev) {

--- 10 unchanged lines hidden (view full) ---

6661 } else {
6662 if (disk->rdev)
6663 goto abort;
6664 disk->rdev = rdev;
6665 }
6666
6667 if (test_bit(In_sync, &rdev->flags)) {
6668 char b[BDEVNAME_SIZE];
6669 pr_info("md/raid:%s: device %s operational as raid disk %d\n",
6670 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
6569 printk(KERN_INFO "md/raid:%s: device %s operational as raid"
6570 " disk %d\n",
6571 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
6671 } else if (rdev->saved_raid_disk != raid_disk)
6672 /* Cannot rely on bitmap to complete recovery */
6673 conf->fullsync = 1;
6674 }
6675
6676 conf->level = mddev->new_level;
6677 if (conf->level == 6) {
6678 conf->max_degraded = 2;

--- 17 unchanged lines hidden (view full) ---

6696
6697 conf->min_nr_stripes = NR_STRIPES;
6698 if (mddev->reshape_position != MaxSector) {
6699 int stripes = max_t(int,
6700 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
6701 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
6702 conf->min_nr_stripes = max(NR_STRIPES, stripes);
6703 if (conf->min_nr_stripes != NR_STRIPES)
6704 pr_info("md/raid:%s: force stripe size %d for reshape\n",
6605 printk(KERN_INFO
6606 "md/raid:%s: force stripe size %d for reshape\n",
6705 mdname(mddev), conf->min_nr_stripes);
6706 }
6707 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
6708 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
6709 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
6710 if (grow_stripes(conf, conf->min_nr_stripes)) {
6711 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
6712 mdname(mddev), memory);
6613 printk(KERN_ERR
6614 "md/raid:%s: couldn't allocate %dkB for buffers\n",
6615 mdname(mddev), memory);
6713 goto abort;
6714 } else
6715 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
6618 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
6619 mdname(mddev), memory);
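The "memory" value reported above is only an estimate of the stripe-cache footprint: each of min_nr_stripes stripe heads carries roughly one struct bio plus one page per device. A worked example of the same formula with the default 256 stripes and assumed sizes (the structure sizes and the 8-device width below are illustrative stand-ins, not real kernel values):

#include <stdio.h>

int main(void)
{
	/* Illustrative stand-in sizes; the real ones depend on kernel config. */
	unsigned long sizeof_stripe_head = 1024;  /* assumption              */
	unsigned long sizeof_bio         = 200;   /* assumption              */
	unsigned long page_size          = 4096;  /* assumption: 4 KiB pages */
	unsigned long min_nr_stripes     = 256;   /* NR_STRIPES default      */
	unsigned long max_disks          = 8;     /* assumed array width     */

	unsigned long memory_kib = min_nr_stripes *
		(sizeof_stripe_head + max_disks * (sizeof_bio + page_size)) / 1024;

	/* 256 * (1024 + 8 * 4296) / 1024 = about 8848 KiB for this setup */
	printf("estimated stripe cache: %lu kB\n", memory_kib);
	return 0;
}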
6716 /*
6717 * Losing a stripe head costs more than the time to refill it,
6718 * it reduces the queue depth and so can hurt throughput.
6719 * So set it rather large, scaled by number of devices.
6720 */
6721 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
6722 conf->shrinker.scan_objects = raid5_cache_scan;
6723 conf->shrinker.count_objects = raid5_cache_count;
6724 conf->shrinker.batch = 128;
6725 conf->shrinker.flags = 0;
6726 if (register_shrinker(&conf->shrinker)) {
6727 pr_warn("md/raid:%s: couldn't register shrinker.\n",
6728 mdname(mddev));
6631 printk(KERN_ERR
6632 "md/raid:%s: couldn't register shrinker.\n",
6633 mdname(mddev));
6729 goto abort;
6730 }
6731
6732 sprintf(pers_name, "raid%d", mddev->new_level);
6733 conf->thread = md_register_thread(raid5d, mddev, pers_name);
6734 if (!conf->thread) {
6735 pr_warn("md/raid:%s: couldn't allocate thread.\n",
6736 mdname(mddev));
6640 printk(KERN_ERR
6641 "md/raid:%s: couldn't allocate thread.\n",
6642 mdname(mddev));
6737 goto abort;
6738 }
6739
6740 return conf;
6741
6742 abort:
6743 if (conf) {
6744 free_conf(conf);

--- 36 unchanged lines hidden (view full) ---

6781 struct md_rdev *rdev;
6782 struct md_rdev *journal_dev = NULL;
6783 sector_t reshape_offset = 0;
6784 int i;
6785 long long min_offset_diff = 0;
6786 int first = 1;
6787
6788 if (mddev->recovery_cp != MaxSector)
6789 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
6790 mdname(mddev));
6695 printk(KERN_NOTICE "md/raid:%s: not clean"
6696 " -- starting background reconstruction\n",
6697 mdname(mddev));
6791
6792 rdev_for_each(rdev, mddev) {
6793 long long diff;
6794
6795 if (test_bit(Journal, &rdev->flags)) {
6796 journal_dev = rdev;
6797 continue;
6798 }

--- 26 unchanged lines hidden (view full) ---

6825 */
6826 sector_t here_new, here_old;
6827 int old_disks;
6828 int max_degraded = (mddev->level == 6 ? 2 : 1);
6829 int chunk_sectors;
6830 int new_data_disks;
6831
6832 if (journal_dev) {
6833 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
6834 mdname(mddev));
6740 printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
6741 mdname(mddev));
6835 return -EINVAL;
6836 }
6837
6838 if (mddev->new_level != mddev->level) {
6839 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
6840 mdname(mddev));
6746 printk(KERN_ERR "md/raid:%s: unsupported reshape "
6747 "required - aborting.\n",
6748 mdname(mddev));
6841 return -EINVAL;
6842 }
6843 old_disks = mddev->raid_disks - mddev->delta_disks;
6844 /* reshape_position must be on a new-stripe boundary, and one
6845 * further up in new geometry must map after here in old
6846 * geometry.
6847 * If the chunk sizes are different, then as we perform reshape
6848 * in units of the largest of the two, reshape_position needs to
6849 * be a multiple of the largest chunk size times new data disks.
6850 */
6851 here_new = mddev->reshape_position;
6852 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
6853 new_data_disks = mddev->raid_disks - max_degraded;
6854 if (sector_div(here_new, chunk_sectors * new_data_disks)) {
6855 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
6856 mdname(mddev));
6763 printk(KERN_ERR "md/raid:%s: reshape_position not "
6764 "on a stripe boundary\n", mdname(mddev));
6857 return -EINVAL;
6858 }
6859 reshape_offset = here_new * chunk_sectors;
6860 /* here_new is the stripe we will write to */
6861 here_old = mddev->reshape_position;
6862 sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
6863 /* here_old is the first stripe that we might need to read
6864 * from */
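Concretely, the test above divides reshape_position by (chunk_sectors * new data disks) and rejects any remainder; the quotient is here_new, the first new-geometry stripe to be written, and here_old is the analogous index in the old geometry. A worked example with assumed numbers (one disk added to a RAID-5, 512-sector chunks; the values are illustrative only):

#include <stdio.h>

int main(void)
{
	/* Assumed example geometry: RAID-5, 7 disks after reshape, 6 before. */
	unsigned long long reshape_position = 6144; /* sectors, example value */
	unsigned int chunk_sectors = 512;           /* 256 KiB chunks          */
	unsigned int max_degraded  = 1;             /* RAID-5                  */
	unsigned int raid_disks    = 7;             /* new disk count          */
	unsigned int old_disks     = 6;             /* before adding one disk  */

	unsigned int new_data_disks = raid_disks - max_degraded;            /* 6 */
	unsigned long long here_new = reshape_position /
				      (chunk_sectors * new_data_disks);     /* 6144/3072 = 2 */
	unsigned long long rem = reshape_position %
				 (chunk_sectors * new_data_disks);           /* must be 0     */
	unsigned long long here_old = reshape_position /
				      (chunk_sectors * (old_disks - max_degraded)); /* 6144/2560 */

	printf("here_new=%llu remainder=%llu here_old=%llu\n", here_new, rem, here_old);
	return 0;
}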

--- 4 unchanged lines hidden (view full) ---

6869 * mdadm always starts a situation like this in
6870 * readonly mode so it can take control before
6871 * allowing any writes. So just check for that.
6872 */
6873 if (abs(min_offset_diff) >= mddev->chunk_sectors &&
6874 abs(min_offset_diff) >= mddev->new_chunk_sectors)
6875 /* not really in-place - so OK */;
6876 else if (mddev->ro == 0) {
6877 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
6878 mdname(mddev));
6785 printk(KERN_ERR "md/raid:%s: in-place reshape "
6786 "must be started in read-only mode "
6787 "- aborting\n",
6788 mdname(mddev));
6879 return -EINVAL;
6880 }
6881 } else if (mddev->reshape_backwards
6882 ? (here_new * chunk_sectors + min_offset_diff <=
6883 here_old * chunk_sectors)
6884 : (here_new * chunk_sectors >=
6885 here_old * chunk_sectors + (-min_offset_diff))) {
6886 /* Reading from the same stripe as writing to - bad */
6887 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
6888 mdname(mddev));
6797 printk(KERN_ERR "md/raid:%s: reshape_position too early for "
6798 "auto-recovery - aborting.\n",
6799 mdname(mddev));
6889 return -EINVAL;
6890 }
6891 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
6802 printk(KERN_INFO "md/raid:%s: reshape will continue\n",
6803 mdname(mddev));
6892 /* OK, we should be able to continue; */
6893 } else {
6894 BUG_ON(mddev->level != mddev->new_level);
6895 BUG_ON(mddev->layout != mddev->new_layout);
6896 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
6897 BUG_ON(mddev->delta_disks != 0);
6898 }
6899
6900 if (mddev->private == NULL)
6901 conf = setup_conf(mddev);
6902 else
6903 conf = mddev->private;
6904
6905 if (IS_ERR(conf))
6906 return PTR_ERR(conf);
6907
6908 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
6909 if (!journal_dev) {
6910 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
6911 mdname(mddev));
6822 pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
6823 mdname(mddev));
6912 mddev->ro = 1;
6913 set_disk_ro(mddev->gendisk, 1);
6914 } else if (mddev->recovery_cp == MaxSector)
6915 set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
6916 }
6917
6918 conf->min_offset_diff = min_offset_diff;
6919 mddev->thread = conf->thread;

--- 10 unchanged lines hidden (view full) ---

6930 clear_bit(Replacement, &rdev->flags);
6931 conf->disks[i].rdev = rdev;
6932 }
6933 if (!rdev)
6934 continue;
6935 if (conf->disks[i].replacement &&
6936 conf->reshape_progress != MaxSector) {
6937 /* replacements and reshape simply do not mix. */
6938 pr_warn("md: cannot handle concurrent replacement and reshape.\n");
6850 printk(KERN_ERR "md: cannot handle concurrent "
6851 "replacement and reshape.\n");
6939 goto abort;
6940 }
6941 if (test_bit(In_sync, &rdev->flags)) {
6942 working_disks++;
6943 continue;
6944 }
6945 /* This disc is not fully in-sync. However if it
6946 * just stored parity (beyond the recovery_offset),

--- 25 unchanged lines hidden (view full) ---

6972 }
6973
6974 /*
6975 * 0 for a fully functional array, 1 or 2 for a degraded array.
6976 */
6977 mddev->degraded = calc_degraded(conf);
6978
6979 if (has_failed(conf)) {
6980 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
6893 printk(KERN_ERR "md/raid:%s: not enough operational devices"
6894 " (%d/%d failed)\n",
6981 mdname(mddev), mddev->degraded, conf->raid_disks);
6982 goto abort;
6983 }
6984
6985 /* device size must be a multiple of chunk size */
6986 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
6987 mddev->resync_max_sectors = mddev->dev_sectors;
6988
6989 if (mddev->degraded > dirty_parity_disks &&
6990 mddev->recovery_cp != MaxSector) {
6991 if (mddev->ok_start_degraded)
6992 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
6993 mdname(mddev));
6906 printk(KERN_WARNING
6907 "md/raid:%s: starting dirty degraded array"
6908 " - data corruption possible.\n",
6909 mdname(mddev));
6994 else {
6995 pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
6996 mdname(mddev));
6911 printk(KERN_ERR
6912 "md/raid:%s: cannot start dirty degraded array.\n",
6913 mdname(mddev));
6997 goto abort;
6998 }
6999 }
7000
7001 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7002 mdname(mddev), conf->level,
7003 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7004 mddev->new_layout);
6918 if (mddev->degraded == 0)
6919 printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
6920 " devices, algorithm %d\n", mdname(mddev), conf->level,
6921 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
6922 mddev->new_layout);
6923 else
6924 printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
6925 " out of %d devices, algorithm %d\n",
6926 mdname(mddev), conf->level,
6927 mddev->raid_disks - mddev->degraded,
6928 mddev->raid_disks, mddev->new_layout);
7005
7006 print_raid5_conf(conf);
7007
7008 if (conf->reshape_progress != MaxSector) {
7009 conf->reshape_safe = conf->reshape_progress;
7010 atomic_set(&conf->reshape_stripes, 0);
7011 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7012 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7013 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7014 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7015 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7016 "reshape");
7017 }
7018
7019 /* Ok, everything is just fine now */
7020 if (mddev->to_remove == &raid5_attrs_group)
7021 mddev->to_remove = NULL;
7022 else if (mddev->kobj.sd &&
7023 sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7024 pr_warn("raid5: failed to create sysfs attributes for %s\n",
7025 mdname(mddev));
6948 printk(KERN_WARNING
6949 "raid5: failed to create sysfs attributes for %s\n",
6950 mdname(mddev));
7026 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7027
7028 if (mddev->queue) {
7029 int chunk_size;
7030 bool discard_supported = true;
7031 /* read-ahead size must cover two whole stripes, which
7032 * is 2 * (datadisks) * chunksize, where 'datadisks' is the
7033 * number of data devices in the array

--- 71 unchanged lines hidden (view full) ---

7105 mddev->queue);
7106
7107 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7108 }
7109
7110 if (journal_dev) {
7111 char b[BDEVNAME_SIZE];
7112
7113 pr_debug("md/raid:%s: using device %s as journal\n",
7114 mdname(mddev), bdevname(journal_dev->bdev, b));
7115 if (r5l_init_log(conf, journal_dev))
7116 goto abort;
7038 printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
7039 mdname(mddev), bdevname(journal_dev->bdev, b));
7040 r5l_init_log(conf, journal_dev);
7117 }
7118
7119 return 0;
7120abort:
7121 md_unregister_thread(&mddev->thread);
7122 print_raid5_conf(conf);
7123 free_conf(conf);
7124 mddev->private = NULL;
7125 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7049 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
7126 return -EIO;
7127}
7128
7129static void raid5_free(struct mddev *mddev, void *priv)
7130{
7131 struct r5conf *conf = priv;
7132
7133 free_conf(conf);

--- 17 unchanged lines hidden (view full) ---

7151 seq_printf (seq, "]");
7152}
7153
7154static void print_raid5_conf (struct r5conf *conf)
7155{
7156 int i;
7157 struct disk_info *tmp;
7158
7159 pr_debug("RAID conf printout:\n");
7083 printk(KERN_DEBUG "RAID conf printout:\n");
7160 if (!conf) {
7161 pr_debug("(conf==NULL)\n");
7085 printk("(conf==NULL)\n");
7162 return;
7163 }
7164 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7088 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
7165 conf->raid_disks,
7166 conf->raid_disks - conf->mddev->degraded);
7167
7168 for (i = 0; i < conf->raid_disks; i++) {
7169 char b[BDEVNAME_SIZE];
7170 tmp = conf->disks + i;
7171 if (tmp->rdev)
7172 pr_debug(" disk %d, o:%d, dev:%s\n",
7096 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
7173 i, !test_bit(Faulty, &tmp->rdev->flags),
7174 bdevname(tmp->rdev->bdev, b));
7175 }
7176}
7177
7178static int raid5_spare_active(struct mddev *mddev)
7179{
7180 int i;

--- 131 unchanged lines hidden (view full) ---

7312 return -EBUSY;
7313
7314 rdev->raid_disk = 0;
7315 /*
7316 * The array is in readonly mode if journal is missing, so no
7317 * write requests running. We should be safe
7318 */
7319 r5l_init_log(conf, rdev);
7320 pr_debug("md/raid:%s: using device %s as journal\n",
7321 mdname(mddev), bdevname(rdev->bdev, b));
7244 printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
7245 mdname(mddev), bdevname(rdev->bdev, b));
7322 return 0;
7323 }
7324 if (mddev->recovery_disabled == conf->recovery_disabled)
7325 return -EBUSY;
7326
7327 if (rdev->saved_raid_disk < 0 && has_failed(conf))
7328 /* no point adding a device */
7329 return -EINVAL;

--- 87 unchanged lines hidden (view full) ---

7417 * If the chunk size is greater, user-space should request more
7418 * stripe_heads first.
7419 */
7420 struct r5conf *conf = mddev->private;
7421 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7422 > conf->min_nr_stripes ||
7423 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7424 > conf->min_nr_stripes) {
7425 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7426 mdname(mddev),
7427 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7428 / STRIPE_SIZE)*4);
7349 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
7350 mdname(mddev),
7351 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7352 / STRIPE_SIZE)*4);
7429 return 0;
7430 }
7431 return 1;
7432}
7433
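The test above demands at least four stripe_heads per chunk of the larger of the old and new chunk sizes, i.e. needed = (chunk_bytes / STRIPE_SIZE) * 4. With page-sized stripes (assumed 4 KiB here) and an assumed 512-sector chunk, the arithmetic works out as follows:

#include <stdio.h>

int main(void)
{
	unsigned long stripe_size = 4096;  /* STRIPE_SIZE == PAGE_SIZE, assumed 4 KiB */
	unsigned int chunk_sectors = 512;  /* assumed current chunk: 256 KiB          */
	unsigned int new_chunk_sectors = 512;
	unsigned int min_nr_stripes = 256; /* NR_STRIPES default                      */

	unsigned int larger = chunk_sectors > new_chunk_sectors ?
			      chunk_sectors : new_chunk_sectors;
	unsigned long needed = ((unsigned long)larger << 9) / stripe_size * 4;

	/* 256 KiB / 4 KiB = 64 stripes per chunk, times 4 = 256 needed */
	printf("needed=%lu, cache ok=%d\n", needed, needed <= min_nr_stripes);
	return 0;
}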
7434static int check_reshape(struct mddev *mddev)
7435{
7436 struct r5conf *conf = mddev->private;

--- 64 unchanged lines hidden (view full) ---

7501 return -EINVAL;
7502
7503 /* Refuse to reduce size of the array. Any reductions in
7504 * array size must be through explicit setting of array_size
7505 * attribute.
7506 */
7507 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7508 < mddev->array_sectors) {
7509 pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7510 mdname(mddev));
7433 printk(KERN_ERR "md/raid:%s: array size must be reduced "
7434 "before number of disks\n", mdname(mddev));
7511 return -EINVAL;
7512 }
7513
7514 atomic_set(&conf->reshape_stripes, 0);
7515 spin_lock_irq(&conf->device_lock);
7516 write_seqcount_begin(&conf->gen_lock);
7517 conf->previous_raid_disks = conf->raid_disks;
7518 conf->raid_disks += mddev->delta_disks;

--- 171 unchanged lines hidden (view full) ---

7690 wake_up(&conf->wait_for_overlap);
7691 break;
7692
7693 case 1: /* stop all writes */
7694 lock_all_device_hash_locks_irq(conf);
7695 /* '2' tells resync/reshape to pause so that all
7696 * active stripes can drain
7697 */
7698 r5c_flush_cache(conf, INT_MAX);
7699 conf->quiesce = 2;
7700 wait_event_cmd(conf->wait_for_quiescent,
7701 atomic_read(&conf->active_stripes) == 0 &&
7702 atomic_read(&conf->active_aligned_reads) == 0,
7703 unlock_all_device_hash_locks_irq(conf),
7704 lock_all_device_hash_locks_irq(conf));
7705 conf->quiesce = 1;
7706 unlock_all_device_hash_locks_irq(conf);

--- 14 unchanged lines hidden (view full) ---

7721
7722static void *raid45_takeover_raid0(struct mddev *mddev, int level)
7723{
7724 struct r0conf *raid0_conf = mddev->private;
7725 sector_t sectors;
7726
7727 /* for raid0 takeover only one zone is supported */
7728 if (raid0_conf->nr_strip_zones > 1) {
7729 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
7730 mdname(mddev));
7652 printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
7653 mdname(mddev));
7731 return ERR_PTR(-EINVAL);
7732 }
7733
7734 sectors = raid0_conf->strip_zone[0].zone_end;
7735 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
7736 mddev->dev_sectors = sectors;
7737 mddev->new_level = level;
7738 mddev->new_layout = ALGORITHM_PARITY_N;
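For the single-zone takeover above, the per-device size is simply the zone end divided by the number of devices in the zone, which is what the sector_div() call computes. A worked example with assumed numbers (a 1 GiB zone striped over 4 devices):

#include <stdio.h>

int main(void)
{
	/* Assumed single-zone raid0 geometry. */
	unsigned long long zone_end = 2097152; /* sectors across the whole zone (1 GiB) */
	unsigned int nb_dev = 4;               /* devices in the zone                   */

	unsigned long long dev_sectors = zone_end / nb_dev; /* what sector_div() yields */

	/* 2097152 / 4 = 524288 sectors, i.e. 256 MiB per member device */
	printf("dev_sectors=%llu\n", dev_sectors);
	return 0;
}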

--- 336 unchanged lines hidden ---