/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 *    miss any bits.
 */
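/*
 * Illustrative sketch (not part of the driver): the batching rule described
 * above reduces to comparing a stripe's recorded batch number with the last
 * batch known to have reached the bitmap.  The field names below are the
 * ones used later in this file (conf->seq_write, sh->bm_seq); the helper
 * itself is hypothetical and shown only to make the sequencing concrete.
 *
 *	static bool stripe_bitmap_batch_pending(struct r5conf *conf,
 *						struct stripe_head *sh)
 *	{
 *		// bm_seq was set to seq_flush+1 when the write was queued, so
 *		// the batch is still unwritten while bm_seq is ahead of
 *		// seq_write; such a stripe is parked on conf->bitmap_list
 *		// (see do_release_stripe() below, which uses this very test).
 *		return sh->bm_seq - conf->seq_write > 0;
 *	}
 */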

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)
#define MAX_STRIPE_BATCH	8

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	local_irq_disable();
	spin_lock(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
		spin_unlock(conf->hash_locks + i - 1);
	local_irq_enable();
}
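/*
 * Worked example for the hashing above (a sketch, assuming 4K pages and
 * 64-bit pointers, so STRIPE_SHIFT == 3, NR_HASH == 512 and HASH_MASK ==
 * 0x1ff): a stripe starting at sector 0x12345 hashes to bucket
 * (0x12345 >> 3) & 0x1ff = 0x2468 & 0x1ff = 0x068, while its hash-lock
 * index is (0x12345 >> 3) & STRIPE_HASH_LOCKS_MASK.  Note the lock
 * ordering the helpers establish: a hash lock is always taken before
 * conf->device_lock, never the other way around.
 */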

/* Bios attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bios per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device
 */
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
	int sectors = bio_sectors(bio);
	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
		return bio->bi_next;
	else
		return NULL;
}

/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
						  unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	atomic_set(segments, cnt);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
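/*
 * Example of the slot mapping above (illustrative only): on a 6-device
 * md-layout RAID-6 with pd_idx == 4 and qd_idx == 5, syndrome_disks is 4
 * and raid6_d0() returns 0 because Q sits on the last device.  Walking
 * devices 0,1,2,3,4,5 from d0 through raid6_idx_to_slot() yields slots
 * 0,1,2,3 for the data devices, slot 4 (== syndrome_disks) for P and
 * slot 5 (== syndrome_disks + 1) for Q, which is the ordering that the
 * syndrome-generation source arrays built later in this file rely on.
 */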

static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_iter.bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);
	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				list_add_tail(&sh->lru, &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state))
			list_add_tail(&sh->lru, temp_inactive_list);
	}
}
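/*
 * Sizing example for raid5_wakeup_stripe_thread() above (a sketch, not a
 * tuning recommendation): with MAX_STRIPE_BATCH == 8 and 40 stripes queued
 * on a group, thread_cnt = 40 / 8 - 1 = 4, so besides workers[0] up to four
 * more idle workers are queued, capped by conf->worker_cnt_per_group.  Each
 * worker is expected to pull at most MAX_STRIPE_BATCH stripes per pass,
 * which keeps the number of woken workers roughly proportional to the
 * backlog on that group.
 */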
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * @hash could be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list
 * holds one inactive list per hash lock.
 *
 * Be careful: only one task can add/delete stripes from temp_inactive_list
 * at a given time. Adding stripes only takes the device lock, while deleting
 * stripes only takes the hash lock.
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	unsigned long do_wakeup = 0;
	int i = 0;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, get_active_stripe() might
		 * remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup |= 1 << hash;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
		if (do_wakeup & (1 << i))
			wake_up(&conf->wait_for_stripe[i]);
	}

	if (do_wakeup) {
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	while (head) {
		int hash;

		sh = llist_entry(head, struct stripe_head, release_list);
		head = llist_next(head);
		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry that the bit may be set here, because if it is
		 * set again, the count is always > 1. This is true for the
		 * STRIPE_ON_UNPLUG_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

static void release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference.
429 */ 430 if (atomic_add_unless(&sh->count, -1, 1)) 431 return; 432 433 if (unlikely(!conf->mddev->thread) || 434 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 435 goto slow_path; 436 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 437 if (wakeup) 438 md_wakeup_thread(conf->mddev->thread); 439 return; 440 slow_path: 441 local_irq_save(flags); 442 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 443 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 444 INIT_LIST_HEAD(&list); 445 hash = sh->hash_lock_index; 446 do_release_stripe(conf, sh, &list); 447 spin_unlock(&conf->device_lock); 448 release_inactive_stripe_list(conf, &list, hash); 449 } 450 local_irq_restore(flags); 451 } 452 453 static inline void remove_hash(struct stripe_head *sh) 454 { 455 pr_debug("remove_hash(), stripe %llu\n", 456 (unsigned long long)sh->sector); 457 458 hlist_del_init(&sh->hash); 459 } 460 461 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 462 { 463 struct hlist_head *hp = stripe_hash(conf, sh->sector); 464 465 pr_debug("insert_hash(), stripe %llu\n", 466 (unsigned long long)sh->sector); 467 468 hlist_add_head(&sh->hash, hp); 469 } 470 471 /* find an idle stripe, make sure it is unhashed, and return it. */ 472 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 473 { 474 struct stripe_head *sh = NULL; 475 struct list_head *first; 476 477 if (list_empty(conf->inactive_list + hash)) 478 goto out; 479 first = (conf->inactive_list + hash)->next; 480 sh = list_entry(first, struct stripe_head, lru); 481 list_del_init(first); 482 remove_hash(sh); 483 atomic_inc(&conf->active_stripes); 484 BUG_ON(hash != sh->hash_lock_index); 485 if (list_empty(conf->inactive_list + hash)) 486 atomic_inc(&conf->empty_inactive_list_nr); 487 out: 488 return sh; 489 } 490 491 static void shrink_buffers(struct stripe_head *sh) 492 { 493 struct page *p; 494 int i; 495 int num = sh->raid_conf->pool_size; 496 497 for (i = 0; i < num ; i++) { 498 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 499 p = sh->dev[i].page; 500 if (!p) 501 continue; 502 sh->dev[i].page = NULL; 503 put_page(p); 504 } 505 } 506 507 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 508 { 509 int i; 510 int num = sh->raid_conf->pool_size; 511 512 for (i = 0; i < num; i++) { 513 struct page *page; 514 515 if (!(page = alloc_page(gfp))) { 516 return 1; 517 } 518 sh->dev[i].page = page; 519 sh->dev[i].orig_page = page; 520 } 521 return 0; 522 } 523 524 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 525 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 526 struct stripe_head *sh); 527 528 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 529 { 530 struct r5conf *conf = sh->raid_conf; 531 int i, seq; 532 533 BUG_ON(atomic_read(&sh->count) != 0); 534 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 535 BUG_ON(stripe_operations_active(sh)); 536 BUG_ON(sh->batch_head); 537 538 pr_debug("init_stripe called, stripe %llu\n", 539 (unsigned long long)sector); 540 retry: 541 seq = read_seqcount_begin(&conf->gen_lock); 542 sh->generation = conf->generation - previous; 543 sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 544 sh->sector = sector; 545 stripe_set_idx(sector, conf, previous, sh); 546 sh->state = 0; 547 548 for (i = sh->disks; i--; ) { 549 struct r5dev *dev = &sh->dev[i]; 550 551 if (dev->toread || dev->read || dev->towrite || dev->written || 552 test_bit(R5_LOCKED, &dev->flags)) { 553 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 554 (unsigned long long)sh->sector, i, dev->toread, 555 dev->read, dev->towrite, dev->written, 556 test_bit(R5_LOCKED, &dev->flags)); 557 WARN_ON(1); 558 } 559 dev->flags = 0; 560 raid5_build_block(sh, i, previous); 561 } 562 if (read_seqcount_retry(&conf->gen_lock, seq)) 563 goto retry; 564 sh->overwrite_disks = 0; 565 insert_hash(conf, sh); 566 sh->cpu = smp_processor_id(); 567 set_bit(STRIPE_BATCH_READY, &sh->state); 568 } 569 570 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 571 short generation) 572 { 573 struct stripe_head *sh; 574 575 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 576 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 577 if (sh->sector == sector && sh->generation == generation) 578 return sh; 579 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 580 return NULL; 581 } 582 583 /* 584 * Need to check if array has failed when deciding whether to: 585 * - start an array 586 * - remove non-faulty devices 587 * - add a spare 588 * - allow a reshape 589 * This determination is simple when no reshape is happening. 590 * However if there is a reshape, we need to carefully check 591 * both the before and after sections. 592 * This is because some failed devices may only affect one 593 * of the two sections, and some non-in_sync devices may 594 * be insync in the section most affected by failed devices. 595 */ 596 static int calc_degraded(struct r5conf *conf) 597 { 598 int degraded, degraded2; 599 int i; 600 601 rcu_read_lock(); 602 degraded = 0; 603 for (i = 0; i < conf->previous_raid_disks; i++) { 604 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 605 if (rdev && test_bit(Faulty, &rdev->flags)) 606 rdev = rcu_dereference(conf->disks[i].replacement); 607 if (!rdev || test_bit(Faulty, &rdev->flags)) 608 degraded++; 609 else if (test_bit(In_sync, &rdev->flags)) 610 ; 611 else 612 /* not in-sync or faulty. 613 * If the reshape increases the number of devices, 614 * this is being recovered by the reshape, so 615 * this 'previous' section is not in_sync. 616 * If the number of devices is being reduced however, 617 * the device can only be part of the array if 618 * we are reverting a reshape, so this section will 619 * be in-sync. 620 */ 621 if (conf->raid_disks >= conf->previous_raid_disks) 622 degraded++; 623 } 624 rcu_read_unlock(); 625 if (conf->raid_disks == conf->previous_raid_disks) 626 return degraded; 627 rcu_read_lock(); 628 degraded2 = 0; 629 for (i = 0; i < conf->raid_disks; i++) { 630 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 631 if (rdev && test_bit(Faulty, &rdev->flags)) 632 rdev = rcu_dereference(conf->disks[i].replacement); 633 if (!rdev || test_bit(Faulty, &rdev->flags)) 634 degraded2++; 635 else if (test_bit(In_sync, &rdev->flags)) 636 ; 637 else 638 /* not in-sync or faulty. 639 * If reshape increases the number of devices, this 640 * section has already been recovered, else it 641 * almost certainly hasn't. 
642 */ 643 if (conf->raid_disks <= conf->previous_raid_disks) 644 degraded2++; 645 } 646 rcu_read_unlock(); 647 if (degraded2 > degraded) 648 return degraded2; 649 return degraded; 650 } 651 652 static int has_failed(struct r5conf *conf) 653 { 654 int degraded; 655 656 if (conf->mddev->reshape_position == MaxSector) 657 return conf->mddev->degraded > conf->max_degraded; 658 659 degraded = calc_degraded(conf); 660 if (degraded > conf->max_degraded) 661 return 1; 662 return 0; 663 } 664 665 static struct stripe_head * 666 get_active_stripe(struct r5conf *conf, sector_t sector, 667 int previous, int noblock, int noquiesce) 668 { 669 struct stripe_head *sh; 670 int hash = stripe_hash_locks_hash(sector); 671 672 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 673 674 spin_lock_irq(conf->hash_locks + hash); 675 676 do { 677 wait_event_lock_irq(conf->wait_for_quiescent, 678 conf->quiesce == 0 || noquiesce, 679 *(conf->hash_locks + hash)); 680 sh = __find_stripe(conf, sector, conf->generation - previous); 681 if (!sh) { 682 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 683 sh = get_free_stripe(conf, hash); 684 if (!sh && !test_bit(R5_DID_ALLOC, 685 &conf->cache_state)) 686 set_bit(R5_ALLOC_MORE, 687 &conf->cache_state); 688 } 689 if (noblock && sh == NULL) 690 break; 691 if (!sh) { 692 set_bit(R5_INACTIVE_BLOCKED, 693 &conf->cache_state); 694 wait_event_exclusive_cmd( 695 conf->wait_for_stripe[hash], 696 !list_empty(conf->inactive_list + hash) && 697 (atomic_read(&conf->active_stripes) 698 < (conf->max_nr_stripes * 3 / 4) 699 || !test_bit(R5_INACTIVE_BLOCKED, 700 &conf->cache_state)), 701 spin_unlock_irq(conf->hash_locks + hash), 702 spin_lock_irq(conf->hash_locks + hash)); 703 clear_bit(R5_INACTIVE_BLOCKED, 704 &conf->cache_state); 705 } else { 706 init_stripe(sh, sector, previous); 707 atomic_inc(&sh->count); 708 } 709 } else if (!atomic_inc_not_zero(&sh->count)) { 710 spin_lock(&conf->device_lock); 711 if (!atomic_read(&sh->count)) { 712 if (!test_bit(STRIPE_HANDLE, &sh->state)) 713 atomic_inc(&conf->active_stripes); 714 BUG_ON(list_empty(&sh->lru) && 715 !test_bit(STRIPE_EXPANDING, &sh->state)); 716 list_del_init(&sh->lru); 717 if (sh->group) { 718 sh->group->stripes_cnt--; 719 sh->group = NULL; 720 } 721 } 722 atomic_inc(&sh->count); 723 spin_unlock(&conf->device_lock); 724 } 725 } while (sh == NULL); 726 727 if (!list_empty(conf->inactive_list + hash)) 728 wake_up(&conf->wait_for_stripe[hash]); 729 730 spin_unlock_irq(conf->hash_locks + hash); 731 return sh; 732 } 733 734 static bool is_full_stripe_write(struct stripe_head *sh) 735 { 736 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 737 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 738 } 739 740 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 741 { 742 local_irq_disable(); 743 if (sh1 > sh2) { 744 spin_lock(&sh2->stripe_lock); 745 spin_lock_nested(&sh1->stripe_lock, 1); 746 } else { 747 spin_lock(&sh1->stripe_lock); 748 spin_lock_nested(&sh2->stripe_lock, 1); 749 } 750 } 751 752 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 753 { 754 spin_unlock(&sh1->stripe_lock); 755 spin_unlock(&sh2->stripe_lock); 756 local_irq_enable(); 757 } 758 759 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 760 static bool stripe_can_batch(struct stripe_head *sh) 761 { 762 return test_bit(STRIPE_BATCH_READY, &sh->state) && 763 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 764 
is_full_stripe_write(sh); 765 } 766 767 /* we only do back search */ 768 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 769 { 770 struct stripe_head *head; 771 sector_t head_sector, tmp_sec; 772 int hash; 773 int dd_idx; 774 775 if (!stripe_can_batch(sh)) 776 return; 777 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 778 tmp_sec = sh->sector; 779 if (!sector_div(tmp_sec, conf->chunk_sectors)) 780 return; 781 head_sector = sh->sector - STRIPE_SECTORS; 782 783 hash = stripe_hash_locks_hash(head_sector); 784 spin_lock_irq(conf->hash_locks + hash); 785 head = __find_stripe(conf, head_sector, conf->generation); 786 if (head && !atomic_inc_not_zero(&head->count)) { 787 spin_lock(&conf->device_lock); 788 if (!atomic_read(&head->count)) { 789 if (!test_bit(STRIPE_HANDLE, &head->state)) 790 atomic_inc(&conf->active_stripes); 791 BUG_ON(list_empty(&head->lru) && 792 !test_bit(STRIPE_EXPANDING, &head->state)); 793 list_del_init(&head->lru); 794 if (head->group) { 795 head->group->stripes_cnt--; 796 head->group = NULL; 797 } 798 } 799 atomic_inc(&head->count); 800 spin_unlock(&conf->device_lock); 801 } 802 spin_unlock_irq(conf->hash_locks + hash); 803 804 if (!head) 805 return; 806 if (!stripe_can_batch(head)) 807 goto out; 808 809 lock_two_stripes(head, sh); 810 /* clear_batch_ready clear the flag */ 811 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 812 goto unlock_out; 813 814 if (sh->batch_head) 815 goto unlock_out; 816 817 dd_idx = 0; 818 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 819 dd_idx++; 820 if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw) 821 goto unlock_out; 822 823 if (head->batch_head) { 824 spin_lock(&head->batch_head->batch_lock); 825 /* This batch list is already running */ 826 if (!stripe_can_batch(head)) { 827 spin_unlock(&head->batch_head->batch_lock); 828 goto unlock_out; 829 } 830 831 /* 832 * at this point, head's BATCH_READY could be cleared, but we 833 * can still add the stripe to batch list 834 */ 835 list_add(&sh->batch_list, &head->batch_list); 836 spin_unlock(&head->batch_head->batch_lock); 837 838 sh->batch_head = head->batch_head; 839 } else { 840 head->batch_head = head; 841 sh->batch_head = head->batch_head; 842 spin_lock(&head->batch_lock); 843 list_add_tail(&sh->batch_list, &head->batch_list); 844 spin_unlock(&head->batch_lock); 845 } 846 847 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 848 if (atomic_dec_return(&conf->preread_active_stripes) 849 < IO_THRESHOLD) 850 md_wakeup_thread(conf->mddev->thread); 851 852 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 853 int seq = sh->bm_seq; 854 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 855 sh->batch_head->bm_seq > seq) 856 seq = sh->batch_head->bm_seq; 857 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 858 sh->batch_head->bm_seq = seq; 859 } 860 861 atomic_inc(&sh->count); 862 unlock_out: 863 unlock_two_stripes(head, sh); 864 out: 865 release_stripe(head); 866 } 867 868 /* Determine if 'data_offset' or 'new_data_offset' should be used 869 * in this stripe_head. 870 */ 871 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 872 { 873 sector_t progress = conf->reshape_progress; 874 /* Need a memory barrier to make sure we see the value 875 * of conf->generation, or ->data_offset that was set before 876 * reshape_progress was updated. 
877 */ 878 smp_rmb(); 879 if (progress == MaxSector) 880 return 0; 881 if (sh->generation == conf->generation - 1) 882 return 0; 883 /* We are in a reshape, and this is a new-generation stripe, 884 * so use new_data_offset. 885 */ 886 return 1; 887 } 888 889 static void 890 raid5_end_read_request(struct bio *bi, int error); 891 static void 892 raid5_end_write_request(struct bio *bi, int error); 893 894 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 895 { 896 struct r5conf *conf = sh->raid_conf; 897 int i, disks = sh->disks; 898 struct stripe_head *head_sh = sh; 899 900 might_sleep(); 901 902 for (i = disks; i--; ) { 903 int rw; 904 int replace_only = 0; 905 struct bio *bi, *rbi; 906 struct md_rdev *rdev, *rrdev = NULL; 907 908 sh = head_sh; 909 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 910 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 911 rw = WRITE_FUA; 912 else 913 rw = WRITE; 914 if (test_bit(R5_Discard, &sh->dev[i].flags)) 915 rw |= REQ_DISCARD; 916 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 917 rw = READ; 918 else if (test_and_clear_bit(R5_WantReplace, 919 &sh->dev[i].flags)) { 920 rw = WRITE; 921 replace_only = 1; 922 } else 923 continue; 924 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 925 rw |= REQ_SYNC; 926 927 again: 928 bi = &sh->dev[i].req; 929 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 930 931 rcu_read_lock(); 932 rrdev = rcu_dereference(conf->disks[i].replacement); 933 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 934 rdev = rcu_dereference(conf->disks[i].rdev); 935 if (!rdev) { 936 rdev = rrdev; 937 rrdev = NULL; 938 } 939 if (rw & WRITE) { 940 if (replace_only) 941 rdev = NULL; 942 if (rdev == rrdev) 943 /* We raced and saw duplicates */ 944 rrdev = NULL; 945 } else { 946 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 947 rdev = rrdev; 948 rrdev = NULL; 949 } 950 951 if (rdev && test_bit(Faulty, &rdev->flags)) 952 rdev = NULL; 953 if (rdev) 954 atomic_inc(&rdev->nr_pending); 955 if (rrdev && test_bit(Faulty, &rrdev->flags)) 956 rrdev = NULL; 957 if (rrdev) 958 atomic_inc(&rrdev->nr_pending); 959 rcu_read_unlock(); 960 961 /* We have already checked bad blocks for reads. Now 962 * need to check for writes. We never accept write errors 963 * on the replacement, so we don't to check rrdev. 964 */ 965 while ((rw & WRITE) && rdev && 966 test_bit(WriteErrorSeen, &rdev->flags)) { 967 sector_t first_bad; 968 int bad_sectors; 969 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 970 &first_bad, &bad_sectors); 971 if (!bad) 972 break; 973 974 if (bad < 0) { 975 set_bit(BlockedBadBlocks, &rdev->flags); 976 if (!conf->mddev->external && 977 conf->mddev->flags) { 978 /* It is very unlikely, but we might 979 * still need to write out the 980 * bad block log - better give it 981 * a chance*/ 982 md_check_recovery(conf->mddev); 983 } 984 /* 985 * Because md_wait_for_blocked_rdev 986 * will dec nr_pending, we must 987 * increment it first. 988 */ 989 atomic_inc(&rdev->nr_pending); 990 md_wait_for_blocked_rdev(rdev, conf->mddev); 991 } else { 992 /* Acknowledged bad block - skip the write */ 993 rdev_dec_pending(rdev, conf->mddev); 994 rdev = NULL; 995 } 996 } 997 998 if (rdev) { 999 if (s->syncing || s->expanding || s->expanded 1000 || s->replacing) 1001 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1002 1003 set_bit(STRIPE_IO_STARTED, &sh->state); 1004 1005 bio_reset(bi); 1006 bi->bi_bdev = rdev->bdev; 1007 bi->bi_rw = rw; 1008 bi->bi_end_io = (rw & WRITE) 1009 ? 
raid5_end_write_request 1010 : raid5_end_read_request; 1011 bi->bi_private = sh; 1012 1013 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 1014 __func__, (unsigned long long)sh->sector, 1015 bi->bi_rw, i); 1016 atomic_inc(&sh->count); 1017 if (sh != head_sh) 1018 atomic_inc(&head_sh->count); 1019 if (use_new_offset(conf, sh)) 1020 bi->bi_iter.bi_sector = (sh->sector 1021 + rdev->new_data_offset); 1022 else 1023 bi->bi_iter.bi_sector = (sh->sector 1024 + rdev->data_offset); 1025 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1026 bi->bi_rw |= REQ_NOMERGE; 1027 1028 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1029 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1030 sh->dev[i].vec.bv_page = sh->dev[i].page; 1031 bi->bi_vcnt = 1; 1032 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1033 bi->bi_io_vec[0].bv_offset = 0; 1034 bi->bi_iter.bi_size = STRIPE_SIZE; 1035 /* 1036 * If this is discard request, set bi_vcnt 0. We don't 1037 * want to confuse SCSI because SCSI will replace payload 1038 */ 1039 if (rw & REQ_DISCARD) 1040 bi->bi_vcnt = 0; 1041 if (rrdev) 1042 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1043 1044 if (conf->mddev->gendisk) 1045 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 1046 bi, disk_devt(conf->mddev->gendisk), 1047 sh->dev[i].sector); 1048 generic_make_request(bi); 1049 } 1050 if (rrdev) { 1051 if (s->syncing || s->expanding || s->expanded 1052 || s->replacing) 1053 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1054 1055 set_bit(STRIPE_IO_STARTED, &sh->state); 1056 1057 bio_reset(rbi); 1058 rbi->bi_bdev = rrdev->bdev; 1059 rbi->bi_rw = rw; 1060 BUG_ON(!(rw & WRITE)); 1061 rbi->bi_end_io = raid5_end_write_request; 1062 rbi->bi_private = sh; 1063 1064 pr_debug("%s: for %llu schedule op %ld on " 1065 "replacement disc %d\n", 1066 __func__, (unsigned long long)sh->sector, 1067 rbi->bi_rw, i); 1068 atomic_inc(&sh->count); 1069 if (sh != head_sh) 1070 atomic_inc(&head_sh->count); 1071 if (use_new_offset(conf, sh)) 1072 rbi->bi_iter.bi_sector = (sh->sector 1073 + rrdev->new_data_offset); 1074 else 1075 rbi->bi_iter.bi_sector = (sh->sector 1076 + rrdev->data_offset); 1077 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1078 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1079 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1080 rbi->bi_vcnt = 1; 1081 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1082 rbi->bi_io_vec[0].bv_offset = 0; 1083 rbi->bi_iter.bi_size = STRIPE_SIZE; 1084 /* 1085 * If this is discard request, set bi_vcnt 0. 
We don't 1086 * want to confuse SCSI because SCSI will replace payload 1087 */ 1088 if (rw & REQ_DISCARD) 1089 rbi->bi_vcnt = 0; 1090 if (conf->mddev->gendisk) 1091 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 1092 rbi, disk_devt(conf->mddev->gendisk), 1093 sh->dev[i].sector); 1094 generic_make_request(rbi); 1095 } 1096 if (!rdev && !rrdev) { 1097 if (rw & WRITE) 1098 set_bit(STRIPE_DEGRADED, &sh->state); 1099 pr_debug("skip op %ld on disc %d for sector %llu\n", 1100 bi->bi_rw, i, (unsigned long long)sh->sector); 1101 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1102 set_bit(STRIPE_HANDLE, &sh->state); 1103 } 1104 1105 if (!head_sh->batch_head) 1106 continue; 1107 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1108 batch_list); 1109 if (sh != head_sh) 1110 goto again; 1111 } 1112 } 1113 1114 static struct dma_async_tx_descriptor * 1115 async_copy_data(int frombio, struct bio *bio, struct page **page, 1116 sector_t sector, struct dma_async_tx_descriptor *tx, 1117 struct stripe_head *sh) 1118 { 1119 struct bio_vec bvl; 1120 struct bvec_iter iter; 1121 struct page *bio_page; 1122 int page_offset; 1123 struct async_submit_ctl submit; 1124 enum async_tx_flags flags = 0; 1125 1126 if (bio->bi_iter.bi_sector >= sector) 1127 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1128 else 1129 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1130 1131 if (frombio) 1132 flags |= ASYNC_TX_FENCE; 1133 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1134 1135 bio_for_each_segment(bvl, bio, iter) { 1136 int len = bvl.bv_len; 1137 int clen; 1138 int b_offset = 0; 1139 1140 if (page_offset < 0) { 1141 b_offset = -page_offset; 1142 page_offset += b_offset; 1143 len -= b_offset; 1144 } 1145 1146 if (len > 0 && page_offset + len > STRIPE_SIZE) 1147 clen = STRIPE_SIZE - page_offset; 1148 else 1149 clen = len; 1150 1151 if (clen > 0) { 1152 b_offset += bvl.bv_offset; 1153 bio_page = bvl.bv_page; 1154 if (frombio) { 1155 if (sh->raid_conf->skip_copy && 1156 b_offset == 0 && page_offset == 0 && 1157 clen == STRIPE_SIZE) 1158 *page = bio_page; 1159 else 1160 tx = async_memcpy(*page, bio_page, page_offset, 1161 b_offset, clen, &submit); 1162 } else 1163 tx = async_memcpy(bio_page, *page, b_offset, 1164 page_offset, clen, &submit); 1165 } 1166 /* chain the operations */ 1167 submit.depend_tx = tx; 1168 1169 if (clen < len) /* hit end of page */ 1170 break; 1171 page_offset += len; 1172 } 1173 1174 return tx; 1175 } 1176 1177 static void ops_complete_biofill(void *stripe_head_ref) 1178 { 1179 struct stripe_head *sh = stripe_head_ref; 1180 struct bio *return_bi = NULL; 1181 int i; 1182 1183 pr_debug("%s: stripe %llu\n", __func__, 1184 (unsigned long long)sh->sector); 1185 1186 /* clear completed biofills */ 1187 for (i = sh->disks; i--; ) { 1188 struct r5dev *dev = &sh->dev[i]; 1189 1190 /* acknowledge completion of a biofill operation */ 1191 /* and check if we need to reply to a read request, 1192 * new R5_Wantfill requests are held off until 1193 * !STRIPE_BIOFILL_RUN 1194 */ 1195 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1196 struct bio *rbi, *rbi2; 1197 1198 BUG_ON(!dev->read); 1199 rbi = dev->read; 1200 dev->read = NULL; 1201 while (rbi && rbi->bi_iter.bi_sector < 1202 dev->sector + STRIPE_SECTORS) { 1203 rbi2 = r5_next_bio(rbi, dev->sector); 1204 if (!raid5_dec_bi_active_stripes(rbi)) { 1205 rbi->bi_next = return_bi; 1206 return_bi = rbi; 1207 } 1208 rbi = rbi2; 1209 } 1210 } 1211 } 1212 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1213 1214 
return_io(return_bi); 1215 1216 set_bit(STRIPE_HANDLE, &sh->state); 1217 release_stripe(sh); 1218 } 1219 1220 static void ops_run_biofill(struct stripe_head *sh) 1221 { 1222 struct dma_async_tx_descriptor *tx = NULL; 1223 struct async_submit_ctl submit; 1224 int i; 1225 1226 BUG_ON(sh->batch_head); 1227 pr_debug("%s: stripe %llu\n", __func__, 1228 (unsigned long long)sh->sector); 1229 1230 for (i = sh->disks; i--; ) { 1231 struct r5dev *dev = &sh->dev[i]; 1232 if (test_bit(R5_Wantfill, &dev->flags)) { 1233 struct bio *rbi; 1234 spin_lock_irq(&sh->stripe_lock); 1235 dev->read = rbi = dev->toread; 1236 dev->toread = NULL; 1237 spin_unlock_irq(&sh->stripe_lock); 1238 while (rbi && rbi->bi_iter.bi_sector < 1239 dev->sector + STRIPE_SECTORS) { 1240 tx = async_copy_data(0, rbi, &dev->page, 1241 dev->sector, tx, sh); 1242 rbi = r5_next_bio(rbi, dev->sector); 1243 } 1244 } 1245 } 1246 1247 atomic_inc(&sh->count); 1248 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1249 async_trigger_callback(&submit); 1250 } 1251 1252 static void mark_target_uptodate(struct stripe_head *sh, int target) 1253 { 1254 struct r5dev *tgt; 1255 1256 if (target < 0) 1257 return; 1258 1259 tgt = &sh->dev[target]; 1260 set_bit(R5_UPTODATE, &tgt->flags); 1261 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1262 clear_bit(R5_Wantcompute, &tgt->flags); 1263 } 1264 1265 static void ops_complete_compute(void *stripe_head_ref) 1266 { 1267 struct stripe_head *sh = stripe_head_ref; 1268 1269 pr_debug("%s: stripe %llu\n", __func__, 1270 (unsigned long long)sh->sector); 1271 1272 /* mark the computed target(s) as uptodate */ 1273 mark_target_uptodate(sh, sh->ops.target); 1274 mark_target_uptodate(sh, sh->ops.target2); 1275 1276 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1277 if (sh->check_state == check_state_compute_run) 1278 sh->check_state = check_state_compute_result; 1279 set_bit(STRIPE_HANDLE, &sh->state); 1280 release_stripe(sh); 1281 } 1282 1283 /* return a pointer to the address conversion region of the scribble buffer */ 1284 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1285 struct raid5_percpu *percpu, int i) 1286 { 1287 void *addr; 1288 1289 addr = flex_array_get(percpu->scribble, i); 1290 return addr + sizeof(struct page *) * (sh->disks + 2); 1291 } 1292 1293 /* return a pointer to the address conversion region of the scribble buffer */ 1294 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1295 { 1296 void *addr; 1297 1298 addr = flex_array_get(percpu->scribble, i); 1299 return addr; 1300 } 1301 1302 static struct dma_async_tx_descriptor * 1303 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1304 { 1305 int disks = sh->disks; 1306 struct page **xor_srcs = to_addr_page(percpu, 0); 1307 int target = sh->ops.target; 1308 struct r5dev *tgt = &sh->dev[target]; 1309 struct page *xor_dest = tgt->page; 1310 int count = 0; 1311 struct dma_async_tx_descriptor *tx; 1312 struct async_submit_ctl submit; 1313 int i; 1314 1315 BUG_ON(sh->batch_head); 1316 1317 pr_debug("%s: stripe %llu block: %d\n", 1318 __func__, (unsigned long long)sh->sector, target); 1319 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1320 1321 for (i = disks; i--; ) 1322 if (i != target) 1323 xor_srcs[count++] = sh->dev[i].page; 1324 1325 atomic_inc(&sh->count); 1326 1327 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1328 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1329 if (unlikely(count == 1)) 1330 tx = async_memcpy(xor_dest, xor_srcs[0], 
0, 0, STRIPE_SIZE, &submit); 1331 else 1332 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1333 1334 return tx; 1335 } 1336 1337 /* set_syndrome_sources - populate source buffers for gen_syndrome 1338 * @srcs - (struct page *) array of size sh->disks 1339 * @sh - stripe_head to parse 1340 * 1341 * Populates srcs in proper layout order for the stripe and returns the 1342 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1343 * destination buffer is recorded in srcs[count] and the Q destination 1344 * is recorded in srcs[count+1]]. 1345 */ 1346 static int set_syndrome_sources(struct page **srcs, 1347 struct stripe_head *sh, 1348 int srctype) 1349 { 1350 int disks = sh->disks; 1351 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1352 int d0_idx = raid6_d0(sh); 1353 int count; 1354 int i; 1355 1356 for (i = 0; i < disks; i++) 1357 srcs[i] = NULL; 1358 1359 count = 0; 1360 i = d0_idx; 1361 do { 1362 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1363 struct r5dev *dev = &sh->dev[i]; 1364 1365 if (i == sh->qd_idx || i == sh->pd_idx || 1366 (srctype == SYNDROME_SRC_ALL) || 1367 (srctype == SYNDROME_SRC_WANT_DRAIN && 1368 test_bit(R5_Wantdrain, &dev->flags)) || 1369 (srctype == SYNDROME_SRC_WRITTEN && 1370 dev->written)) 1371 srcs[slot] = sh->dev[i].page; 1372 i = raid6_next_disk(i, disks); 1373 } while (i != d0_idx); 1374 1375 return syndrome_disks; 1376 } 1377 1378 static struct dma_async_tx_descriptor * 1379 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1380 { 1381 int disks = sh->disks; 1382 struct page **blocks = to_addr_page(percpu, 0); 1383 int target; 1384 int qd_idx = sh->qd_idx; 1385 struct dma_async_tx_descriptor *tx; 1386 struct async_submit_ctl submit; 1387 struct r5dev *tgt; 1388 struct page *dest; 1389 int i; 1390 int count; 1391 1392 BUG_ON(sh->batch_head); 1393 if (sh->ops.target < 0) 1394 target = sh->ops.target2; 1395 else if (sh->ops.target2 < 0) 1396 target = sh->ops.target; 1397 else 1398 /* we should only have one valid target */ 1399 BUG(); 1400 BUG_ON(target < 0); 1401 pr_debug("%s: stripe %llu block: %d\n", 1402 __func__, (unsigned long long)sh->sector, target); 1403 1404 tgt = &sh->dev[target]; 1405 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1406 dest = tgt->page; 1407 1408 atomic_inc(&sh->count); 1409 1410 if (target == qd_idx) { 1411 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1412 blocks[count] = NULL; /* regenerating p is not necessary */ 1413 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1414 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1415 ops_complete_compute, sh, 1416 to_addr_conv(sh, percpu, 0)); 1417 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1418 } else { 1419 /* Compute any data- or p-drive using XOR */ 1420 count = 0; 1421 for (i = disks; i-- ; ) { 1422 if (i == target || i == qd_idx) 1423 continue; 1424 blocks[count++] = sh->dev[i].page; 1425 } 1426 1427 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1428 NULL, ops_complete_compute, sh, 1429 to_addr_conv(sh, percpu, 0)); 1430 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1431 } 1432 1433 return tx; 1434 } 1435 1436 static struct dma_async_tx_descriptor * 1437 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1438 { 1439 int i, count, disks = sh->disks; 1440 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 1441 int d0_idx = raid6_d0(sh); 1442 int faila = -1, failb = -1; 1443 int target = sh->ops.target; 1444 int target2 = sh->ops.target2; 1445 struct r5dev *tgt = &sh->dev[target]; 1446 struct r5dev *tgt2 = &sh->dev[target2]; 1447 struct dma_async_tx_descriptor *tx; 1448 struct page **blocks = to_addr_page(percpu, 0); 1449 struct async_submit_ctl submit; 1450 1451 BUG_ON(sh->batch_head); 1452 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1453 __func__, (unsigned long long)sh->sector, target, target2); 1454 BUG_ON(target < 0 || target2 < 0); 1455 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1456 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1457 1458 /* we need to open-code set_syndrome_sources to handle the 1459 * slot number conversion for 'faila' and 'failb' 1460 */ 1461 for (i = 0; i < disks ; i++) 1462 blocks[i] = NULL; 1463 count = 0; 1464 i = d0_idx; 1465 do { 1466 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1467 1468 blocks[slot] = sh->dev[i].page; 1469 1470 if (i == target) 1471 faila = slot; 1472 if (i == target2) 1473 failb = slot; 1474 i = raid6_next_disk(i, disks); 1475 } while (i != d0_idx); 1476 1477 BUG_ON(faila == failb); 1478 if (failb < faila) 1479 swap(faila, failb); 1480 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1481 __func__, (unsigned long long)sh->sector, faila, failb); 1482 1483 atomic_inc(&sh->count); 1484 1485 if (failb == syndrome_disks+1) { 1486 /* Q disk is one of the missing disks */ 1487 if (faila == syndrome_disks) { 1488 /* Missing P+Q, just recompute */ 1489 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1490 ops_complete_compute, sh, 1491 to_addr_conv(sh, percpu, 0)); 1492 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1493 STRIPE_SIZE, &submit); 1494 } else { 1495 struct page *dest; 1496 int data_target; 1497 int qd_idx = sh->qd_idx; 1498 1499 /* Missing D+Q: recompute D from P, then recompute Q */ 1500 if (target == qd_idx) 1501 data_target = target2; 1502 else 1503 data_target = target; 1504 1505 count = 0; 1506 for (i = disks; i-- ; ) { 1507 if (i == data_target || i == qd_idx) 1508 continue; 1509 blocks[count++] = sh->dev[i].page; 1510 } 1511 dest = sh->dev[data_target].page; 1512 init_async_submit(&submit, 1513 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1514 NULL, NULL, NULL, 1515 to_addr_conv(sh, percpu, 0)); 1516 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1517 &submit); 1518 1519 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1520 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1521 ops_complete_compute, sh, 1522 to_addr_conv(sh, percpu, 0)); 1523 return async_gen_syndrome(blocks, 0, count+2, 1524 STRIPE_SIZE, &submit); 1525 } 1526 } else { 1527 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1528 ops_complete_compute, sh, 1529 to_addr_conv(sh, percpu, 0)); 1530 if (failb == syndrome_disks) { 1531 /* We're missing D+P. */ 1532 return async_raid6_datap_recov(syndrome_disks+2, 1533 STRIPE_SIZE, faila, 1534 blocks, &submit); 1535 } else { 1536 /* We're missing D+D. 
*/ 1537 return async_raid6_2data_recov(syndrome_disks+2, 1538 STRIPE_SIZE, faila, failb, 1539 blocks, &submit); 1540 } 1541 } 1542 } 1543 1544 static void ops_complete_prexor(void *stripe_head_ref) 1545 { 1546 struct stripe_head *sh = stripe_head_ref; 1547 1548 pr_debug("%s: stripe %llu\n", __func__, 1549 (unsigned long long)sh->sector); 1550 } 1551 1552 static struct dma_async_tx_descriptor * 1553 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1554 struct dma_async_tx_descriptor *tx) 1555 { 1556 int disks = sh->disks; 1557 struct page **xor_srcs = to_addr_page(percpu, 0); 1558 int count = 0, pd_idx = sh->pd_idx, i; 1559 struct async_submit_ctl submit; 1560 1561 /* existing parity data subtracted */ 1562 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1563 1564 BUG_ON(sh->batch_head); 1565 pr_debug("%s: stripe %llu\n", __func__, 1566 (unsigned long long)sh->sector); 1567 1568 for (i = disks; i--; ) { 1569 struct r5dev *dev = &sh->dev[i]; 1570 /* Only process blocks that are known to be uptodate */ 1571 if (test_bit(R5_Wantdrain, &dev->flags)) 1572 xor_srcs[count++] = dev->page; 1573 } 1574 1575 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1576 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1577 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1578 1579 return tx; 1580 } 1581 1582 static struct dma_async_tx_descriptor * 1583 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1584 struct dma_async_tx_descriptor *tx) 1585 { 1586 struct page **blocks = to_addr_page(percpu, 0); 1587 int count; 1588 struct async_submit_ctl submit; 1589 1590 pr_debug("%s: stripe %llu\n", __func__, 1591 (unsigned long long)sh->sector); 1592 1593 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1594 1595 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1596 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1597 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1598 1599 return tx; 1600 } 1601 1602 static struct dma_async_tx_descriptor * 1603 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1604 { 1605 int disks = sh->disks; 1606 int i; 1607 struct stripe_head *head_sh = sh; 1608 1609 pr_debug("%s: stripe %llu\n", __func__, 1610 (unsigned long long)sh->sector); 1611 1612 for (i = disks; i--; ) { 1613 struct r5dev *dev; 1614 struct bio *chosen; 1615 1616 sh = head_sh; 1617 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1618 struct bio *wbi; 1619 1620 again: 1621 dev = &sh->dev[i]; 1622 spin_lock_irq(&sh->stripe_lock); 1623 chosen = dev->towrite; 1624 dev->towrite = NULL; 1625 sh->overwrite_disks = 0; 1626 BUG_ON(dev->written); 1627 wbi = dev->written = chosen; 1628 spin_unlock_irq(&sh->stripe_lock); 1629 WARN_ON(dev->page != dev->orig_page); 1630 1631 while (wbi && wbi->bi_iter.bi_sector < 1632 dev->sector + STRIPE_SECTORS) { 1633 if (wbi->bi_rw & REQ_FUA) 1634 set_bit(R5_WantFUA, &dev->flags); 1635 if (wbi->bi_rw & REQ_SYNC) 1636 set_bit(R5_SyncIO, &dev->flags); 1637 if (wbi->bi_rw & REQ_DISCARD) 1638 set_bit(R5_Discard, &dev->flags); 1639 else { 1640 tx = async_copy_data(1, wbi, &dev->page, 1641 dev->sector, tx, sh); 1642 if (dev->page != dev->orig_page) { 1643 set_bit(R5_SkipCopy, &dev->flags); 1644 clear_bit(R5_UPTODATE, &dev->flags); 1645 clear_bit(R5_OVERWRITE, &dev->flags); 1646 } 1647 } 1648 wbi = r5_next_bio(wbi, dev->sector); 1649 } 1650 1651 if (head_sh->batch_head) { 1652 sh = 
list_first_entry(&sh->batch_list, 1653 struct stripe_head, 1654 batch_list); 1655 if (sh == head_sh) 1656 continue; 1657 goto again; 1658 } 1659 } 1660 } 1661 1662 return tx; 1663 } 1664 1665 static void ops_complete_reconstruct(void *stripe_head_ref) 1666 { 1667 struct stripe_head *sh = stripe_head_ref; 1668 int disks = sh->disks; 1669 int pd_idx = sh->pd_idx; 1670 int qd_idx = sh->qd_idx; 1671 int i; 1672 bool fua = false, sync = false, discard = false; 1673 1674 pr_debug("%s: stripe %llu\n", __func__, 1675 (unsigned long long)sh->sector); 1676 1677 for (i = disks; i--; ) { 1678 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1679 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1680 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1681 } 1682 1683 for (i = disks; i--; ) { 1684 struct r5dev *dev = &sh->dev[i]; 1685 1686 if (dev->written || i == pd_idx || i == qd_idx) { 1687 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1688 set_bit(R5_UPTODATE, &dev->flags); 1689 if (fua) 1690 set_bit(R5_WantFUA, &dev->flags); 1691 if (sync) 1692 set_bit(R5_SyncIO, &dev->flags); 1693 } 1694 } 1695 1696 if (sh->reconstruct_state == reconstruct_state_drain_run) 1697 sh->reconstruct_state = reconstruct_state_drain_result; 1698 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1699 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1700 else { 1701 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1702 sh->reconstruct_state = reconstruct_state_result; 1703 } 1704 1705 set_bit(STRIPE_HANDLE, &sh->state); 1706 release_stripe(sh); 1707 } 1708 1709 static void 1710 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1711 struct dma_async_tx_descriptor *tx) 1712 { 1713 int disks = sh->disks; 1714 struct page **xor_srcs; 1715 struct async_submit_ctl submit; 1716 int count, pd_idx = sh->pd_idx, i; 1717 struct page *xor_dest; 1718 int prexor = 0; 1719 unsigned long flags; 1720 int j = 0; 1721 struct stripe_head *head_sh = sh; 1722 int last_stripe; 1723 1724 pr_debug("%s: stripe %llu\n", __func__, 1725 (unsigned long long)sh->sector); 1726 1727 for (i = 0; i < sh->disks; i++) { 1728 if (pd_idx == i) 1729 continue; 1730 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1731 break; 1732 } 1733 if (i >= sh->disks) { 1734 atomic_inc(&sh->count); 1735 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1736 ops_complete_reconstruct(sh); 1737 return; 1738 } 1739 again: 1740 count = 0; 1741 xor_srcs = to_addr_page(percpu, j); 1742 /* check if prexor is active which means only process blocks 1743 * that are part of a read-modify-write (written) 1744 */ 1745 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1746 prexor = 1; 1747 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1748 for (i = disks; i--; ) { 1749 struct r5dev *dev = &sh->dev[i]; 1750 if (head_sh->dev[i].written) 1751 xor_srcs[count++] = dev->page; 1752 } 1753 } else { 1754 xor_dest = sh->dev[pd_idx].page; 1755 for (i = disks; i--; ) { 1756 struct r5dev *dev = &sh->dev[i]; 1757 if (i != pd_idx) 1758 xor_srcs[count++] = dev->page; 1759 } 1760 } 1761 1762 /* 1/ if we prexor'd then the dest is reused as a source 1763 * 2/ if we did not prexor then we are redoing the parity 1764 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1765 * for the synchronous xor case 1766 */ 1767 last_stripe = !head_sh->batch_head || 1768 list_first_entry(&sh->batch_list, 1769 struct stripe_head, batch_list) == head_sh; 1770 if (last_stripe) { 1771 flags = ASYNC_TX_ACK | 1772 (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1773 1774 atomic_inc(&head_sh->count); 1775 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1776 to_addr_conv(sh, percpu, j)); 1777 } else { 1778 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1779 init_async_submit(&submit, flags, tx, NULL, NULL, 1780 to_addr_conv(sh, percpu, j)); 1781 } 1782 1783 if (unlikely(count == 1)) 1784 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1785 else 1786 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1787 if (!last_stripe) { 1788 j++; 1789 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1790 batch_list); 1791 goto again; 1792 } 1793 } 1794 1795 static void 1796 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1797 struct dma_async_tx_descriptor *tx) 1798 { 1799 struct async_submit_ctl submit; 1800 struct page **blocks; 1801 int count, i, j = 0; 1802 struct stripe_head *head_sh = sh; 1803 int last_stripe; 1804 int synflags; 1805 unsigned long txflags; 1806 1807 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1808 1809 for (i = 0; i < sh->disks; i++) { 1810 if (sh->pd_idx == i || sh->qd_idx == i) 1811 continue; 1812 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1813 break; 1814 } 1815 if (i >= sh->disks) { 1816 atomic_inc(&sh->count); 1817 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1818 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1819 ops_complete_reconstruct(sh); 1820 return; 1821 } 1822 1823 again: 1824 blocks = to_addr_page(percpu, j); 1825 1826 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1827 synflags = SYNDROME_SRC_WRITTEN; 1828 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1829 } else { 1830 synflags = SYNDROME_SRC_ALL; 1831 txflags = ASYNC_TX_ACK; 1832 } 1833 1834 count = set_syndrome_sources(blocks, sh, synflags); 1835 last_stripe = !head_sh->batch_head || 1836 list_first_entry(&sh->batch_list, 1837 struct stripe_head, batch_list) == head_sh; 1838 1839 if (last_stripe) { 1840 atomic_inc(&head_sh->count); 1841 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1842 head_sh, to_addr_conv(sh, percpu, j)); 1843 } else 1844 init_async_submit(&submit, 0, tx, NULL, NULL, 1845 to_addr_conv(sh, percpu, j)); 1846 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1847 if (!last_stripe) { 1848 j++; 1849 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1850 batch_list); 1851 goto again; 1852 } 1853 } 1854 1855 static void ops_complete_check(void *stripe_head_ref) 1856 { 1857 struct stripe_head *sh = stripe_head_ref; 1858 1859 pr_debug("%s: stripe %llu\n", __func__, 1860 (unsigned long long)sh->sector); 1861 1862 sh->check_state = check_state_check_result; 1863 set_bit(STRIPE_HANDLE, &sh->state); 1864 release_stripe(sh); 1865 } 1866 1867 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1868 { 1869 int disks = sh->disks; 1870 int pd_idx = sh->pd_idx; 1871 int qd_idx = sh->qd_idx; 1872 struct page *xor_dest; 1873 struct page **xor_srcs = to_addr_page(percpu, 0); 1874 struct dma_async_tx_descriptor *tx; 1875 struct async_submit_ctl submit; 1876 int count; 1877 int i; 1878 1879 pr_debug("%s: stripe %llu\n", __func__, 1880 (unsigned long long)sh->sector); 1881 1882 BUG_ON(sh->batch_head); 1883 count = 0; 1884 xor_dest = sh->dev[pd_idx].page; 1885 xor_srcs[count++] = xor_dest; 1886 for (i = disks; i--; ) { 1887 if (i == pd_idx || i == qd_idx) 1888 continue; 1889 
xor_srcs[count++] = sh->dev[i].page; 1890 } 1891 1892 init_async_submit(&submit, 0, NULL, NULL, NULL, 1893 to_addr_conv(sh, percpu, 0)); 1894 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1895 &sh->ops.zero_sum_result, &submit); 1896 1897 atomic_inc(&sh->count); 1898 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1899 tx = async_trigger_callback(&submit); 1900 } 1901 1902 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1903 { 1904 struct page **srcs = to_addr_page(percpu, 0); 1905 struct async_submit_ctl submit; 1906 int count; 1907 1908 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1909 (unsigned long long)sh->sector, checkp); 1910 1911 BUG_ON(sh->batch_head); 1912 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 1913 if (!checkp) 1914 srcs[count] = NULL; 1915 1916 atomic_inc(&sh->count); 1917 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1918 sh, to_addr_conv(sh, percpu, 0)); 1919 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1920 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1921 } 1922 1923 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1924 { 1925 int overlap_clear = 0, i, disks = sh->disks; 1926 struct dma_async_tx_descriptor *tx = NULL; 1927 struct r5conf *conf = sh->raid_conf; 1928 int level = conf->level; 1929 struct raid5_percpu *percpu; 1930 unsigned long cpu; 1931 1932 cpu = get_cpu(); 1933 percpu = per_cpu_ptr(conf->percpu, cpu); 1934 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1935 ops_run_biofill(sh); 1936 overlap_clear++; 1937 } 1938 1939 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1940 if (level < 6) 1941 tx = ops_run_compute5(sh, percpu); 1942 else { 1943 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1944 tx = ops_run_compute6_1(sh, percpu); 1945 else 1946 tx = ops_run_compute6_2(sh, percpu); 1947 } 1948 /* terminate the chain if reconstruct is not set to be run */ 1949 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1950 async_tx_ack(tx); 1951 } 1952 1953 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 1954 if (level < 6) 1955 tx = ops_run_prexor5(sh, percpu, tx); 1956 else 1957 tx = ops_run_prexor6(sh, percpu, tx); 1958 } 1959 1960 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1961 tx = ops_run_biodrain(sh, tx); 1962 overlap_clear++; 1963 } 1964 1965 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1966 if (level < 6) 1967 ops_run_reconstruct5(sh, percpu, tx); 1968 else 1969 ops_run_reconstruct6(sh, percpu, tx); 1970 } 1971 1972 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1973 if (sh->check_state == check_state_run) 1974 ops_run_check_p(sh, percpu); 1975 else if (sh->check_state == check_state_run_q) 1976 ops_run_check_pq(sh, percpu, 0); 1977 else if (sh->check_state == check_state_run_pq) 1978 ops_run_check_pq(sh, percpu, 1); 1979 else 1980 BUG(); 1981 } 1982 1983 if (overlap_clear && !sh->batch_head) 1984 for (i = disks; i--; ) { 1985 struct r5dev *dev = &sh->dev[i]; 1986 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1987 wake_up(&sh->raid_conf->wait_for_overlap); 1988 } 1989 put_cpu(); 1990 } 1991 1992 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp) 1993 { 1994 struct stripe_head *sh; 1995 1996 sh = kmem_cache_zalloc(sc, gfp); 1997 if (sh) { 1998 spin_lock_init(&sh->stripe_lock); 1999 spin_lock_init(&sh->batch_lock); 2000 INIT_LIST_HEAD(&sh->batch_list); 2001 INIT_LIST_HEAD(&sh->lru); 2002 atomic_set(&sh->count, 1); 2003 } 2004 return 
sh; 2005 } 2006 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2007 { 2008 struct stripe_head *sh; 2009 2010 sh = alloc_stripe(conf->slab_cache, gfp); 2011 if (!sh) 2012 return 0; 2013 2014 sh->raid_conf = conf; 2015 2016 if (grow_buffers(sh, gfp)) { 2017 shrink_buffers(sh); 2018 kmem_cache_free(conf->slab_cache, sh); 2019 return 0; 2020 } 2021 sh->hash_lock_index = 2022 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2023 /* we just created an active stripe so... */ 2024 atomic_inc(&conf->active_stripes); 2025 2026 release_stripe(sh); 2027 conf->max_nr_stripes++; 2028 return 1; 2029 } 2030 2031 static int grow_stripes(struct r5conf *conf, int num) 2032 { 2033 struct kmem_cache *sc; 2034 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2035 2036 if (conf->mddev->gendisk) 2037 sprintf(conf->cache_name[0], 2038 "raid%d-%s", conf->level, mdname(conf->mddev)); 2039 else 2040 sprintf(conf->cache_name[0], 2041 "raid%d-%p", conf->level, conf->mddev); 2042 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2043 2044 conf->active_name = 0; 2045 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2046 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2047 0, 0, NULL); 2048 if (!sc) 2049 return 1; 2050 conf->slab_cache = sc; 2051 conf->pool_size = devs; 2052 while (num--) 2053 if (!grow_one_stripe(conf, GFP_KERNEL)) 2054 return 1; 2055 2056 return 0; 2057 } 2058 2059 /** 2060 * scribble_len - return the required size of the scribble region 2061 * @num - total number of disks in the array 2062 * 2063 * The size must be enough to contain: 2064 * 1/ a struct page pointer for each device in the array +2 2065 * 2/ room to convert each entry in (1) to its corresponding dma 2066 * (dma_map_page()) or page (page_address()) address. 2067 * 2068 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2069 * calculate over all devices (not just the data blocks), using zeros in place 2070 * of the P and Q blocks. 2071 */ 2072 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2073 { 2074 struct flex_array *ret; 2075 size_t len; 2076 2077 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2078 ret = flex_array_alloc(len, cnt, flags); 2079 if (!ret) 2080 return NULL; 2081 /* always prealloc all elements, so no locking is required */ 2082 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2083 flex_array_free(ret); 2084 return NULL; 2085 } 2086 return ret; 2087 } 2088 2089 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2090 { 2091 unsigned long cpu; 2092 int err = 0; 2093 2094 mddev_suspend(conf->mddev); 2095 get_online_cpus(); 2096 for_each_present_cpu(cpu) { 2097 struct raid5_percpu *percpu; 2098 struct flex_array *scribble; 2099 2100 percpu = per_cpu_ptr(conf->percpu, cpu); 2101 scribble = scribble_alloc(new_disks, 2102 new_sectors / STRIPE_SECTORS, 2103 GFP_NOIO); 2104 2105 if (scribble) { 2106 flex_array_free(percpu->scribble); 2107 percpu->scribble = scribble; 2108 } else { 2109 err = -ENOMEM; 2110 break; 2111 } 2112 } 2113 put_online_cpus(); 2114 mddev_resume(conf->mddev); 2115 return err; 2116 } 2117 2118 static int resize_stripes(struct r5conf *conf, int newsize) 2119 { 2120 /* Make all the stripes able to hold 'newsize' devices. 2121 * New slots in each stripe get 'page' set to a new page. 2122 * 2123 * This happens in stages: 2124 * 1/ create a new kmem_cache and allocate the required number of 2125 * stripe_heads. 
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step 2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	int err;
	struct kmem_cache *sc;
	int i;
	int hash, cnt;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
		       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
		       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	/* Need to ensure auto-resizing doesn't interfere */
	mutex_lock(&conf->cache_size_mutex);

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = alloc_stripe(sc, GFP_KERNEL);
		if (!nsh)
			break;

		nsh->raid_conf = conf;
		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		mutex_unlock(&conf->cache_size_mutex);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	hash = 0;
	cnt = 0;
	list_for_each_entry(nsh, &newstripes, lru) {
		lock_device_hash_lock(conf, hash);
		wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
				    !list_empty(conf->inactive_list + hash),
				    unlock_device_hash_lock(conf, hash),
				    lock_device_hash_lock(conf, hash));
		osh = get_free_stripe(conf, hash);
		unlock_device_hash_lock(conf, hash);

		for(i=0; i<conf->pool_size; i++) {
			nsh->dev[i].page = osh->dev[i].page;
			nsh->dev[i].orig_page = osh->dev[i].page;
		}
		nsh->hash_lock_index = hash;
		kmem_cache_free(conf->slab_cache, osh);
		cnt++;
		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
			hash++;
			cnt = 0;
		}
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
2218 * At this point, we are holding all the stripes so the array 2219 * is completely stalled, so now is a good time to resize 2220 * conf->disks and the scribble region 2221 */ 2222 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2223 if (ndisks) { 2224 for (i=0; i<conf->raid_disks; i++) 2225 ndisks[i] = conf->disks[i]; 2226 kfree(conf->disks); 2227 conf->disks = ndisks; 2228 } else 2229 err = -ENOMEM; 2230 2231 mutex_unlock(&conf->cache_size_mutex); 2232 /* Step 4, return new stripes to service */ 2233 while(!list_empty(&newstripes)) { 2234 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2235 list_del_init(&nsh->lru); 2236 2237 for (i=conf->raid_disks; i < newsize; i++) 2238 if (nsh->dev[i].page == NULL) { 2239 struct page *p = alloc_page(GFP_NOIO); 2240 nsh->dev[i].page = p; 2241 nsh->dev[i].orig_page = p; 2242 if (!p) 2243 err = -ENOMEM; 2244 } 2245 release_stripe(nsh); 2246 } 2247 /* critical section pass, GFP_NOIO no longer needed */ 2248 2249 conf->slab_cache = sc; 2250 conf->active_name = 1-conf->active_name; 2251 if (!err) 2252 conf->pool_size = newsize; 2253 return err; 2254 } 2255 2256 static int drop_one_stripe(struct r5conf *conf) 2257 { 2258 struct stripe_head *sh; 2259 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2260 2261 spin_lock_irq(conf->hash_locks + hash); 2262 sh = get_free_stripe(conf, hash); 2263 spin_unlock_irq(conf->hash_locks + hash); 2264 if (!sh) 2265 return 0; 2266 BUG_ON(atomic_read(&sh->count)); 2267 shrink_buffers(sh); 2268 kmem_cache_free(conf->slab_cache, sh); 2269 atomic_dec(&conf->active_stripes); 2270 conf->max_nr_stripes--; 2271 return 1; 2272 } 2273 2274 static void shrink_stripes(struct r5conf *conf) 2275 { 2276 while (conf->max_nr_stripes && 2277 drop_one_stripe(conf)) 2278 ; 2279 2280 if (conf->slab_cache) 2281 kmem_cache_destroy(conf->slab_cache); 2282 conf->slab_cache = NULL; 2283 } 2284 2285 static void raid5_end_read_request(struct bio * bi, int error) 2286 { 2287 struct stripe_head *sh = bi->bi_private; 2288 struct r5conf *conf = sh->raid_conf; 2289 int disks = sh->disks, i; 2290 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 2291 char b[BDEVNAME_SIZE]; 2292 struct md_rdev *rdev = NULL; 2293 sector_t s; 2294 2295 for (i=0 ; i<disks; i++) 2296 if (bi == &sh->dev[i].req) 2297 break; 2298 2299 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 2300 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2301 uptodate); 2302 if (i == disks) { 2303 BUG(); 2304 return; 2305 } 2306 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2307 /* If replacement finished while this request was outstanding, 2308 * 'replacement' might be NULL already. 2309 * In that case it moved down to 'rdev'. 2310 * rdev is not removed until all requests are finished. 2311 */ 2312 rdev = conf->disks[i].replacement; 2313 if (!rdev) 2314 rdev = conf->disks[i].rdev; 2315 2316 if (use_new_offset(conf, sh)) 2317 s = sh->sector + rdev->new_data_offset; 2318 else 2319 s = sh->sector + rdev->data_offset; 2320 if (uptodate) { 2321 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2322 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2323 /* Note that this cannot happen on a 2324 * replacement device. 
We just fail those on 2325 * any error 2326 */ 2327 printk_ratelimited( 2328 KERN_INFO 2329 "md/raid:%s: read error corrected" 2330 " (%lu sectors at %llu on %s)\n", 2331 mdname(conf->mddev), STRIPE_SECTORS, 2332 (unsigned long long)s, 2333 bdevname(rdev->bdev, b)); 2334 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2335 clear_bit(R5_ReadError, &sh->dev[i].flags); 2336 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2337 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2338 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2339 2340 if (atomic_read(&rdev->read_errors)) 2341 atomic_set(&rdev->read_errors, 0); 2342 } else { 2343 const char *bdn = bdevname(rdev->bdev, b); 2344 int retry = 0; 2345 int set_bad = 0; 2346 2347 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2348 atomic_inc(&rdev->read_errors); 2349 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2350 printk_ratelimited( 2351 KERN_WARNING 2352 "md/raid:%s: read error on replacement device " 2353 "(sector %llu on %s).\n", 2354 mdname(conf->mddev), 2355 (unsigned long long)s, 2356 bdn); 2357 else if (conf->mddev->degraded >= conf->max_degraded) { 2358 set_bad = 1; 2359 printk_ratelimited( 2360 KERN_WARNING 2361 "md/raid:%s: read error not correctable " 2362 "(sector %llu on %s).\n", 2363 mdname(conf->mddev), 2364 (unsigned long long)s, 2365 bdn); 2366 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2367 /* Oh, no!!! */ 2368 set_bad = 1; 2369 printk_ratelimited( 2370 KERN_WARNING 2371 "md/raid:%s: read error NOT corrected!! " 2372 "(sector %llu on %s).\n", 2373 mdname(conf->mddev), 2374 (unsigned long long)s, 2375 bdn); 2376 } else if (atomic_read(&rdev->read_errors) 2377 > conf->max_nr_stripes) 2378 printk(KERN_WARNING 2379 "md/raid:%s: Too many read errors, failing device %s.\n", 2380 mdname(conf->mddev), bdn); 2381 else 2382 retry = 1; 2383 if (set_bad && test_bit(In_sync, &rdev->flags) 2384 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2385 retry = 1; 2386 if (retry) 2387 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2388 set_bit(R5_ReadError, &sh->dev[i].flags); 2389 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2390 } else 2391 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2392 else { 2393 clear_bit(R5_ReadError, &sh->dev[i].flags); 2394 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2395 if (!(set_bad 2396 && test_bit(In_sync, &rdev->flags) 2397 && rdev_set_badblocks( 2398 rdev, sh->sector, STRIPE_SECTORS, 0))) 2399 md_error(conf->mddev, rdev); 2400 } 2401 } 2402 rdev_dec_pending(rdev, conf->mddev); 2403 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2404 set_bit(STRIPE_HANDLE, &sh->state); 2405 release_stripe(sh); 2406 } 2407 2408 static void raid5_end_write_request(struct bio *bi, int error) 2409 { 2410 struct stripe_head *sh = bi->bi_private; 2411 struct r5conf *conf = sh->raid_conf; 2412 int disks = sh->disks, i; 2413 struct md_rdev *uninitialized_var(rdev); 2414 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 2415 sector_t first_bad; 2416 int bad_sectors; 2417 int replacement = 0; 2418 2419 for (i = 0 ; i < disks; i++) { 2420 if (bi == &sh->dev[i].req) { 2421 rdev = conf->disks[i].rdev; 2422 break; 2423 } 2424 if (bi == &sh->dev[i].rreq) { 2425 rdev = conf->disks[i].replacement; 2426 if (rdev) 2427 replacement = 1; 2428 else 2429 /* rdev was removed and 'replacement' 2430 * replaced it. rdev is not removed 2431 * until all requests are finished. 
2432 */ 2433 rdev = conf->disks[i].rdev; 2434 break; 2435 } 2436 } 2437 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 2438 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2439 uptodate); 2440 if (i == disks) { 2441 BUG(); 2442 return; 2443 } 2444 2445 if (replacement) { 2446 if (!uptodate) 2447 md_error(conf->mddev, rdev); 2448 else if (is_badblock(rdev, sh->sector, 2449 STRIPE_SECTORS, 2450 &first_bad, &bad_sectors)) 2451 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2452 } else { 2453 if (!uptodate) { 2454 set_bit(STRIPE_DEGRADED, &sh->state); 2455 set_bit(WriteErrorSeen, &rdev->flags); 2456 set_bit(R5_WriteError, &sh->dev[i].flags); 2457 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2458 set_bit(MD_RECOVERY_NEEDED, 2459 &rdev->mddev->recovery); 2460 } else if (is_badblock(rdev, sh->sector, 2461 STRIPE_SECTORS, 2462 &first_bad, &bad_sectors)) { 2463 set_bit(R5_MadeGood, &sh->dev[i].flags); 2464 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2465 /* That was a successful write so make 2466 * sure it looks like we already did 2467 * a re-write. 2468 */ 2469 set_bit(R5_ReWrite, &sh->dev[i].flags); 2470 } 2471 } 2472 rdev_dec_pending(rdev, conf->mddev); 2473 2474 if (sh->batch_head && !uptodate && !replacement) 2475 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2476 2477 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2478 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2479 set_bit(STRIPE_HANDLE, &sh->state); 2480 release_stripe(sh); 2481 2482 if (sh->batch_head && sh != sh->batch_head) 2483 release_stripe(sh->batch_head); 2484 } 2485 2486 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 2487 2488 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2489 { 2490 struct r5dev *dev = &sh->dev[i]; 2491 2492 bio_init(&dev->req); 2493 dev->req.bi_io_vec = &dev->vec; 2494 dev->req.bi_max_vecs = 1; 2495 dev->req.bi_private = sh; 2496 2497 bio_init(&dev->rreq); 2498 dev->rreq.bi_io_vec = &dev->rvec; 2499 dev->rreq.bi_max_vecs = 1; 2500 dev->rreq.bi_private = sh; 2501 2502 dev->flags = 0; 2503 dev->sector = compute_blocknr(sh, i, previous); 2504 } 2505 2506 static void error(struct mddev *mddev, struct md_rdev *rdev) 2507 { 2508 char b[BDEVNAME_SIZE]; 2509 struct r5conf *conf = mddev->private; 2510 unsigned long flags; 2511 pr_debug("raid456: error called\n"); 2512 2513 spin_lock_irqsave(&conf->device_lock, flags); 2514 clear_bit(In_sync, &rdev->flags); 2515 mddev->degraded = calc_degraded(conf); 2516 spin_unlock_irqrestore(&conf->device_lock, flags); 2517 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2518 2519 set_bit(Blocked, &rdev->flags); 2520 set_bit(Faulty, &rdev->flags); 2521 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2522 printk(KERN_ALERT 2523 "md/raid:%s: Disk failure on %s, disabling device.\n" 2524 "md/raid:%s: Operation continuing on %d devices.\n", 2525 mdname(mddev), 2526 bdevname(rdev->bdev, b), 2527 mdname(mddev), 2528 conf->raid_disks - mddev->degraded); 2529 } 2530 2531 /* 2532 * Input: a 'big' sector number, 2533 * Output: index of the data and parity disk, and the sector # in them. 2534 */ 2535 static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2536 int previous, int *dd_idx, 2537 struct stripe_head *sh) 2538 { 2539 sector_t stripe, stripe2; 2540 sector_t chunk_number; 2541 unsigned int chunk_offset; 2542 int pd_idx, qd_idx; 2543 int ddf_layout = 0; 2544 sector_t new_sector; 2545 int algorithm = previous ? 
conf->prev_algo 2546 : conf->algorithm; 2547 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2548 : conf->chunk_sectors; 2549 int raid_disks = previous ? conf->previous_raid_disks 2550 : conf->raid_disks; 2551 int data_disks = raid_disks - conf->max_degraded; 2552 2553 /* First compute the information on this sector */ 2554 2555 /* 2556 * Compute the chunk number and the sector offset inside the chunk 2557 */ 2558 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2559 chunk_number = r_sector; 2560 2561 /* 2562 * Compute the stripe number 2563 */ 2564 stripe = chunk_number; 2565 *dd_idx = sector_div(stripe, data_disks); 2566 stripe2 = stripe; 2567 /* 2568 * Select the parity disk based on the user selected algorithm. 2569 */ 2570 pd_idx = qd_idx = -1; 2571 switch(conf->level) { 2572 case 4: 2573 pd_idx = data_disks; 2574 break; 2575 case 5: 2576 switch (algorithm) { 2577 case ALGORITHM_LEFT_ASYMMETRIC: 2578 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2579 if (*dd_idx >= pd_idx) 2580 (*dd_idx)++; 2581 break; 2582 case ALGORITHM_RIGHT_ASYMMETRIC: 2583 pd_idx = sector_div(stripe2, raid_disks); 2584 if (*dd_idx >= pd_idx) 2585 (*dd_idx)++; 2586 break; 2587 case ALGORITHM_LEFT_SYMMETRIC: 2588 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2589 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2590 break; 2591 case ALGORITHM_RIGHT_SYMMETRIC: 2592 pd_idx = sector_div(stripe2, raid_disks); 2593 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2594 break; 2595 case ALGORITHM_PARITY_0: 2596 pd_idx = 0; 2597 (*dd_idx)++; 2598 break; 2599 case ALGORITHM_PARITY_N: 2600 pd_idx = data_disks; 2601 break; 2602 default: 2603 BUG(); 2604 } 2605 break; 2606 case 6: 2607 2608 switch (algorithm) { 2609 case ALGORITHM_LEFT_ASYMMETRIC: 2610 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2611 qd_idx = pd_idx + 1; 2612 if (pd_idx == raid_disks-1) { 2613 (*dd_idx)++; /* Q D D D P */ 2614 qd_idx = 0; 2615 } else if (*dd_idx >= pd_idx) 2616 (*dd_idx) += 2; /* D D P Q D */ 2617 break; 2618 case ALGORITHM_RIGHT_ASYMMETRIC: 2619 pd_idx = sector_div(stripe2, raid_disks); 2620 qd_idx = pd_idx + 1; 2621 if (pd_idx == raid_disks-1) { 2622 (*dd_idx)++; /* Q D D D P */ 2623 qd_idx = 0; 2624 } else if (*dd_idx >= pd_idx) 2625 (*dd_idx) += 2; /* D D P Q D */ 2626 break; 2627 case ALGORITHM_LEFT_SYMMETRIC: 2628 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2629 qd_idx = (pd_idx + 1) % raid_disks; 2630 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2631 break; 2632 case ALGORITHM_RIGHT_SYMMETRIC: 2633 pd_idx = sector_div(stripe2, raid_disks); 2634 qd_idx = (pd_idx + 1) % raid_disks; 2635 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2636 break; 2637 2638 case ALGORITHM_PARITY_0: 2639 pd_idx = 0; 2640 qd_idx = 1; 2641 (*dd_idx) += 2; 2642 break; 2643 case ALGORITHM_PARITY_N: 2644 pd_idx = data_disks; 2645 qd_idx = data_disks + 1; 2646 break; 2647 2648 case ALGORITHM_ROTATING_ZERO_RESTART: 2649 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2650 * of blocks for computing Q is different. 
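			 * (Illustration only, assuming a hypothetical 5-device
			 * array: placement follows RIGHT_ASYMMETRIC, i.e.
			 * stripe 0 lays out as P Q D D D, stripe 1 as
			 * D P Q D D, ... and stripe 4 as Q D D D P; the
			 * differing order used to compute Q is what the
			 * ddf_layout flag set below records.)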
2651 */ 2652 pd_idx = sector_div(stripe2, raid_disks); 2653 qd_idx = pd_idx + 1; 2654 if (pd_idx == raid_disks-1) { 2655 (*dd_idx)++; /* Q D D D P */ 2656 qd_idx = 0; 2657 } else if (*dd_idx >= pd_idx) 2658 (*dd_idx) += 2; /* D D P Q D */ 2659 ddf_layout = 1; 2660 break; 2661 2662 case ALGORITHM_ROTATING_N_RESTART: 2663 /* Same a left_asymmetric, by first stripe is 2664 * D D D P Q rather than 2665 * Q D D D P 2666 */ 2667 stripe2 += 1; 2668 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2669 qd_idx = pd_idx + 1; 2670 if (pd_idx == raid_disks-1) { 2671 (*dd_idx)++; /* Q D D D P */ 2672 qd_idx = 0; 2673 } else if (*dd_idx >= pd_idx) 2674 (*dd_idx) += 2; /* D D P Q D */ 2675 ddf_layout = 1; 2676 break; 2677 2678 case ALGORITHM_ROTATING_N_CONTINUE: 2679 /* Same as left_symmetric but Q is before P */ 2680 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2681 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2682 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2683 ddf_layout = 1; 2684 break; 2685 2686 case ALGORITHM_LEFT_ASYMMETRIC_6: 2687 /* RAID5 left_asymmetric, with Q on last device */ 2688 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2689 if (*dd_idx >= pd_idx) 2690 (*dd_idx)++; 2691 qd_idx = raid_disks - 1; 2692 break; 2693 2694 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2695 pd_idx = sector_div(stripe2, raid_disks-1); 2696 if (*dd_idx >= pd_idx) 2697 (*dd_idx)++; 2698 qd_idx = raid_disks - 1; 2699 break; 2700 2701 case ALGORITHM_LEFT_SYMMETRIC_6: 2702 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2703 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2704 qd_idx = raid_disks - 1; 2705 break; 2706 2707 case ALGORITHM_RIGHT_SYMMETRIC_6: 2708 pd_idx = sector_div(stripe2, raid_disks-1); 2709 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2710 qd_idx = raid_disks - 1; 2711 break; 2712 2713 case ALGORITHM_PARITY_0_6: 2714 pd_idx = 0; 2715 (*dd_idx)++; 2716 qd_idx = raid_disks - 1; 2717 break; 2718 2719 default: 2720 BUG(); 2721 } 2722 break; 2723 } 2724 2725 if (sh) { 2726 sh->pd_idx = pd_idx; 2727 sh->qd_idx = qd_idx; 2728 sh->ddf_layout = ddf_layout; 2729 } 2730 /* 2731 * Finally, compute the new sector number 2732 */ 2733 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2734 return new_sector; 2735 } 2736 2737 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 2738 { 2739 struct r5conf *conf = sh->raid_conf; 2740 int raid_disks = sh->disks; 2741 int data_disks = raid_disks - conf->max_degraded; 2742 sector_t new_sector = sh->sector, check; 2743 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2744 : conf->chunk_sectors; 2745 int algorithm = previous ? 
conf->prev_algo 2746 : conf->algorithm; 2747 sector_t stripe; 2748 int chunk_offset; 2749 sector_t chunk_number; 2750 int dummy1, dd_idx = i; 2751 sector_t r_sector; 2752 struct stripe_head sh2; 2753 2754 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2755 stripe = new_sector; 2756 2757 if (i == sh->pd_idx) 2758 return 0; 2759 switch(conf->level) { 2760 case 4: break; 2761 case 5: 2762 switch (algorithm) { 2763 case ALGORITHM_LEFT_ASYMMETRIC: 2764 case ALGORITHM_RIGHT_ASYMMETRIC: 2765 if (i > sh->pd_idx) 2766 i--; 2767 break; 2768 case ALGORITHM_LEFT_SYMMETRIC: 2769 case ALGORITHM_RIGHT_SYMMETRIC: 2770 if (i < sh->pd_idx) 2771 i += raid_disks; 2772 i -= (sh->pd_idx + 1); 2773 break; 2774 case ALGORITHM_PARITY_0: 2775 i -= 1; 2776 break; 2777 case ALGORITHM_PARITY_N: 2778 break; 2779 default: 2780 BUG(); 2781 } 2782 break; 2783 case 6: 2784 if (i == sh->qd_idx) 2785 return 0; /* It is the Q disk */ 2786 switch (algorithm) { 2787 case ALGORITHM_LEFT_ASYMMETRIC: 2788 case ALGORITHM_RIGHT_ASYMMETRIC: 2789 case ALGORITHM_ROTATING_ZERO_RESTART: 2790 case ALGORITHM_ROTATING_N_RESTART: 2791 if (sh->pd_idx == raid_disks-1) 2792 i--; /* Q D D D P */ 2793 else if (i > sh->pd_idx) 2794 i -= 2; /* D D P Q D */ 2795 break; 2796 case ALGORITHM_LEFT_SYMMETRIC: 2797 case ALGORITHM_RIGHT_SYMMETRIC: 2798 if (sh->pd_idx == raid_disks-1) 2799 i--; /* Q D D D P */ 2800 else { 2801 /* D D P Q D */ 2802 if (i < sh->pd_idx) 2803 i += raid_disks; 2804 i -= (sh->pd_idx + 2); 2805 } 2806 break; 2807 case ALGORITHM_PARITY_0: 2808 i -= 2; 2809 break; 2810 case ALGORITHM_PARITY_N: 2811 break; 2812 case ALGORITHM_ROTATING_N_CONTINUE: 2813 /* Like left_symmetric, but P is before Q */ 2814 if (sh->pd_idx == 0) 2815 i--; /* P D D D Q */ 2816 else { 2817 /* D D Q P D */ 2818 if (i < sh->pd_idx) 2819 i += raid_disks; 2820 i -= (sh->pd_idx + 1); 2821 } 2822 break; 2823 case ALGORITHM_LEFT_ASYMMETRIC_6: 2824 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2825 if (i > sh->pd_idx) 2826 i--; 2827 break; 2828 case ALGORITHM_LEFT_SYMMETRIC_6: 2829 case ALGORITHM_RIGHT_SYMMETRIC_6: 2830 if (i < sh->pd_idx) 2831 i += data_disks + 1; 2832 i -= (sh->pd_idx + 1); 2833 break; 2834 case ALGORITHM_PARITY_0_6: 2835 i -= 1; 2836 break; 2837 default: 2838 BUG(); 2839 } 2840 break; 2841 } 2842 2843 chunk_number = stripe * data_disks + i; 2844 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2845 2846 check = raid5_compute_sector(conf, r_sector, 2847 previous, &dummy1, &sh2); 2848 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2849 || sh2.qd_idx != sh->qd_idx) { 2850 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 2851 mdname(conf->mddev)); 2852 return 0; 2853 } 2854 return r_sector; 2855 } 2856 2857 static void 2858 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2859 int rcw, int expand) 2860 { 2861 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 2862 struct r5conf *conf = sh->raid_conf; 2863 int level = conf->level; 2864 2865 if (rcw) { 2866 2867 for (i = disks; i--; ) { 2868 struct r5dev *dev = &sh->dev[i]; 2869 2870 if (dev->towrite) { 2871 set_bit(R5_LOCKED, &dev->flags); 2872 set_bit(R5_Wantdrain, &dev->flags); 2873 if (!expand) 2874 clear_bit(R5_UPTODATE, &dev->flags); 2875 s->locked++; 2876 } 2877 } 2878 /* if we are not expanding this is a proper write request, and 2879 * there will be bios with new data to be drained into the 2880 * stripe cache 2881 */ 2882 if (!expand) { 2883 if (!s->locked) 2884 /* False alarm, nothing to do */ 
2885 return; 2886 sh->reconstruct_state = reconstruct_state_drain_run; 2887 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2888 } else 2889 sh->reconstruct_state = reconstruct_state_run; 2890 2891 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2892 2893 if (s->locked + conf->max_degraded == disks) 2894 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2895 atomic_inc(&conf->pending_full_writes); 2896 } else { 2897 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2898 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2899 BUG_ON(level == 6 && 2900 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 2901 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 2902 2903 for (i = disks; i--; ) { 2904 struct r5dev *dev = &sh->dev[i]; 2905 if (i == pd_idx || i == qd_idx) 2906 continue; 2907 2908 if (dev->towrite && 2909 (test_bit(R5_UPTODATE, &dev->flags) || 2910 test_bit(R5_Wantcompute, &dev->flags))) { 2911 set_bit(R5_Wantdrain, &dev->flags); 2912 set_bit(R5_LOCKED, &dev->flags); 2913 clear_bit(R5_UPTODATE, &dev->flags); 2914 s->locked++; 2915 } 2916 } 2917 if (!s->locked) 2918 /* False alarm - nothing to do */ 2919 return; 2920 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2921 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2922 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2923 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2924 } 2925 2926 /* keep the parity disk(s) locked while asynchronous operations 2927 * are in flight 2928 */ 2929 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2930 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2931 s->locked++; 2932 2933 if (level == 6) { 2934 int qd_idx = sh->qd_idx; 2935 struct r5dev *dev = &sh->dev[qd_idx]; 2936 2937 set_bit(R5_LOCKED, &dev->flags); 2938 clear_bit(R5_UPTODATE, &dev->flags); 2939 s->locked++; 2940 } 2941 2942 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2943 __func__, (unsigned long long)sh->sector, 2944 s->locked, s->ops_request); 2945 } 2946 2947 /* 2948 * Each stripe/dev can have one or more bion attached. 2949 * toread/towrite point to the first in a chain. 2950 * The bi_next chain must be in order. 2951 */ 2952 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 2953 int forwrite, int previous) 2954 { 2955 struct bio **bip; 2956 struct r5conf *conf = sh->raid_conf; 2957 int firstwrite=0; 2958 2959 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2960 (unsigned long long)bi->bi_iter.bi_sector, 2961 (unsigned long long)sh->sector); 2962 2963 /* 2964 * If several bio share a stripe. The bio bi_phys_segments acts as a 2965 * reference count to avoid race. The reference count should already be 2966 * increased before this function is called (for example, in 2967 * make_request()), so other bio sharing this stripe will not free the 2968 * stripe. If a stripe is owned by one stripe, the stripe lock will 2969 * protect it. 
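	 * Concretely, as the code below does: each stripe this bio is
	 * attached to takes one reference via raid5_inc_bi_active_stripes(),
	 * and the bio is only completed once raid5_dec_bi_active_stripes()
	 * drops the last of those references (see handle_failed_stripe()
	 * and handle_stripe_clean_event()).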
	 */
	spin_lock_irq(&sh->stripe_lock);
	/* Don't allow new IO added to stripes in batch list */
	if (sh->batch_head)
		goto overlap;
	if (forwrite) {
		bip = &sh->dev[dd_idx].towrite;
		if (*bip == NULL)
			firstwrite = 1;
	} else
		bip = &sh->dev[dd_idx].toread;
	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
			goto overlap;
		bip = & (*bip)->bi_next;
	}
	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
		goto overlap;

	if (!forwrite || previous)
		clear_bit(STRIPE_BATCH_READY, &sh->state);

	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
	if (*bip)
		bi->bi_next = *bip;
	*bip = bi;
	raid5_inc_bi_active_stripes(bi);

	if (forwrite) {
		/* check if page is covered */
		sector_t sector = sh->dev[dd_idx].sector;
		for (bi=sh->dev[dd_idx].towrite;
		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
			     bi && bi->bi_iter.bi_sector <= sector;
		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
			if (bio_end_sector(bi) >= sector)
				sector = bio_end_sector(bi);
		}
		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
			if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
				sh->overwrite_disks++;
	}

	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
		 (unsigned long long)(*bip)->bi_iter.bi_sector,
		 (unsigned long long)sh->sector, dd_idx);

	if (conf->mddev->bitmap && firstwrite) {
		/* Cannot hold spinlock over bitmap_startwrite,
		 * but must ensure this isn't added to a batch until
		 * we have added to the bitmap and set bm_seq.
		 * So set STRIPE_BITMAP_PENDING to prevent
		 * batching.
		 * If multiple add_stripe_bio() calls race here they
		 * must all set STRIPE_BITMAP_PENDING.  So only the first one
		 * to complete "bitmap_startwrite" gets to set
		 * STRIPE_BIT_DELAY.  This is important as once a stripe
		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
		 * any more.
		 */
		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
		spin_unlock_irq(&sh->stripe_lock);
		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
				  STRIPE_SECTORS, 0);
		spin_lock_irq(&sh->stripe_lock);
		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
		if (!sh->batch_head) {
			sh->bm_seq = conf->seq_flush+1;
			set_bit(STRIPE_BIT_DELAY, &sh->state);
		}
	}
	spin_unlock_irq(&sh->stripe_lock);

	if (stripe_can_batch(sh))
		stripe_add_to_batch_list(conf, sh);
	return 1;

 overlap:
	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
	spin_unlock_irq(&sh->stripe_lock);
	return 0;
}

static void end_reshape(struct r5conf *conf);

static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh)
{
	int sectors_per_chunk =
		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
	int dd_idx;
	int chunk_offset = sector_div(stripe, sectors_per_chunk);
	int disks = previous ?
conf->previous_raid_disks : conf->raid_disks; 3063 3064 raid5_compute_sector(conf, 3065 stripe * (disks - conf->max_degraded) 3066 *sectors_per_chunk + chunk_offset, 3067 previous, 3068 &dd_idx, sh); 3069 } 3070 3071 static void 3072 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3073 struct stripe_head_state *s, int disks, 3074 struct bio **return_bi) 3075 { 3076 int i; 3077 BUG_ON(sh->batch_head); 3078 for (i = disks; i--; ) { 3079 struct bio *bi; 3080 int bitmap_end = 0; 3081 3082 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3083 struct md_rdev *rdev; 3084 rcu_read_lock(); 3085 rdev = rcu_dereference(conf->disks[i].rdev); 3086 if (rdev && test_bit(In_sync, &rdev->flags)) 3087 atomic_inc(&rdev->nr_pending); 3088 else 3089 rdev = NULL; 3090 rcu_read_unlock(); 3091 if (rdev) { 3092 if (!rdev_set_badblocks( 3093 rdev, 3094 sh->sector, 3095 STRIPE_SECTORS, 0)) 3096 md_error(conf->mddev, rdev); 3097 rdev_dec_pending(rdev, conf->mddev); 3098 } 3099 } 3100 spin_lock_irq(&sh->stripe_lock); 3101 /* fail all writes first */ 3102 bi = sh->dev[i].towrite; 3103 sh->dev[i].towrite = NULL; 3104 sh->overwrite_disks = 0; 3105 spin_unlock_irq(&sh->stripe_lock); 3106 if (bi) 3107 bitmap_end = 1; 3108 3109 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3110 wake_up(&conf->wait_for_overlap); 3111 3112 while (bi && bi->bi_iter.bi_sector < 3113 sh->dev[i].sector + STRIPE_SECTORS) { 3114 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3115 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3116 if (!raid5_dec_bi_active_stripes(bi)) { 3117 md_write_end(conf->mddev); 3118 bi->bi_next = *return_bi; 3119 *return_bi = bi; 3120 } 3121 bi = nextbi; 3122 } 3123 if (bitmap_end) 3124 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3125 STRIPE_SECTORS, 0, 0); 3126 bitmap_end = 0; 3127 /* and fail all 'written' */ 3128 bi = sh->dev[i].written; 3129 sh->dev[i].written = NULL; 3130 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3131 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3132 sh->dev[i].page = sh->dev[i].orig_page; 3133 } 3134 3135 if (bi) bitmap_end = 1; 3136 while (bi && bi->bi_iter.bi_sector < 3137 sh->dev[i].sector + STRIPE_SECTORS) { 3138 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3139 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3140 if (!raid5_dec_bi_active_stripes(bi)) { 3141 md_write_end(conf->mddev); 3142 bi->bi_next = *return_bi; 3143 *return_bi = bi; 3144 } 3145 bi = bi2; 3146 } 3147 3148 /* fail any reads if this device is non-operational and 3149 * the data has not reached the cache yet. 
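		 * (Reads already being filled from cached data, i.e. flagged
		 * R5_Wantfill, are left alone, as are reads on an Insync
		 * device with no recorded read error; only the remaining
		 * reads are failed here.)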
3150 */ 3151 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3152 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3153 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3154 spin_lock_irq(&sh->stripe_lock); 3155 bi = sh->dev[i].toread; 3156 sh->dev[i].toread = NULL; 3157 spin_unlock_irq(&sh->stripe_lock); 3158 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3159 wake_up(&conf->wait_for_overlap); 3160 while (bi && bi->bi_iter.bi_sector < 3161 sh->dev[i].sector + STRIPE_SECTORS) { 3162 struct bio *nextbi = 3163 r5_next_bio(bi, sh->dev[i].sector); 3164 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3165 if (!raid5_dec_bi_active_stripes(bi)) { 3166 bi->bi_next = *return_bi; 3167 *return_bi = bi; 3168 } 3169 bi = nextbi; 3170 } 3171 } 3172 if (bitmap_end) 3173 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3174 STRIPE_SECTORS, 0, 0); 3175 /* If we were in the middle of a write the parity block might 3176 * still be locked - so just clear all R5_LOCKED flags 3177 */ 3178 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3179 } 3180 3181 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3182 if (atomic_dec_and_test(&conf->pending_full_writes)) 3183 md_wakeup_thread(conf->mddev->thread); 3184 } 3185 3186 static void 3187 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3188 struct stripe_head_state *s) 3189 { 3190 int abort = 0; 3191 int i; 3192 3193 BUG_ON(sh->batch_head); 3194 clear_bit(STRIPE_SYNCING, &sh->state); 3195 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3196 wake_up(&conf->wait_for_overlap); 3197 s->syncing = 0; 3198 s->replacing = 0; 3199 /* There is nothing more to do for sync/check/repair. 3200 * Don't even need to abort as that is handled elsewhere 3201 * if needed, and not always wanted e.g. if there is a known 3202 * bad block here. 3203 * For recover/replace we need to record a bad block on all 3204 * non-sync devices, or abort the recovery 3205 */ 3206 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3207 /* During recovery devices cannot be removed, so 3208 * locking and refcounting of rdevs is not needed 3209 */ 3210 for (i = 0; i < conf->raid_disks; i++) { 3211 struct md_rdev *rdev = conf->disks[i].rdev; 3212 if (rdev 3213 && !test_bit(Faulty, &rdev->flags) 3214 && !test_bit(In_sync, &rdev->flags) 3215 && !rdev_set_badblocks(rdev, sh->sector, 3216 STRIPE_SECTORS, 0)) 3217 abort = 1; 3218 rdev = conf->disks[i].replacement; 3219 if (rdev 3220 && !test_bit(Faulty, &rdev->flags) 3221 && !test_bit(In_sync, &rdev->flags) 3222 && !rdev_set_badblocks(rdev, sh->sector, 3223 STRIPE_SECTORS, 0)) 3224 abort = 1; 3225 } 3226 if (abort) 3227 conf->recovery_disabled = 3228 conf->mddev->recovery_disabled; 3229 } 3230 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3231 } 3232 3233 static int want_replace(struct stripe_head *sh, int disk_idx) 3234 { 3235 struct md_rdev *rdev; 3236 int rv = 0; 3237 /* Doing recovery so rcu locking not required */ 3238 rdev = sh->raid_conf->disks[disk_idx].replacement; 3239 if (rdev 3240 && !test_bit(Faulty, &rdev->flags) 3241 && !test_bit(In_sync, &rdev->flags) 3242 && (rdev->recovery_offset <= sh->sector 3243 || rdev->mddev->recovery_cp <= sh->sector)) 3244 rv = 1; 3245 3246 return rv; 3247 } 3248 3249 /* fetch_block - checks the given member device to see if its data needs 3250 * to be read or computed to satisfy a request. 
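 * (The yes/no decision itself lives in need_this_block() below;
 * fetch_block() then only sets up the compute or marks the block
 * for reading via R5_Wantread.)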
 *
 * Returns 1 when no more member devices need to be checked, otherwise returns
 * 0 to tell the loop in handle_stripe_fill to continue
 */

static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
			   int disk_idx, int disks)
{
	struct r5dev *dev = &sh->dev[disk_idx];
	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
				  &sh->dev[s->failed_num[1]] };
	int i;


	if (test_bit(R5_LOCKED, &dev->flags) ||
	    test_bit(R5_UPTODATE, &dev->flags))
		/* No point reading this as we already have it or have
		 * decided to get it.
		 */
		return 0;

	if (dev->toread ||
	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
		/* We need this block to directly satisfy a request */
		return 1;

	if (s->syncing || s->expanding ||
	    (s->replacing && want_replace(sh, disk_idx)))
		/* When syncing, or expanding we read everything.
		 * When replacing, we need the replaced block.
		 */
		return 1;

	if ((s->failed >= 1 && fdev[0]->toread) ||
	    (s->failed >= 2 && fdev[1]->toread))
		/* If we want to read from a failed device, then
		 * we need to actually read every other device.
		 */
		return 1;

	/* Sometimes neither read-modify-write nor reconstruct-write
	 * cycles can work.  In those cases we read every block we
	 * can.  Then the parity-update is certain to have enough to
	 * work with.
	 * This can only be a problem when we need to write something,
	 * and some device has failed.  If either of those tests
	 * fail we need look no further.
	 */
	if (!s->failed || !s->to_write)
		return 0;

	if (test_bit(R5_Insync, &dev->flags) &&
	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		/* Pre-reads are not permitted until after a short delay
		 * to gather multiple requests.  However if this
		 * device is not Insync, the block could only be computed
		 * and there is no need to delay that.
		 */
		return 0;

	for (i = 0; i < s->failed; i++) {
		if (fdev[i]->towrite &&
		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
			/* If we have a partial write to a failed
			 * device, then we will need to reconstruct
			 * the content of that device, so all other
			 * devices must be read.
			 */
			return 1;
	}

	/* If we are forced to do a reconstruct-write, either because
	 * the current RAID6 implementation only supports that, or
	 * because parity cannot be trusted and we are currently
	 * recovering it, there is extra need to be careful.
	 * If one of the devices that we would need to read, because
	 * it is not being overwritten (and maybe not written at all)
	 * is missing/faulty, then we need to read everything we can.
	 */
	if (sh->raid_conf->level != 6 &&
	    sh->sector < sh->raid_conf->mddev->recovery_cp)
		/* reconstruct-write isn't being forced */
		return 0;
	for (i = 0; i < s->failed; i++) {
		if (s->failed_num[i] != sh->pd_idx &&
		    s->failed_num[i] != sh->qd_idx &&
		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
			return 1;
	}

	return 0;
}

static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
		       int disk_idx, int disks)
{
	struct r5dev *dev = &sh->dev[disk_idx];

	/* is the data in this block needed, and can we get it?
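	 * (Per need_this_block() above: yes when a request needs the block
	 * directly, when syncing/expanding/replacing, or when a failed
	 * device's content will have to be reconstructed; no when the block
	 * is already uptodate or locked, or when a pre-read should still be
	 * delayed.)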
*/ 3352 if (need_this_block(sh, s, disk_idx, disks)) { 3353 /* we would like to get this block, possibly by computing it, 3354 * otherwise read it if the backing disk is insync 3355 */ 3356 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3357 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3358 BUG_ON(sh->batch_head); 3359 if ((s->uptodate == disks - 1) && 3360 (s->failed && (disk_idx == s->failed_num[0] || 3361 disk_idx == s->failed_num[1]))) { 3362 /* have disk failed, and we're requested to fetch it; 3363 * do compute it 3364 */ 3365 pr_debug("Computing stripe %llu block %d\n", 3366 (unsigned long long)sh->sector, disk_idx); 3367 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3368 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3369 set_bit(R5_Wantcompute, &dev->flags); 3370 sh->ops.target = disk_idx; 3371 sh->ops.target2 = -1; /* no 2nd target */ 3372 s->req_compute = 1; 3373 /* Careful: from this point on 'uptodate' is in the eye 3374 * of raid_run_ops which services 'compute' operations 3375 * before writes. R5_Wantcompute flags a block that will 3376 * be R5_UPTODATE by the time it is needed for a 3377 * subsequent operation. 3378 */ 3379 s->uptodate++; 3380 return 1; 3381 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3382 /* Computing 2-failure is *very* expensive; only 3383 * do it if failed >= 2 3384 */ 3385 int other; 3386 for (other = disks; other--; ) { 3387 if (other == disk_idx) 3388 continue; 3389 if (!test_bit(R5_UPTODATE, 3390 &sh->dev[other].flags)) 3391 break; 3392 } 3393 BUG_ON(other < 0); 3394 pr_debug("Computing stripe %llu blocks %d,%d\n", 3395 (unsigned long long)sh->sector, 3396 disk_idx, other); 3397 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3398 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3399 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3400 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3401 sh->ops.target = disk_idx; 3402 sh->ops.target2 = other; 3403 s->uptodate += 2; 3404 s->req_compute = 1; 3405 return 1; 3406 } else if (test_bit(R5_Insync, &dev->flags)) { 3407 set_bit(R5_LOCKED, &dev->flags); 3408 set_bit(R5_Wantread, &dev->flags); 3409 s->locked++; 3410 pr_debug("Reading block %d (sync=%d)\n", 3411 disk_idx, s->syncing); 3412 } 3413 } 3414 3415 return 0; 3416 } 3417 3418 /** 3419 * handle_stripe_fill - read or compute data to satisfy pending requests. 3420 */ 3421 static void handle_stripe_fill(struct stripe_head *sh, 3422 struct stripe_head_state *s, 3423 int disks) 3424 { 3425 int i; 3426 3427 /* look for blocks to read/compute, skip this if a compute 3428 * is already in flight, or if the stripe contents are in the 3429 * midst of changing due to a write 3430 */ 3431 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3432 !sh->reconstruct_state) 3433 for (i = disks; i--; ) 3434 if (fetch_block(sh, s, i, disks)) 3435 break; 3436 set_bit(STRIPE_HANDLE, &sh->state); 3437 } 3438 3439 static void break_stripe_batch_list(struct stripe_head *head_sh, 3440 unsigned long handle_flags); 3441 /* handle_stripe_clean_event 3442 * any written block on an uptodate or failed drive can be returned. 3443 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3444 * never LOCKED, so we don't need to test 'failed' directly. 
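 * "Returned" here means the completed write bios are chained onto
 * *return_bi for the caller to end, and bitmap_endwrite() is told
 * whether the stripe was degraded while the write was in flight.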
3445 */ 3446 static void handle_stripe_clean_event(struct r5conf *conf, 3447 struct stripe_head *sh, int disks, struct bio **return_bi) 3448 { 3449 int i; 3450 struct r5dev *dev; 3451 int discard_pending = 0; 3452 struct stripe_head *head_sh = sh; 3453 bool do_endio = false; 3454 3455 for (i = disks; i--; ) 3456 if (sh->dev[i].written) { 3457 dev = &sh->dev[i]; 3458 if (!test_bit(R5_LOCKED, &dev->flags) && 3459 (test_bit(R5_UPTODATE, &dev->flags) || 3460 test_bit(R5_Discard, &dev->flags) || 3461 test_bit(R5_SkipCopy, &dev->flags))) { 3462 /* We can return any write requests */ 3463 struct bio *wbi, *wbi2; 3464 pr_debug("Return write for disc %d\n", i); 3465 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3466 clear_bit(R5_UPTODATE, &dev->flags); 3467 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3468 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3469 } 3470 do_endio = true; 3471 3472 returnbi: 3473 dev->page = dev->orig_page; 3474 wbi = dev->written; 3475 dev->written = NULL; 3476 while (wbi && wbi->bi_iter.bi_sector < 3477 dev->sector + STRIPE_SECTORS) { 3478 wbi2 = r5_next_bio(wbi, dev->sector); 3479 if (!raid5_dec_bi_active_stripes(wbi)) { 3480 md_write_end(conf->mddev); 3481 wbi->bi_next = *return_bi; 3482 *return_bi = wbi; 3483 } 3484 wbi = wbi2; 3485 } 3486 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3487 STRIPE_SECTORS, 3488 !test_bit(STRIPE_DEGRADED, &sh->state), 3489 0); 3490 if (head_sh->batch_head) { 3491 sh = list_first_entry(&sh->batch_list, 3492 struct stripe_head, 3493 batch_list); 3494 if (sh != head_sh) { 3495 dev = &sh->dev[i]; 3496 goto returnbi; 3497 } 3498 } 3499 sh = head_sh; 3500 dev = &sh->dev[i]; 3501 } else if (test_bit(R5_Discard, &dev->flags)) 3502 discard_pending = 1; 3503 WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); 3504 WARN_ON(dev->page != dev->orig_page); 3505 } 3506 if (!discard_pending && 3507 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3508 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3509 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3510 if (sh->qd_idx >= 0) { 3511 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3512 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3513 } 3514 /* now that discard is done we can proceed with any sync */ 3515 clear_bit(STRIPE_DISCARD, &sh->state); 3516 /* 3517 * SCSI discard will change some bio fields and the stripe has 3518 * no updated data, so remove it from hash list and the stripe 3519 * will be reinitialized 3520 */ 3521 spin_lock_irq(&conf->device_lock); 3522 unhash: 3523 remove_hash(sh); 3524 if (head_sh->batch_head) { 3525 sh = list_first_entry(&sh->batch_list, 3526 struct stripe_head, batch_list); 3527 if (sh != head_sh) 3528 goto unhash; 3529 } 3530 spin_unlock_irq(&conf->device_lock); 3531 sh = head_sh; 3532 3533 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3534 set_bit(STRIPE_HANDLE, &sh->state); 3535 3536 } 3537 3538 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3539 if (atomic_dec_and_test(&conf->pending_full_writes)) 3540 md_wakeup_thread(conf->mddev->thread); 3541 3542 if (head_sh->batch_head && do_endio) 3543 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3544 } 3545 3546 static void handle_stripe_dirtying(struct r5conf *conf, 3547 struct stripe_head *sh, 3548 struct stripe_head_state *s, 3549 int disks) 3550 { 3551 int rmw = 0, rcw = 0, i; 3552 sector_t recovery_cp = conf->mddev->recovery_cp; 3553 3554 /* Check whether resync is now happening or should start. 
3555 * If yes, then the array is dirty (after unclean shutdown or 3556 * initial creation), so parity in some stripes might be inconsistent. 3557 * In this case, we need to always do reconstruct-write, to ensure 3558 * that in case of drive failure or read-error correction, we 3559 * generate correct data from the parity. 3560 */ 3561 if (conf->rmw_level == PARITY_DISABLE_RMW || 3562 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3563 s->failed == 0)) { 3564 /* Calculate the real rcw later - for now make it 3565 * look like rcw is cheaper 3566 */ 3567 rcw = 1; rmw = 2; 3568 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3569 conf->rmw_level, (unsigned long long)recovery_cp, 3570 (unsigned long long)sh->sector); 3571 } else for (i = disks; i--; ) { 3572 /* would I have to read this buffer for read_modify_write */ 3573 struct r5dev *dev = &sh->dev[i]; 3574 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && 3575 !test_bit(R5_LOCKED, &dev->flags) && 3576 !(test_bit(R5_UPTODATE, &dev->flags) || 3577 test_bit(R5_Wantcompute, &dev->flags))) { 3578 if (test_bit(R5_Insync, &dev->flags)) 3579 rmw++; 3580 else 3581 rmw += 2*disks; /* cannot read it */ 3582 } 3583 /* Would I have to read this buffer for reconstruct_write */ 3584 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3585 i != sh->pd_idx && i != sh->qd_idx && 3586 !test_bit(R5_LOCKED, &dev->flags) && 3587 !(test_bit(R5_UPTODATE, &dev->flags) || 3588 test_bit(R5_Wantcompute, &dev->flags))) { 3589 if (test_bit(R5_Insync, &dev->flags)) 3590 rcw++; 3591 else 3592 rcw += 2*disks; 3593 } 3594 } 3595 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 3596 (unsigned long long)sh->sector, rmw, rcw); 3597 set_bit(STRIPE_HANDLE, &sh->state); 3598 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) { 3599 /* prefer read-modify-write, but need to get some data */ 3600 if (conf->mddev->queue) 3601 blk_add_trace_msg(conf->mddev->queue, 3602 "raid5 rmw %llu %d", 3603 (unsigned long long)sh->sector, rmw); 3604 for (i = disks; i--; ) { 3605 struct r5dev *dev = &sh->dev[i]; 3606 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && 3607 !test_bit(R5_LOCKED, &dev->flags) && 3608 !(test_bit(R5_UPTODATE, &dev->flags) || 3609 test_bit(R5_Wantcompute, &dev->flags)) && 3610 test_bit(R5_Insync, &dev->flags)) { 3611 if (test_bit(STRIPE_PREREAD_ACTIVE, 3612 &sh->state)) { 3613 pr_debug("Read_old block %d for r-m-w\n", 3614 i); 3615 set_bit(R5_LOCKED, &dev->flags); 3616 set_bit(R5_Wantread, &dev->flags); 3617 s->locked++; 3618 } else { 3619 set_bit(STRIPE_DELAYED, &sh->state); 3620 set_bit(STRIPE_HANDLE, &sh->state); 3621 } 3622 } 3623 } 3624 } 3625 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) { 3626 /* want reconstruct write, but need to get some data */ 3627 int qread =0; 3628 rcw = 0; 3629 for (i = disks; i--; ) { 3630 struct r5dev *dev = &sh->dev[i]; 3631 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3632 i != sh->pd_idx && i != sh->qd_idx && 3633 !test_bit(R5_LOCKED, &dev->flags) && 3634 !(test_bit(R5_UPTODATE, &dev->flags) || 3635 test_bit(R5_Wantcompute, &dev->flags))) { 3636 rcw++; 3637 if (test_bit(R5_Insync, &dev->flags) && 3638 test_bit(STRIPE_PREREAD_ACTIVE, 3639 &sh->state)) { 3640 pr_debug("Read_old block " 3641 "%d for Reconstruct\n", i); 3642 set_bit(R5_LOCKED, &dev->flags); 3643 set_bit(R5_Wantread, &dev->flags); 3644 s->locked++; 3645 qread++; 3646 } else { 3647 set_bit(STRIPE_DELAYED, &sh->state); 3648 set_bit(STRIPE_HANDLE, &sh->state); 3649 } 
3650 } 3651 } 3652 if (rcw && conf->mddev->queue) 3653 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3654 (unsigned long long)sh->sector, 3655 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3656 } 3657 3658 if (rcw > disks && rmw > disks && 3659 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3660 set_bit(STRIPE_DELAYED, &sh->state); 3661 3662 /* now if nothing is locked, and if we have enough data, 3663 * we can start a write request 3664 */ 3665 /* since handle_stripe can be called at any time we need to handle the 3666 * case where a compute block operation has been submitted and then a 3667 * subsequent call wants to start a write request. raid_run_ops only 3668 * handles the case where compute block and reconstruct are requested 3669 * simultaneously. If this is not the case then new writes need to be 3670 * held off until the compute completes. 3671 */ 3672 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3673 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3674 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3675 schedule_reconstruction(sh, s, rcw == 0, 0); 3676 } 3677 3678 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3679 struct stripe_head_state *s, int disks) 3680 { 3681 struct r5dev *dev = NULL; 3682 3683 BUG_ON(sh->batch_head); 3684 set_bit(STRIPE_HANDLE, &sh->state); 3685 3686 switch (sh->check_state) { 3687 case check_state_idle: 3688 /* start a new check operation if there are no failures */ 3689 if (s->failed == 0) { 3690 BUG_ON(s->uptodate != disks); 3691 sh->check_state = check_state_run; 3692 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3693 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3694 s->uptodate--; 3695 break; 3696 } 3697 dev = &sh->dev[s->failed_num[0]]; 3698 /* fall through */ 3699 case check_state_compute_result: 3700 sh->check_state = check_state_idle; 3701 if (!dev) 3702 dev = &sh->dev[sh->pd_idx]; 3703 3704 /* check that a write has not made the stripe insync */ 3705 if (test_bit(STRIPE_INSYNC, &sh->state)) 3706 break; 3707 3708 /* either failed parity check, or recovery is happening */ 3709 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3710 BUG_ON(s->uptodate != disks); 3711 3712 set_bit(R5_LOCKED, &dev->flags); 3713 s->locked++; 3714 set_bit(R5_Wantwrite, &dev->flags); 3715 3716 clear_bit(STRIPE_DEGRADED, &sh->state); 3717 set_bit(STRIPE_INSYNC, &sh->state); 3718 break; 3719 case check_state_run: 3720 break; /* we will be called again upon completion */ 3721 case check_state_check_result: 3722 sh->check_state = check_state_idle; 3723 3724 /* if a failure occurred during the check operation, leave 3725 * STRIPE_INSYNC not set and let the stripe be handled again 3726 */ 3727 if (s->failed) 3728 break; 3729 3730 /* handle a successful check operation, if parity is correct 3731 * we are done. Otherwise update the mismatch count and repair 3732 * parity if !MD_RECOVERY_CHECK 3733 */ 3734 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 3735 /* parity is correct (on disc, 3736 * not in buffer any more) 3737 */ 3738 set_bit(STRIPE_INSYNC, &sh->state); 3739 else { 3740 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3741 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3742 /* don't try to repair!! 
*/ 3743 set_bit(STRIPE_INSYNC, &sh->state); 3744 else { 3745 sh->check_state = check_state_compute_run; 3746 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3747 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3748 set_bit(R5_Wantcompute, 3749 &sh->dev[sh->pd_idx].flags); 3750 sh->ops.target = sh->pd_idx; 3751 sh->ops.target2 = -1; 3752 s->uptodate++; 3753 } 3754 } 3755 break; 3756 case check_state_compute_run: 3757 break; 3758 default: 3759 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3760 __func__, sh->check_state, 3761 (unsigned long long) sh->sector); 3762 BUG(); 3763 } 3764 } 3765 3766 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3767 struct stripe_head_state *s, 3768 int disks) 3769 { 3770 int pd_idx = sh->pd_idx; 3771 int qd_idx = sh->qd_idx; 3772 struct r5dev *dev; 3773 3774 BUG_ON(sh->batch_head); 3775 set_bit(STRIPE_HANDLE, &sh->state); 3776 3777 BUG_ON(s->failed > 2); 3778 3779 /* Want to check and possibly repair P and Q. 3780 * However there could be one 'failed' device, in which 3781 * case we can only check one of them, possibly using the 3782 * other to generate missing data 3783 */ 3784 3785 switch (sh->check_state) { 3786 case check_state_idle: 3787 /* start a new check operation if there are < 2 failures */ 3788 if (s->failed == s->q_failed) { 3789 /* The only possible failed device holds Q, so it 3790 * makes sense to check P (If anything else were failed, 3791 * we would have used P to recreate it). 3792 */ 3793 sh->check_state = check_state_run; 3794 } 3795 if (!s->q_failed && s->failed < 2) { 3796 /* Q is not failed, and we didn't use it to generate 3797 * anything, so it makes sense to check it 3798 */ 3799 if (sh->check_state == check_state_run) 3800 sh->check_state = check_state_run_pq; 3801 else 3802 sh->check_state = check_state_run_q; 3803 } 3804 3805 /* discard potentially stale zero_sum_result */ 3806 sh->ops.zero_sum_result = 0; 3807 3808 if (sh->check_state == check_state_run) { 3809 /* async_xor_zero_sum destroys the contents of P */ 3810 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3811 s->uptodate--; 3812 } 3813 if (sh->check_state >= check_state_run && 3814 sh->check_state <= check_state_run_pq) { 3815 /* async_syndrome_zero_sum preserves P and Q, so 3816 * no need to mark them !uptodate here 3817 */ 3818 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3819 break; 3820 } 3821 3822 /* we have 2-disk failure */ 3823 BUG_ON(s->failed != 2); 3824 /* fall through */ 3825 case check_state_compute_result: 3826 sh->check_state = check_state_idle; 3827 3828 /* check that a write has not made the stripe insync */ 3829 if (test_bit(STRIPE_INSYNC, &sh->state)) 3830 break; 3831 3832 /* now write out any block on a failed drive, 3833 * or P or Q if they were recomputed 3834 */ 3835 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3836 if (s->failed == 2) { 3837 dev = &sh->dev[s->failed_num[1]]; 3838 s->locked++; 3839 set_bit(R5_LOCKED, &dev->flags); 3840 set_bit(R5_Wantwrite, &dev->flags); 3841 } 3842 if (s->failed >= 1) { 3843 dev = &sh->dev[s->failed_num[0]]; 3844 s->locked++; 3845 set_bit(R5_LOCKED, &dev->flags); 3846 set_bit(R5_Wantwrite, &dev->flags); 3847 } 3848 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3849 dev = &sh->dev[pd_idx]; 3850 s->locked++; 3851 set_bit(R5_LOCKED, &dev->flags); 3852 set_bit(R5_Wantwrite, &dev->flags); 3853 } 3854 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3855 dev = &sh->dev[qd_idx]; 3856 s->locked++; 3857 set_bit(R5_LOCKED, &dev->flags); 3858 
set_bit(R5_Wantwrite, &dev->flags); 3859 } 3860 clear_bit(STRIPE_DEGRADED, &sh->state); 3861 3862 set_bit(STRIPE_INSYNC, &sh->state); 3863 break; 3864 case check_state_run: 3865 case check_state_run_q: 3866 case check_state_run_pq: 3867 break; /* we will be called again upon completion */ 3868 case check_state_check_result: 3869 sh->check_state = check_state_idle; 3870 3871 /* handle a successful check operation, if parity is correct 3872 * we are done. Otherwise update the mismatch count and repair 3873 * parity if !MD_RECOVERY_CHECK 3874 */ 3875 if (sh->ops.zero_sum_result == 0) { 3876 /* both parities are correct */ 3877 if (!s->failed) 3878 set_bit(STRIPE_INSYNC, &sh->state); 3879 else { 3880 /* in contrast to the raid5 case we can validate 3881 * parity, but still have a failure to write 3882 * back 3883 */ 3884 sh->check_state = check_state_compute_result; 3885 /* Returning at this point means that we may go 3886 * off and bring p and/or q uptodate again so 3887 * we make sure to check zero_sum_result again 3888 * to verify if p or q need writeback 3889 */ 3890 } 3891 } else { 3892 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3893 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3894 /* don't try to repair!! */ 3895 set_bit(STRIPE_INSYNC, &sh->state); 3896 else { 3897 int *target = &sh->ops.target; 3898 3899 sh->ops.target = -1; 3900 sh->ops.target2 = -1; 3901 sh->check_state = check_state_compute_run; 3902 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3903 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3904 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3905 set_bit(R5_Wantcompute, 3906 &sh->dev[pd_idx].flags); 3907 *target = pd_idx; 3908 target = &sh->ops.target2; 3909 s->uptodate++; 3910 } 3911 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3912 set_bit(R5_Wantcompute, 3913 &sh->dev[qd_idx].flags); 3914 *target = qd_idx; 3915 s->uptodate++; 3916 } 3917 } 3918 } 3919 break; 3920 case check_state_compute_run: 3921 break; 3922 default: 3923 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 3924 __func__, sh->check_state, 3925 (unsigned long long) sh->sector); 3926 BUG(); 3927 } 3928 } 3929 3930 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 3931 { 3932 int i; 3933 3934 /* We have read all the blocks in this stripe and now we need to 3935 * copy some of them into a target stripe for expand. 3936 */ 3937 struct dma_async_tx_descriptor *tx = NULL; 3938 BUG_ON(sh->batch_head); 3939 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3940 for (i = 0; i < sh->disks; i++) 3941 if (i != sh->pd_idx && i != sh->qd_idx) { 3942 int dd_idx, j; 3943 struct stripe_head *sh2; 3944 struct async_submit_ctl submit; 3945 3946 sector_t bn = compute_blocknr(sh, i, 1); 3947 sector_t s = raid5_compute_sector(conf, bn, 0, 3948 &dd_idx, NULL); 3949 sh2 = get_active_stripe(conf, s, 0, 1, 1); 3950 if (sh2 == NULL) 3951 /* so far only the early blocks of this stripe 3952 * have been requested. 
When later blocks 3953 * get requested, we will try again 3954 */ 3955 continue; 3956 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 3957 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 3958 /* must have already done this block */ 3959 release_stripe(sh2); 3960 continue; 3961 } 3962 3963 /* place all the copies on one channel */ 3964 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 3965 tx = async_memcpy(sh2->dev[dd_idx].page, 3966 sh->dev[i].page, 0, 0, STRIPE_SIZE, 3967 &submit); 3968 3969 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 3970 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 3971 for (j = 0; j < conf->raid_disks; j++) 3972 if (j != sh2->pd_idx && 3973 j != sh2->qd_idx && 3974 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 3975 break; 3976 if (j == conf->raid_disks) { 3977 set_bit(STRIPE_EXPAND_READY, &sh2->state); 3978 set_bit(STRIPE_HANDLE, &sh2->state); 3979 } 3980 release_stripe(sh2); 3981 3982 } 3983 /* done submitting copies, wait for them to complete */ 3984 async_tx_quiesce(&tx); 3985 } 3986 3987 /* 3988 * handle_stripe - do things to a stripe. 3989 * 3990 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 3991 * state of various bits to see what needs to be done. 3992 * Possible results: 3993 * return some read requests which now have data 3994 * return some write requests which are safely on storage 3995 * schedule a read on some buffers 3996 * schedule a write of some buffers 3997 * return confirmation of parity correctness 3998 * 3999 */ 4000 4001 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4002 { 4003 struct r5conf *conf = sh->raid_conf; 4004 int disks = sh->disks; 4005 struct r5dev *dev; 4006 int i; 4007 int do_recovery = 0; 4008 4009 memset(s, 0, sizeof(*s)); 4010 4011 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4012 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4013 s->failed_num[0] = -1; 4014 s->failed_num[1] = -1; 4015 4016 /* Now to look around and see what can be done */ 4017 rcu_read_lock(); 4018 for (i=disks; i--; ) { 4019 struct md_rdev *rdev; 4020 sector_t first_bad; 4021 int bad_sectors; 4022 int is_bad = 0; 4023 4024 dev = &sh->dev[i]; 4025 4026 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4027 i, dev->flags, 4028 dev->toread, dev->towrite, dev->written); 4029 /* maybe we can reply to a read 4030 * 4031 * new wantfill requests are only permitted while 4032 * ops_complete_biofill is guaranteed to be inactive 4033 */ 4034 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4035 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4036 set_bit(R5_Wantfill, &dev->flags); 4037 4038 /* now count some things */ 4039 if (test_bit(R5_LOCKED, &dev->flags)) 4040 s->locked++; 4041 if (test_bit(R5_UPTODATE, &dev->flags)) 4042 s->uptodate++; 4043 if (test_bit(R5_Wantcompute, &dev->flags)) { 4044 s->compute++; 4045 BUG_ON(s->compute > 2); 4046 } 4047 4048 if (test_bit(R5_Wantfill, &dev->flags)) 4049 s->to_fill++; 4050 else if (dev->toread) 4051 s->to_read++; 4052 if (dev->towrite) { 4053 s->to_write++; 4054 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4055 s->non_overwrite++; 4056 } 4057 if (dev->written) 4058 s->written++; 4059 /* Prefer to use the replacement for reads, but only 4060 * if it is recovered enough and has no bad blocks. 
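 * A replacement is only used for the read when it has recovered past
 * this stripe (recovery_offset covers sh->sector + STRIPE_SECTORS) and
 * is_badblock() reports nothing in that range; otherwise we fall back
 * to the main rdev and, if a working replacement exists, set
 * R5_NeedReplace so the data gets written out to it later.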
4061 */ 4062 rdev = rcu_dereference(conf->disks[i].replacement); 4063 if (rdev && !test_bit(Faulty, &rdev->flags) && 4064 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4065 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4066 &first_bad, &bad_sectors)) 4067 set_bit(R5_ReadRepl, &dev->flags); 4068 else { 4069 if (rdev && !test_bit(Faulty, &rdev->flags)) 4070 set_bit(R5_NeedReplace, &dev->flags); 4071 else 4072 clear_bit(R5_NeedReplace, &dev->flags); 4073 rdev = rcu_dereference(conf->disks[i].rdev); 4074 clear_bit(R5_ReadRepl, &dev->flags); 4075 } 4076 if (rdev && test_bit(Faulty, &rdev->flags)) 4077 rdev = NULL; 4078 if (rdev) { 4079 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4080 &first_bad, &bad_sectors); 4081 if (s->blocked_rdev == NULL 4082 && (test_bit(Blocked, &rdev->flags) 4083 || is_bad < 0)) { 4084 if (is_bad < 0) 4085 set_bit(BlockedBadBlocks, 4086 &rdev->flags); 4087 s->blocked_rdev = rdev; 4088 atomic_inc(&rdev->nr_pending); 4089 } 4090 } 4091 clear_bit(R5_Insync, &dev->flags); 4092 if (!rdev) 4093 /* Not in-sync */; 4094 else if (is_bad) { 4095 /* also not in-sync */ 4096 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4097 test_bit(R5_UPTODATE, &dev->flags)) { 4098 /* treat as in-sync, but with a read error 4099 * which we can now try to correct 4100 */ 4101 set_bit(R5_Insync, &dev->flags); 4102 set_bit(R5_ReadError, &dev->flags); 4103 } 4104 } else if (test_bit(In_sync, &rdev->flags)) 4105 set_bit(R5_Insync, &dev->flags); 4106 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4107 /* in sync if before recovery_offset */ 4108 set_bit(R5_Insync, &dev->flags); 4109 else if (test_bit(R5_UPTODATE, &dev->flags) && 4110 test_bit(R5_Expanded, &dev->flags)) 4111 /* If we've reshaped into here, we assume it is Insync. 4112 * We will shortly update recovery_offset to make 4113 * it official. 
4114 */ 4115 set_bit(R5_Insync, &dev->flags); 4116 4117 if (test_bit(R5_WriteError, &dev->flags)) { 4118 /* This flag does not apply to '.replacement' 4119 * only to .rdev, so make sure to check that*/ 4120 struct md_rdev *rdev2 = rcu_dereference( 4121 conf->disks[i].rdev); 4122 if (rdev2 == rdev) 4123 clear_bit(R5_Insync, &dev->flags); 4124 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4125 s->handle_bad_blocks = 1; 4126 atomic_inc(&rdev2->nr_pending); 4127 } else 4128 clear_bit(R5_WriteError, &dev->flags); 4129 } 4130 if (test_bit(R5_MadeGood, &dev->flags)) { 4131 /* This flag does not apply to '.replacement' 4132 * only to .rdev, so make sure to check that*/ 4133 struct md_rdev *rdev2 = rcu_dereference( 4134 conf->disks[i].rdev); 4135 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4136 s->handle_bad_blocks = 1; 4137 atomic_inc(&rdev2->nr_pending); 4138 } else 4139 clear_bit(R5_MadeGood, &dev->flags); 4140 } 4141 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4142 struct md_rdev *rdev2 = rcu_dereference( 4143 conf->disks[i].replacement); 4144 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4145 s->handle_bad_blocks = 1; 4146 atomic_inc(&rdev2->nr_pending); 4147 } else 4148 clear_bit(R5_MadeGoodRepl, &dev->flags); 4149 } 4150 if (!test_bit(R5_Insync, &dev->flags)) { 4151 /* The ReadError flag will just be confusing now */ 4152 clear_bit(R5_ReadError, &dev->flags); 4153 clear_bit(R5_ReWrite, &dev->flags); 4154 } 4155 if (test_bit(R5_ReadError, &dev->flags)) 4156 clear_bit(R5_Insync, &dev->flags); 4157 if (!test_bit(R5_Insync, &dev->flags)) { 4158 if (s->failed < 2) 4159 s->failed_num[s->failed] = i; 4160 s->failed++; 4161 if (rdev && !test_bit(Faulty, &rdev->flags)) 4162 do_recovery = 1; 4163 } 4164 } 4165 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4166 /* If there is a failed device being replaced, 4167 * we must be recovering. 4168 * else if we are after recovery_cp, we must be syncing 4169 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4170 * else we can only be replacing 4171 * sync and recovery both need to read all devices, and so 4172 * use the same flag. 4173 */ 4174 if (do_recovery || 4175 sh->sector >= conf->mddev->recovery_cp || 4176 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4177 s->syncing = 1; 4178 else 4179 s->replacing = 1; 4180 } 4181 rcu_read_unlock(); 4182 } 4183 4184 static int clear_batch_ready(struct stripe_head *sh) 4185 { 4186 /* Return '1' if this is a member of batch, or 4187 * '0' if it is a lone stripe or a head which can now be 4188 * handled. 4189 */ 4190 struct stripe_head *tmp; 4191 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4192 return (sh->batch_head && sh->batch_head != sh); 4193 spin_lock(&sh->stripe_lock); 4194 if (!sh->batch_head) { 4195 spin_unlock(&sh->stripe_lock); 4196 return 0; 4197 } 4198 4199 /* 4200 * this stripe could be added to a batch list before we check 4201 * BATCH_READY, skips it 4202 */ 4203 if (sh->batch_head != sh) { 4204 spin_unlock(&sh->stripe_lock); 4205 return 1; 4206 } 4207 spin_lock(&sh->batch_lock); 4208 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4209 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4210 spin_unlock(&sh->batch_lock); 4211 spin_unlock(&sh->stripe_lock); 4212 4213 /* 4214 * BATCH_READY is cleared, no new stripes can be added. 
4215 * batch_list can be accessed without lock 4216 */ 4217 return 0; 4218 } 4219 4220 static void break_stripe_batch_list(struct stripe_head *head_sh, 4221 unsigned long handle_flags) 4222 { 4223 struct stripe_head *sh, *next; 4224 int i; 4225 int do_wakeup = 0; 4226 4227 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4228 4229 list_del_init(&sh->batch_list); 4230 4231 WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4232 (1 << STRIPE_SYNCING) | 4233 (1 << STRIPE_REPLACED) | 4234 (1 << STRIPE_PREREAD_ACTIVE) | 4235 (1 << STRIPE_DELAYED) | 4236 (1 << STRIPE_BIT_DELAY) | 4237 (1 << STRIPE_FULL_WRITE) | 4238 (1 << STRIPE_BIOFILL_RUN) | 4239 (1 << STRIPE_COMPUTE_RUN) | 4240 (1 << STRIPE_OPS_REQ_PENDING) | 4241 (1 << STRIPE_DISCARD) | 4242 (1 << STRIPE_BATCH_READY) | 4243 (1 << STRIPE_BATCH_ERR) | 4244 (1 << STRIPE_BITMAP_PENDING))); 4245 WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4246 (1 << STRIPE_REPLACED))); 4247 4248 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4249 (1 << STRIPE_DEGRADED)), 4250 head_sh->state & (1 << STRIPE_INSYNC)); 4251 4252 sh->check_state = head_sh->check_state; 4253 sh->reconstruct_state = head_sh->reconstruct_state; 4254 for (i = 0; i < sh->disks; i++) { 4255 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4256 do_wakeup = 1; 4257 sh->dev[i].flags = head_sh->dev[i].flags & 4258 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4259 } 4260 spin_lock_irq(&sh->stripe_lock); 4261 sh->batch_head = NULL; 4262 spin_unlock_irq(&sh->stripe_lock); 4263 if (handle_flags == 0 || 4264 sh->state & handle_flags) 4265 set_bit(STRIPE_HANDLE, &sh->state); 4266 release_stripe(sh); 4267 } 4268 spin_lock_irq(&head_sh->stripe_lock); 4269 head_sh->batch_head = NULL; 4270 spin_unlock_irq(&head_sh->stripe_lock); 4271 for (i = 0; i < head_sh->disks; i++) 4272 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4273 do_wakeup = 1; 4274 if (head_sh->state & handle_flags) 4275 set_bit(STRIPE_HANDLE, &head_sh->state); 4276 4277 if (do_wakeup) 4278 wake_up(&head_sh->raid_conf->wait_for_overlap); 4279 } 4280 4281 static void handle_stripe(struct stripe_head *sh) 4282 { 4283 struct stripe_head_state s; 4284 struct r5conf *conf = sh->raid_conf; 4285 int i; 4286 int prexor; 4287 int disks = sh->disks; 4288 struct r5dev *pdev, *qdev; 4289 4290 clear_bit(STRIPE_HANDLE, &sh->state); 4291 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4292 /* already being handled, ensure it gets handled 4293 * again when current action finishes */ 4294 set_bit(STRIPE_HANDLE, &sh->state); 4295 return; 4296 } 4297 4298 if (clear_batch_ready(sh) ) { 4299 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4300 return; 4301 } 4302 4303 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4304 break_stripe_batch_list(sh, 0); 4305 4306 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4307 spin_lock(&sh->stripe_lock); 4308 /* Cannot process 'sync' concurrently with 'discard' */ 4309 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4310 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4311 set_bit(STRIPE_SYNCING, &sh->state); 4312 clear_bit(STRIPE_INSYNC, &sh->state); 4313 clear_bit(STRIPE_REPLACED, &sh->state); 4314 } 4315 spin_unlock(&sh->stripe_lock); 4316 } 4317 clear_bit(STRIPE_DELAYED, &sh->state); 4318 4319 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4320 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 4321 (unsigned long long)sh->sector, sh->state, 4322 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4323 
sh->check_state, sh->reconstruct_state); 4324 4325 analyse_stripe(sh, &s); 4326 4327 if (s.handle_bad_blocks) { 4328 set_bit(STRIPE_HANDLE, &sh->state); 4329 goto finish; 4330 } 4331 4332 if (unlikely(s.blocked_rdev)) { 4333 if (s.syncing || s.expanding || s.expanded || 4334 s.replacing || s.to_write || s.written) { 4335 set_bit(STRIPE_HANDLE, &sh->state); 4336 goto finish; 4337 } 4338 /* There is nothing for the blocked_rdev to block */ 4339 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4340 s.blocked_rdev = NULL; 4341 } 4342 4343 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4344 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4345 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4346 } 4347 4348 pr_debug("locked=%d uptodate=%d to_read=%d" 4349 " to_write=%d failed=%d failed_num=%d,%d\n", 4350 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4351 s.failed_num[0], s.failed_num[1]); 4352 /* check if the array has lost more than max_degraded devices and, 4353 * if so, some requests might need to be failed. 4354 */ 4355 if (s.failed > conf->max_degraded) { 4356 sh->check_state = 0; 4357 sh->reconstruct_state = 0; 4358 break_stripe_batch_list(sh, 0); 4359 if (s.to_read+s.to_write+s.written) 4360 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 4361 if (s.syncing + s.replacing) 4362 handle_failed_sync(conf, sh, &s); 4363 } 4364 4365 /* Now we check to see if any write operations have recently 4366 * completed 4367 */ 4368 prexor = 0; 4369 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4370 prexor = 1; 4371 if (sh->reconstruct_state == reconstruct_state_drain_result || 4372 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4373 sh->reconstruct_state = reconstruct_state_idle; 4374 4375 /* All the 'written' buffers and the parity block are ready to 4376 * be written back to disk 4377 */ 4378 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4379 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4380 BUG_ON(sh->qd_idx >= 0 && 4381 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4382 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4383 for (i = disks; i--; ) { 4384 struct r5dev *dev = &sh->dev[i]; 4385 if (test_bit(R5_LOCKED, &dev->flags) && 4386 (i == sh->pd_idx || i == sh->qd_idx || 4387 dev->written)) { 4388 pr_debug("Writing block %d\n", i); 4389 set_bit(R5_Wantwrite, &dev->flags); 4390 if (prexor) 4391 continue; 4392 if (s.failed > 1) 4393 continue; 4394 if (!test_bit(R5_Insync, &dev->flags) || 4395 ((i == sh->pd_idx || i == sh->qd_idx) && 4396 s.failed == 0)) 4397 set_bit(STRIPE_INSYNC, &sh->state); 4398 } 4399 } 4400 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4401 s.dec_preread_active = 1; 4402 } 4403 4404 /* 4405 * might be able to return some write requests if the parity blocks 4406 * are safe, or on a failed drive 4407 */ 4408 pdev = &sh->dev[sh->pd_idx]; 4409 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4410 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4411 qdev = &sh->dev[sh->qd_idx]; 4412 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4413 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4414 || conf->level < 6; 4415 4416 if (s.written && 4417 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4418 && !test_bit(R5_LOCKED, &pdev->flags) 4419 && (test_bit(R5_UPTODATE, &pdev->flags) || 4420 test_bit(R5_Discard, &pdev->flags))))) && 4421 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4422 && !test_bit(R5_LOCKED, &qdev->flags) 4423 && 
(test_bit(R5_UPTODATE, &qdev->flags) || 4424 test_bit(R5_Discard, &qdev->flags)))))) 4425 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 4426 4427 /* Now we might consider reading some blocks, either to check/generate 4428 * parity, or to satisfy requests 4429 * or to load a block that is being partially written. 4430 */ 4431 if (s.to_read || s.non_overwrite 4432 || (conf->level == 6 && s.to_write && s.failed) 4433 || (s.syncing && (s.uptodate + s.compute < disks)) 4434 || s.replacing 4435 || s.expanding) 4436 handle_stripe_fill(sh, &s, disks); 4437 4438 /* Now to consider new write requests and what else, if anything 4439 * should be read. We do not handle new writes when: 4440 * 1/ A 'write' operation (copy+xor) is already in flight. 4441 * 2/ A 'check' operation is in flight, as it may clobber the parity 4442 * block. 4443 */ 4444 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 4445 handle_stripe_dirtying(conf, sh, &s, disks); 4446 4447 /* maybe we need to check and possibly fix the parity for this stripe 4448 * Any reads will already have been scheduled, so we just see if enough 4449 * data is available. The parity check is held off while parity 4450 * dependent operations are in flight. 4451 */ 4452 if (sh->check_state || 4453 (s.syncing && s.locked == 0 && 4454 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4455 !test_bit(STRIPE_INSYNC, &sh->state))) { 4456 if (conf->level == 6) 4457 handle_parity_checks6(conf, sh, &s, disks); 4458 else 4459 handle_parity_checks5(conf, sh, &s, disks); 4460 } 4461 4462 if ((s.replacing || s.syncing) && s.locked == 0 4463 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4464 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4465 /* Write out to replacement devices where possible */ 4466 for (i = 0; i < conf->raid_disks; i++) 4467 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4468 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4469 set_bit(R5_WantReplace, &sh->dev[i].flags); 4470 set_bit(R5_LOCKED, &sh->dev[i].flags); 4471 s.locked++; 4472 } 4473 if (s.replacing) 4474 set_bit(STRIPE_INSYNC, &sh->state); 4475 set_bit(STRIPE_REPLACED, &sh->state); 4476 } 4477 if ((s.syncing || s.replacing) && s.locked == 0 && 4478 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4479 test_bit(STRIPE_INSYNC, &sh->state)) { 4480 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4481 clear_bit(STRIPE_SYNCING, &sh->state); 4482 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4483 wake_up(&conf->wait_for_overlap); 4484 } 4485 4486 /* If the failed drives are just a ReadError, then we might need 4487 * to progress the repair/check process 4488 */ 4489 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4490 for (i = 0; i < s.failed; i++) { 4491 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4492 if (test_bit(R5_ReadError, &dev->flags) 4493 && !test_bit(R5_LOCKED, &dev->flags) 4494 && test_bit(R5_UPTODATE, &dev->flags) 4495 ) { 4496 if (!test_bit(R5_ReWrite, &dev->flags)) { 4497 set_bit(R5_Wantwrite, &dev->flags); 4498 set_bit(R5_ReWrite, &dev->flags); 4499 set_bit(R5_LOCKED, &dev->flags); 4500 s.locked++; 4501 } else { 4502 /* let's read it back */ 4503 set_bit(R5_Wantread, &dev->flags); 4504 set_bit(R5_LOCKED, &dev->flags); 4505 s.locked++; 4506 } 4507 } 4508 } 4509 4510 /* Finish reconstruct operations initiated by the expansion process */ 4511 if (sh->reconstruct_state == reconstruct_state_result) { 4512 struct stripe_head *sh_src 4513 = get_active_stripe(conf, sh->sector, 1, 1, 1); 4514 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, 
&sh_src->state)) { 4515 /* sh cannot be written until sh_src has been read. 4516 * so arrange for sh to be delayed a little 4517 */ 4518 set_bit(STRIPE_DELAYED, &sh->state); 4519 set_bit(STRIPE_HANDLE, &sh->state); 4520 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4521 &sh_src->state)) 4522 atomic_inc(&conf->preread_active_stripes); 4523 release_stripe(sh_src); 4524 goto finish; 4525 } 4526 if (sh_src) 4527 release_stripe(sh_src); 4528 4529 sh->reconstruct_state = reconstruct_state_idle; 4530 clear_bit(STRIPE_EXPANDING, &sh->state); 4531 for (i = conf->raid_disks; i--; ) { 4532 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4533 set_bit(R5_LOCKED, &sh->dev[i].flags); 4534 s.locked++; 4535 } 4536 } 4537 4538 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4539 !sh->reconstruct_state) { 4540 /* Need to write out all blocks after computing parity */ 4541 sh->disks = conf->raid_disks; 4542 stripe_set_idx(sh->sector, conf, 0, sh); 4543 schedule_reconstruction(sh, &s, 1, 1); 4544 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4545 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4546 atomic_dec(&conf->reshape_stripes); 4547 wake_up(&conf->wait_for_overlap); 4548 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4549 } 4550 4551 if (s.expanding && s.locked == 0 && 4552 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4553 handle_stripe_expansion(conf, sh); 4554 4555 finish: 4556 /* wait for this device to become unblocked */ 4557 if (unlikely(s.blocked_rdev)) { 4558 if (conf->mddev->external) 4559 md_wait_for_blocked_rdev(s.blocked_rdev, 4560 conf->mddev); 4561 else 4562 /* Internal metadata will immediately 4563 * be written by raid5d, so we don't 4564 * need to wait here. 4565 */ 4566 rdev_dec_pending(s.blocked_rdev, 4567 conf->mddev); 4568 } 4569 4570 if (s.handle_bad_blocks) 4571 for (i = disks; i--; ) { 4572 struct md_rdev *rdev; 4573 struct r5dev *dev = &sh->dev[i]; 4574 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4575 /* We own a safe reference to the rdev */ 4576 rdev = conf->disks[i].rdev; 4577 if (!rdev_set_badblocks(rdev, sh->sector, 4578 STRIPE_SECTORS, 0)) 4579 md_error(conf->mddev, rdev); 4580 rdev_dec_pending(rdev, conf->mddev); 4581 } 4582 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4583 rdev = conf->disks[i].rdev; 4584 rdev_clear_badblocks(rdev, sh->sector, 4585 STRIPE_SECTORS, 0); 4586 rdev_dec_pending(rdev, conf->mddev); 4587 } 4588 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 4589 rdev = conf->disks[i].replacement; 4590 if (!rdev) 4591 /* rdev have been moved down */ 4592 rdev = conf->disks[i].rdev; 4593 rdev_clear_badblocks(rdev, sh->sector, 4594 STRIPE_SECTORS, 0); 4595 rdev_dec_pending(rdev, conf->mddev); 4596 } 4597 } 4598 4599 if (s.ops_request) 4600 raid_run_ops(sh, s.ops_request); 4601 4602 ops_run_io(sh, &s); 4603 4604 if (s.dec_preread_active) { 4605 /* We delay this until after ops_run_io so that if make_request 4606 * is waiting on a flush, it won't continue until the writes 4607 * have actually been submitted. 
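 * Dropping below IO_THRESHOLD wakes the md thread, which gives
 * raid5d a chance to activate any stripes that were delayed while
 * too many preread stripes were active.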
4608 */ 4609 atomic_dec(&conf->preread_active_stripes); 4610 if (atomic_read(&conf->preread_active_stripes) < 4611 IO_THRESHOLD) 4612 md_wakeup_thread(conf->mddev->thread); 4613 } 4614 4615 return_io(s.return_bi); 4616 4617 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4618 } 4619 4620 static void raid5_activate_delayed(struct r5conf *conf) 4621 { 4622 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 4623 while (!list_empty(&conf->delayed_list)) { 4624 struct list_head *l = conf->delayed_list.next; 4625 struct stripe_head *sh; 4626 sh = list_entry(l, struct stripe_head, lru); 4627 list_del_init(l); 4628 clear_bit(STRIPE_DELAYED, &sh->state); 4629 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4630 atomic_inc(&conf->preread_active_stripes); 4631 list_add_tail(&sh->lru, &conf->hold_list); 4632 raid5_wakeup_stripe_thread(sh); 4633 } 4634 } 4635 } 4636 4637 static void activate_bit_delay(struct r5conf *conf, 4638 struct list_head *temp_inactive_list) 4639 { 4640 /* device_lock is held */ 4641 struct list_head head; 4642 list_add(&head, &conf->bitmap_list); 4643 list_del_init(&conf->bitmap_list); 4644 while (!list_empty(&head)) { 4645 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4646 int hash; 4647 list_del_init(&sh->lru); 4648 atomic_inc(&sh->count); 4649 hash = sh->hash_lock_index; 4650 __release_stripe(conf, sh, &temp_inactive_list[hash]); 4651 } 4652 } 4653 4654 static int raid5_congested(struct mddev *mddev, int bits) 4655 { 4656 struct r5conf *conf = mddev->private; 4657 4658 /* No difference between reads and writes. Just check 4659 * how busy the stripe_cache is 4660 */ 4661 4662 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 4663 return 1; 4664 if (conf->quiesce) 4665 return 1; 4666 if (atomic_read(&conf->empty_inactive_list_nr)) 4667 return 1; 4668 4669 return 0; 4670 } 4671 4672 /* We want read requests to align with chunks where possible, 4673 * but write requests don't need to. 4674 */ 4675 static int raid5_mergeable_bvec(struct mddev *mddev, 4676 struct bvec_merge_data *bvm, 4677 struct bio_vec *biovec) 4678 { 4679 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 4680 int max; 4681 unsigned int chunk_sectors = mddev->chunk_sectors; 4682 unsigned int bio_sectors = bvm->bi_size >> 9; 4683 4684 /* 4685 * always allow writes to be mergeable, read as well if array 4686 * is degraded as we'll go through stripe cache anyway. 4687 */ 4688 if ((bvm->bi_rw & 1) == WRITE || mddev->degraded) 4689 return biovec->bv_len; 4690 4691 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 4692 chunk_sectors = mddev->new_chunk_sectors; 4693 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 4694 if (max < 0) max = 0; 4695 if (max <= biovec->bv_len && bio_sectors == 0) 4696 return biovec->bv_len; 4697 else 4698 return max; 4699 } 4700 4701 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 4702 { 4703 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 4704 unsigned int chunk_sectors = mddev->chunk_sectors; 4705 unsigned int bio_sectors = bio_sectors(bio); 4706 4707 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 4708 chunk_sectors = mddev->new_chunk_sectors; 4709 return chunk_sectors >= 4710 ((sector & (chunk_sectors - 1)) + bio_sectors); 4711 } 4712 4713 /* 4714 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 4715 * later sampled by raid5d. 
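 * The list is a simple bi_next chain protected by device_lock;
 * remove_bio_from_retry() below pops one bio at a time for raid5d.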
4716 */ 4717 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 4718 { 4719 unsigned long flags; 4720 4721 spin_lock_irqsave(&conf->device_lock, flags); 4722 4723 bi->bi_next = conf->retry_read_aligned_list; 4724 conf->retry_read_aligned_list = bi; 4725 4726 spin_unlock_irqrestore(&conf->device_lock, flags); 4727 md_wakeup_thread(conf->mddev->thread); 4728 } 4729 4730 static struct bio *remove_bio_from_retry(struct r5conf *conf) 4731 { 4732 struct bio *bi; 4733 4734 bi = conf->retry_read_aligned; 4735 if (bi) { 4736 conf->retry_read_aligned = NULL; 4737 return bi; 4738 } 4739 bi = conf->retry_read_aligned_list; 4740 if(bi) { 4741 conf->retry_read_aligned_list = bi->bi_next; 4742 bi->bi_next = NULL; 4743 /* 4744 * this sets the active strip count to 1 and the processed 4745 * strip count to zero (upper 8 bits) 4746 */ 4747 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 4748 } 4749 4750 return bi; 4751 } 4752 4753 /* 4754 * The "raid5_align_endio" should check if the read succeeded and if it 4755 * did, call bio_endio on the original bio (having bio_put the new bio 4756 * first). 4757 * If the read failed.. 4758 */ 4759 static void raid5_align_endio(struct bio *bi, int error) 4760 { 4761 struct bio* raid_bi = bi->bi_private; 4762 struct mddev *mddev; 4763 struct r5conf *conf; 4764 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 4765 struct md_rdev *rdev; 4766 4767 bio_put(bi); 4768 4769 rdev = (void*)raid_bi->bi_next; 4770 raid_bi->bi_next = NULL; 4771 mddev = rdev->mddev; 4772 conf = mddev->private; 4773 4774 rdev_dec_pending(rdev, conf->mddev); 4775 4776 if (!error && uptodate) { 4777 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 4778 raid_bi, 0); 4779 bio_endio(raid_bi, 0); 4780 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4781 wake_up(&conf->wait_for_quiescent); 4782 return; 4783 } 4784 4785 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 4786 4787 add_bio_to_retry(raid_bi, conf); 4788 } 4789 4790 static int bio_fits_rdev(struct bio *bi) 4791 { 4792 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 4793 4794 if (bio_sectors(bi) > queue_max_sectors(q)) 4795 return 0; 4796 blk_recount_segments(q, bi); 4797 if (bi->bi_phys_segments > queue_max_segments(q)) 4798 return 0; 4799 4800 if (q->merge_bvec_fn) 4801 /* it's too hard to apply the merge_bvec_fn at this stage, 4802 * just just give up 4803 */ 4804 return 0; 4805 4806 return 1; 4807 } 4808 4809 static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 4810 { 4811 struct r5conf *conf = mddev->private; 4812 int dd_idx; 4813 struct bio* align_bi; 4814 struct md_rdev *rdev; 4815 sector_t end_sector; 4816 4817 if (!in_chunk_boundary(mddev, raid_bio)) { 4818 pr_debug("chunk_aligned_read : non aligned\n"); 4819 return 0; 4820 } 4821 /* 4822 * use bio_clone_mddev to make a copy of the bio 4823 */ 4824 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 4825 if (!align_bi) 4826 return 0; 4827 /* 4828 * set bi_end_io to a new function, and set bi_private to the 4829 * original bio. 
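 * raid5_align_endio() above finds the original bio in bi_private and
 * the target rdev stashed in raid_bio->bi_next (set further down), so
 * it can drop the rdev reference and either complete the original bio
 * or hand it to add_bio_to_retry() for another attempt.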
4830 */ 4831 align_bi->bi_end_io = raid5_align_endio; 4832 align_bi->bi_private = raid_bio; 4833 /* 4834 * compute position 4835 */ 4836 align_bi->bi_iter.bi_sector = 4837 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 4838 0, &dd_idx, NULL); 4839 4840 end_sector = bio_end_sector(align_bi); 4841 rcu_read_lock(); 4842 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 4843 if (!rdev || test_bit(Faulty, &rdev->flags) || 4844 rdev->recovery_offset < end_sector) { 4845 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 4846 if (rdev && 4847 (test_bit(Faulty, &rdev->flags) || 4848 !(test_bit(In_sync, &rdev->flags) || 4849 rdev->recovery_offset >= end_sector))) 4850 rdev = NULL; 4851 } 4852 if (rdev) { 4853 sector_t first_bad; 4854 int bad_sectors; 4855 4856 atomic_inc(&rdev->nr_pending); 4857 rcu_read_unlock(); 4858 raid_bio->bi_next = (void*)rdev; 4859 align_bi->bi_bdev = rdev->bdev; 4860 __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags); 4861 4862 if (!bio_fits_rdev(align_bi) || 4863 is_badblock(rdev, align_bi->bi_iter.bi_sector, 4864 bio_sectors(align_bi), 4865 &first_bad, &bad_sectors)) { 4866 /* too big in some way, or has a known bad block */ 4867 bio_put(align_bi); 4868 rdev_dec_pending(rdev, mddev); 4869 return 0; 4870 } 4871 4872 /* No reshape active, so we can trust rdev->data_offset */ 4873 align_bi->bi_iter.bi_sector += rdev->data_offset; 4874 4875 spin_lock_irq(&conf->device_lock); 4876 wait_event_lock_irq(conf->wait_for_quiescent, 4877 conf->quiesce == 0, 4878 conf->device_lock); 4879 atomic_inc(&conf->active_aligned_reads); 4880 spin_unlock_irq(&conf->device_lock); 4881 4882 if (mddev->gendisk) 4883 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4884 align_bi, disk_devt(mddev->gendisk), 4885 raid_bio->bi_iter.bi_sector); 4886 generic_make_request(align_bi); 4887 return 1; 4888 } else { 4889 rcu_read_unlock(); 4890 bio_put(align_bi); 4891 return 0; 4892 } 4893 } 4894 4895 /* __get_priority_stripe - get the next stripe to process 4896 * 4897 * Full stripe writes are allowed to pass preread active stripes up until 4898 * the bypass_threshold is exceeded. In general the bypass_count 4899 * increments when the handle_list is handled before the hold_list; however, it 4900 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 4901 * stripe with in flight i/o. The bypass_count will be reset when the 4902 * head of the hold_list has changed, i.e. the head was promoted to the 4903 * handle_list. 4904 */ 4905 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 4906 { 4907 struct stripe_head *sh = NULL, *tmp; 4908 struct list_head *handle_list = NULL; 4909 struct r5worker_group *wg = NULL; 4910 4911 if (conf->worker_cnt_per_group == 0) { 4912 handle_list = &conf->handle_list; 4913 } else if (group != ANY_GROUP) { 4914 handle_list = &conf->worker_groups[group].handle_list; 4915 wg = &conf->worker_groups[group]; 4916 } else { 4917 int i; 4918 for (i = 0; i < conf->group_cnt; i++) { 4919 handle_list = &conf->worker_groups[i].handle_list; 4920 wg = &conf->worker_groups[i]; 4921 if (!list_empty(handle_list)) 4922 break; 4923 } 4924 } 4925 4926 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 4927 __func__, 4928 list_empty(handle_list) ? "empty" : "busy", 4929 list_empty(&conf->hold_list) ? 
"empty" : "busy", 4930 atomic_read(&conf->pending_full_writes), conf->bypass_count); 4931 4932 if (!list_empty(handle_list)) { 4933 sh = list_entry(handle_list->next, typeof(*sh), lru); 4934 4935 if (list_empty(&conf->hold_list)) 4936 conf->bypass_count = 0; 4937 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 4938 if (conf->hold_list.next == conf->last_hold) 4939 conf->bypass_count++; 4940 else { 4941 conf->last_hold = conf->hold_list.next; 4942 conf->bypass_count -= conf->bypass_threshold; 4943 if (conf->bypass_count < 0) 4944 conf->bypass_count = 0; 4945 } 4946 } 4947 } else if (!list_empty(&conf->hold_list) && 4948 ((conf->bypass_threshold && 4949 conf->bypass_count > conf->bypass_threshold) || 4950 atomic_read(&conf->pending_full_writes) == 0)) { 4951 4952 list_for_each_entry(tmp, &conf->hold_list, lru) { 4953 if (conf->worker_cnt_per_group == 0 || 4954 group == ANY_GROUP || 4955 !cpu_online(tmp->cpu) || 4956 cpu_to_group(tmp->cpu) == group) { 4957 sh = tmp; 4958 break; 4959 } 4960 } 4961 4962 if (sh) { 4963 conf->bypass_count -= conf->bypass_threshold; 4964 if (conf->bypass_count < 0) 4965 conf->bypass_count = 0; 4966 } 4967 wg = NULL; 4968 } 4969 4970 if (!sh) 4971 return NULL; 4972 4973 if (wg) { 4974 wg->stripes_cnt--; 4975 sh->group = NULL; 4976 } 4977 list_del_init(&sh->lru); 4978 BUG_ON(atomic_inc_return(&sh->count) != 1); 4979 return sh; 4980 } 4981 4982 struct raid5_plug_cb { 4983 struct blk_plug_cb cb; 4984 struct list_head list; 4985 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 4986 }; 4987 4988 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 4989 { 4990 struct raid5_plug_cb *cb = container_of( 4991 blk_cb, struct raid5_plug_cb, cb); 4992 struct stripe_head *sh; 4993 struct mddev *mddev = cb->cb.data; 4994 struct r5conf *conf = mddev->private; 4995 int cnt = 0; 4996 int hash; 4997 4998 if (cb->list.next && !list_empty(&cb->list)) { 4999 spin_lock_irq(&conf->device_lock); 5000 while (!list_empty(&cb->list)) { 5001 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5002 list_del_init(&sh->lru); 5003 /* 5004 * avoid race release_stripe_plug() sees 5005 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5006 * is still in our list 5007 */ 5008 smp_mb__before_atomic(); 5009 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5010 /* 5011 * STRIPE_ON_RELEASE_LIST could be set here. 
In that 5012 * case, the count is always > 1 here 5013 */ 5014 hash = sh->hash_lock_index; 5015 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5016 cnt++; 5017 } 5018 spin_unlock_irq(&conf->device_lock); 5019 } 5020 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5021 NR_STRIPE_HASH_LOCKS); 5022 if (mddev->queue) 5023 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5024 kfree(cb); 5025 } 5026 5027 static void release_stripe_plug(struct mddev *mddev, 5028 struct stripe_head *sh) 5029 { 5030 struct blk_plug_cb *blk_cb = blk_check_plugged( 5031 raid5_unplug, mddev, 5032 sizeof(struct raid5_plug_cb)); 5033 struct raid5_plug_cb *cb; 5034 5035 if (!blk_cb) { 5036 release_stripe(sh); 5037 return; 5038 } 5039 5040 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5041 5042 if (cb->list.next == NULL) { 5043 int i; 5044 INIT_LIST_HEAD(&cb->list); 5045 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5046 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5047 } 5048 5049 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5050 list_add_tail(&sh->lru, &cb->list); 5051 else 5052 release_stripe(sh); 5053 } 5054 5055 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5056 { 5057 struct r5conf *conf = mddev->private; 5058 sector_t logical_sector, last_sector; 5059 struct stripe_head *sh; 5060 int remaining; 5061 int stripe_sectors; 5062 5063 if (mddev->reshape_position != MaxSector) 5064 /* Skip discard while reshape is happening */ 5065 return; 5066 5067 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5068 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5069 5070 bi->bi_next = NULL; 5071 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5072 5073 stripe_sectors = conf->chunk_sectors * 5074 (conf->raid_disks - conf->max_degraded); 5075 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5076 stripe_sectors); 5077 sector_div(last_sector, stripe_sectors); 5078 5079 logical_sector *= conf->chunk_sectors; 5080 last_sector *= conf->chunk_sectors; 5081 5082 for (; logical_sector < last_sector; 5083 logical_sector += STRIPE_SECTORS) { 5084 DEFINE_WAIT(w); 5085 int d; 5086 again: 5087 sh = get_active_stripe(conf, logical_sector, 0, 0, 0); 5088 prepare_to_wait(&conf->wait_for_overlap, &w, 5089 TASK_UNINTERRUPTIBLE); 5090 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5091 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5092 release_stripe(sh); 5093 schedule(); 5094 goto again; 5095 } 5096 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5097 spin_lock_irq(&sh->stripe_lock); 5098 for (d = 0; d < conf->raid_disks; d++) { 5099 if (d == sh->pd_idx || d == sh->qd_idx) 5100 continue; 5101 if (sh->dev[d].towrite || sh->dev[d].toread) { 5102 set_bit(R5_Overlap, &sh->dev[d].flags); 5103 spin_unlock_irq(&sh->stripe_lock); 5104 release_stripe(sh); 5105 schedule(); 5106 goto again; 5107 } 5108 } 5109 set_bit(STRIPE_DISCARD, &sh->state); 5110 finish_wait(&conf->wait_for_overlap, &w); 5111 sh->overwrite_disks = 0; 5112 for (d = 0; d < conf->raid_disks; d++) { 5113 if (d == sh->pd_idx || d == sh->qd_idx) 5114 continue; 5115 sh->dev[d].towrite = bi; 5116 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5117 raid5_inc_bi_active_stripes(bi); 5118 sh->overwrite_disks++; 5119 } 5120 spin_unlock_irq(&sh->stripe_lock); 5121 if (conf->mddev->bitmap) { 5122 for (d = 0; 5123 d < conf->raid_disks - conf->max_degraded; 5124 d++) 5125 bitmap_startwrite(mddev->bitmap, 5126 sh->sector, 5127 STRIPE_SECTORS, 5128 0); 5129 sh->bm_seq = 
conf->seq_flush + 1; 5130 set_bit(STRIPE_BIT_DELAY, &sh->state); 5131 } 5132 5133 set_bit(STRIPE_HANDLE, &sh->state); 5134 clear_bit(STRIPE_DELAYED, &sh->state); 5135 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5136 atomic_inc(&conf->preread_active_stripes); 5137 release_stripe_plug(mddev, sh); 5138 } 5139 5140 remaining = raid5_dec_bi_active_stripes(bi); 5141 if (remaining == 0) { 5142 md_write_end(mddev); 5143 bio_endio(bi, 0); 5144 } 5145 } 5146 5147 static void make_request(struct mddev *mddev, struct bio * bi) 5148 { 5149 struct r5conf *conf = mddev->private; 5150 int dd_idx; 5151 sector_t new_sector; 5152 sector_t logical_sector, last_sector; 5153 struct stripe_head *sh; 5154 const int rw = bio_data_dir(bi); 5155 int remaining; 5156 DEFINE_WAIT(w); 5157 bool do_prepare; 5158 5159 if (unlikely(bi->bi_rw & REQ_FLUSH)) { 5160 md_flush_request(mddev, bi); 5161 return; 5162 } 5163 5164 md_write_start(mddev, bi); 5165 5166 /* 5167 * If array is degraded, better not do chunk aligned read because 5168 * later we might have to read it again in order to reconstruct 5169 * data on failed drives. 5170 */ 5171 if (rw == READ && mddev->degraded == 0 && 5172 mddev->reshape_position == MaxSector && 5173 chunk_aligned_read(mddev,bi)) 5174 return; 5175 5176 if (unlikely(bi->bi_rw & REQ_DISCARD)) { 5177 make_discard_request(mddev, bi); 5178 return; 5179 } 5180 5181 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5182 last_sector = bio_end_sector(bi); 5183 bi->bi_next = NULL; 5184 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5185 5186 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5187 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5188 int previous; 5189 int seq; 5190 5191 do_prepare = false; 5192 retry: 5193 seq = read_seqcount_begin(&conf->gen_lock); 5194 previous = 0; 5195 if (do_prepare) 5196 prepare_to_wait(&conf->wait_for_overlap, &w, 5197 TASK_UNINTERRUPTIBLE); 5198 if (unlikely(conf->reshape_progress != MaxSector)) { 5199 /* spinlock is needed as reshape_progress may be 5200 * 64bit on a 32bit platform, and so it might be 5201 * possible to see a half-updated value 5202 * Of course reshape_progress could change after 5203 * the lock is dropped, so once we get a reference 5204 * to the stripe that we think it is, we will have 5205 * to check again. 5206 */ 5207 spin_lock_irq(&conf->device_lock); 5208 if (mddev->reshape_backwards 5209 ? logical_sector < conf->reshape_progress 5210 : logical_sector >= conf->reshape_progress) { 5211 previous = 1; 5212 } else { 5213 if (mddev->reshape_backwards 5214 ? logical_sector < conf->reshape_safe 5215 : logical_sector >= conf->reshape_safe) { 5216 spin_unlock_irq(&conf->device_lock); 5217 schedule(); 5218 do_prepare = true; 5219 goto retry; 5220 } 5221 } 5222 spin_unlock_irq(&conf->device_lock); 5223 } 5224 5225 new_sector = raid5_compute_sector(conf, logical_sector, 5226 previous, 5227 &dd_idx, NULL); 5228 pr_debug("raid456: make_request, sector %llu logical %llu\n", 5229 (unsigned long long)new_sector, 5230 (unsigned long long)logical_sector); 5231 5232 sh = get_active_stripe(conf, new_sector, previous, 5233 (bi->bi_rw&RWA_MASK), 0); 5234 if (sh) { 5235 if (unlikely(previous)) { 5236 /* expansion might have moved on while waiting for a 5237 * stripe, so we must do the range check again. 
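 * (reshape_progress is re-sampled under device_lock just below).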
5238 * Expansion could still move past after this 5239 * test, but as we are holding a reference to 5240 * 'sh', we know that if that happens, 5241 * STRIPE_EXPANDING will get set and the expansion 5242 * won't proceed until we finish with the stripe. 5243 */ 5244 int must_retry = 0; 5245 spin_lock_irq(&conf->device_lock); 5246 if (mddev->reshape_backwards 5247 ? logical_sector >= conf->reshape_progress 5248 : logical_sector < conf->reshape_progress) 5249 /* mismatch, need to try again */ 5250 must_retry = 1; 5251 spin_unlock_irq(&conf->device_lock); 5252 if (must_retry) { 5253 release_stripe(sh); 5254 schedule(); 5255 do_prepare = true; 5256 goto retry; 5257 } 5258 } 5259 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5260 /* Might have got the wrong stripe_head 5261 * by accident 5262 */ 5263 release_stripe(sh); 5264 goto retry; 5265 } 5266 5267 if (rw == WRITE && 5268 logical_sector >= mddev->suspend_lo && 5269 logical_sector < mddev->suspend_hi) { 5270 release_stripe(sh); 5271 /* As the suspend_* range is controlled by 5272 * userspace, we want an interruptible 5273 * wait. 5274 */ 5275 flush_signals(current); 5276 prepare_to_wait(&conf->wait_for_overlap, 5277 &w, TASK_INTERRUPTIBLE); 5278 if (logical_sector >= mddev->suspend_lo && 5279 logical_sector < mddev->suspend_hi) { 5280 schedule(); 5281 do_prepare = true; 5282 } 5283 goto retry; 5284 } 5285 5286 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5287 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5288 /* Stripe is busy expanding or 5289 * add failed due to overlap. Flush everything 5290 * and wait a while 5291 */ 5292 md_wakeup_thread(mddev->thread); 5293 release_stripe(sh); 5294 schedule(); 5295 do_prepare = true; 5296 goto retry; 5297 } 5298 set_bit(STRIPE_HANDLE, &sh->state); 5299 clear_bit(STRIPE_DELAYED, &sh->state); 5300 if ((!sh->batch_head || sh == sh->batch_head) && 5301 (bi->bi_rw & REQ_SYNC) && 5302 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5303 atomic_inc(&conf->preread_active_stripes); 5304 release_stripe_plug(mddev, sh); 5305 } else { 5306 /* cannot get stripe for read-ahead, just give-up */ 5307 clear_bit(BIO_UPTODATE, &bi->bi_flags); 5308 break; 5309 } 5310 } 5311 finish_wait(&conf->wait_for_overlap, &w); 5312 5313 remaining = raid5_dec_bi_active_stripes(bi); 5314 if (remaining == 0) { 5315 5316 if ( rw == WRITE ) 5317 md_write_end(mddev); 5318 5319 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 5320 bi, 0); 5321 bio_endio(bi, 0); 5322 } 5323 } 5324 5325 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5326 5327 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5328 { 5329 /* reshaping is quite different to recovery/resync so it is 5330 * handled quite separately ... here. 5331 * 5332 * On each call to sync_request, we gather one chunk worth of 5333 * destination stripes and flag them as expanding. 5334 * Then we find all the source stripes and request reads. 5335 * As the reads complete, handle_stripe will copy the data 5336 * into the destination stripe and release that stripe. 
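 * Each call therefore checkpoints the metadata when required, grabs
 * one chunk's worth of destination stripes and flags them
 * STRIPE_EXPANDING / STRIPE_EXPAND_READY, advances reshape_progress,
 * and finally marks the matching source stripes STRIPE_EXPAND_SOURCE
 * so handle_stripe() will read them in.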
5337 */ 5338 struct r5conf *conf = mddev->private; 5339 struct stripe_head *sh; 5340 sector_t first_sector, last_sector; 5341 int raid_disks = conf->previous_raid_disks; 5342 int data_disks = raid_disks - conf->max_degraded; 5343 int new_data_disks = conf->raid_disks - conf->max_degraded; 5344 int i; 5345 int dd_idx; 5346 sector_t writepos, readpos, safepos; 5347 sector_t stripe_addr; 5348 int reshape_sectors; 5349 struct list_head stripes; 5350 5351 if (sector_nr == 0) { 5352 /* If restarting in the middle, skip the initial sectors */ 5353 if (mddev->reshape_backwards && 5354 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5355 sector_nr = raid5_size(mddev, 0, 0) 5356 - conf->reshape_progress; 5357 } else if (!mddev->reshape_backwards && 5358 conf->reshape_progress > 0) 5359 sector_nr = conf->reshape_progress; 5360 sector_div(sector_nr, new_data_disks); 5361 if (sector_nr) { 5362 mddev->curr_resync_completed = sector_nr; 5363 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5364 *skipped = 1; 5365 return sector_nr; 5366 } 5367 } 5368 5369 /* We need to process a full chunk at a time. 5370 * If old and new chunk sizes differ, we need to process the 5371 * largest of these 5372 */ 5373 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 5374 reshape_sectors = mddev->new_chunk_sectors; 5375 else 5376 reshape_sectors = mddev->chunk_sectors; 5377 5378 /* We update the metadata at least every 10 seconds, or when 5379 * the data about to be copied would over-write the source of 5380 * the data at the front of the range. i.e. one new_stripe 5381 * along from reshape_progress new_maps to after where 5382 * reshape_safe old_maps to 5383 */ 5384 writepos = conf->reshape_progress; 5385 sector_div(writepos, new_data_disks); 5386 readpos = conf->reshape_progress; 5387 sector_div(readpos, data_disks); 5388 safepos = conf->reshape_safe; 5389 sector_div(safepos, data_disks); 5390 if (mddev->reshape_backwards) { 5391 writepos -= min_t(sector_t, reshape_sectors, writepos); 5392 readpos += reshape_sectors; 5393 safepos += reshape_sectors; 5394 } else { 5395 writepos += reshape_sectors; 5396 readpos -= min_t(sector_t, reshape_sectors, readpos); 5397 safepos -= min_t(sector_t, reshape_sectors, safepos); 5398 } 5399 5400 /* Having calculated the 'writepos' possibly use it 5401 * to set 'stripe_addr' which is where we will write to. 5402 */ 5403 if (mddev->reshape_backwards) { 5404 BUG_ON(conf->reshape_progress == 0); 5405 stripe_addr = writepos; 5406 BUG_ON((mddev->dev_sectors & 5407 ~((sector_t)reshape_sectors - 1)) 5408 - reshape_sectors - stripe_addr 5409 != sector_nr); 5410 } else { 5411 BUG_ON(writepos != sector_nr + reshape_sectors); 5412 stripe_addr = sector_nr; 5413 } 5414 5415 /* 'writepos' is the most advanced device address we might write. 5416 * 'readpos' is the least advanced device address we might read. 5417 * 'safepos' is the least address recorded in the metadata as having 5418 * been reshaped. 5419 * If there is a min_offset_diff, these are adjusted either by 5420 * increasing the safepos/readpos if diff is negative, or 5421 * increasing writepos if diff is positive. 5422 * If 'readpos' is then behind 'writepos', there is no way that we can 5423 * ensure safety in the face of a crash - that must be done by userspace 5424 * making a backup of the data. So in that case there is no particular 5425 * rush to update metadata. 
5426 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5427 * update the metadata to advance 'safepos' to match 'readpos' so that 5428 * we can be safe in the event of a crash. 5429 * So we insist on updating metadata if safepos is behind writepos and 5430 * readpos is beyond writepos. 5431 * In any case, update the metadata every 10 seconds. 5432 * Maybe that number should be configurable, but I'm not sure it is 5433 * worth it.... maybe it could be a multiple of safemode_delay??? 5434 */ 5435 if (conf->min_offset_diff < 0) { 5436 safepos += -conf->min_offset_diff; 5437 readpos += -conf->min_offset_diff; 5438 } else 5439 writepos += conf->min_offset_diff; 5440 5441 if ((mddev->reshape_backwards 5442 ? (safepos > writepos && readpos < writepos) 5443 : (safepos < writepos && readpos > writepos)) || 5444 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5445 /* Cannot proceed until we've updated the superblock... */ 5446 wait_event(conf->wait_for_overlap, 5447 atomic_read(&conf->reshape_stripes)==0 5448 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5449 if (atomic_read(&conf->reshape_stripes) != 0) 5450 return 0; 5451 mddev->reshape_position = conf->reshape_progress; 5452 mddev->curr_resync_completed = sector_nr; 5453 conf->reshape_checkpoint = jiffies; 5454 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5455 md_wakeup_thread(mddev->thread); 5456 wait_event(mddev->sb_wait, mddev->flags == 0 || 5457 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5458 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5459 return 0; 5460 spin_lock_irq(&conf->device_lock); 5461 conf->reshape_safe = mddev->reshape_position; 5462 spin_unlock_irq(&conf->device_lock); 5463 wake_up(&conf->wait_for_overlap); 5464 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5465 } 5466 5467 INIT_LIST_HEAD(&stripes); 5468 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5469 int j; 5470 int skipped_disk = 0; 5471 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5472 set_bit(STRIPE_EXPANDING, &sh->state); 5473 atomic_inc(&conf->reshape_stripes); 5474 /* If any of this stripe is beyond the end of the old 5475 * array, then we need to zero those blocks 5476 */ 5477 for (j=sh->disks; j--;) { 5478 sector_t s; 5479 if (j == sh->pd_idx) 5480 continue; 5481 if (conf->level == 6 && 5482 j == sh->qd_idx) 5483 continue; 5484 s = compute_blocknr(sh, j, 0); 5485 if (s < raid5_size(mddev, 0, 0)) { 5486 skipped_disk = 1; 5487 continue; 5488 } 5489 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5490 set_bit(R5_Expanded, &sh->dev[j].flags); 5491 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5492 } 5493 if (!skipped_disk) { 5494 set_bit(STRIPE_EXPAND_READY, &sh->state); 5495 set_bit(STRIPE_HANDLE, &sh->state); 5496 } 5497 list_add(&sh->lru, &stripes); 5498 } 5499 spin_lock_irq(&conf->device_lock); 5500 if (mddev->reshape_backwards) 5501 conf->reshape_progress -= reshape_sectors * new_data_disks; 5502 else 5503 conf->reshape_progress += reshape_sectors * new_data_disks; 5504 spin_unlock_irq(&conf->device_lock); 5505 /* Ok, those stripe are ready. We can start scheduling 5506 * reads on the source stripes. 5507 * The source stripes are determined by mapping the first and last 5508 * block on the destination stripes. 
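 * (raid5_compute_sector() is called with 'previous' == 1, i.e. the
 * old geometry, so first_sector/last_sector below are old-layout
 * addresses spanning the same data range).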
5509 */ 5510 first_sector = 5511 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5512 1, &dd_idx, NULL); 5513 last_sector = 5514 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5515 * new_data_disks - 1), 5516 1, &dd_idx, NULL); 5517 if (last_sector >= mddev->dev_sectors) 5518 last_sector = mddev->dev_sectors - 1; 5519 while (first_sector <= last_sector) { 5520 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 5521 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5522 set_bit(STRIPE_HANDLE, &sh->state); 5523 release_stripe(sh); 5524 first_sector += STRIPE_SECTORS; 5525 } 5526 /* Now that the sources are clearly marked, we can release 5527 * the destination stripes 5528 */ 5529 while (!list_empty(&stripes)) { 5530 sh = list_entry(stripes.next, struct stripe_head, lru); 5531 list_del_init(&sh->lru); 5532 release_stripe(sh); 5533 } 5534 /* If this takes us to the resync_max point where we have to pause, 5535 * then we need to write out the superblock. 5536 */ 5537 sector_nr += reshape_sectors; 5538 if ((sector_nr - mddev->curr_resync_completed) * 2 5539 >= mddev->resync_max - mddev->curr_resync_completed) { 5540 /* Cannot proceed until we've updated the superblock... */ 5541 wait_event(conf->wait_for_overlap, 5542 atomic_read(&conf->reshape_stripes) == 0 5543 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5544 if (atomic_read(&conf->reshape_stripes) != 0) 5545 goto ret; 5546 mddev->reshape_position = conf->reshape_progress; 5547 mddev->curr_resync_completed = sector_nr; 5548 conf->reshape_checkpoint = jiffies; 5549 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5550 md_wakeup_thread(mddev->thread); 5551 wait_event(mddev->sb_wait, 5552 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 5553 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5554 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5555 goto ret; 5556 spin_lock_irq(&conf->device_lock); 5557 conf->reshape_safe = mddev->reshape_position; 5558 spin_unlock_irq(&conf->device_lock); 5559 wake_up(&conf->wait_for_overlap); 5560 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5561 } 5562 ret: 5563 return reshape_sectors; 5564 } 5565 5566 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5567 { 5568 struct r5conf *conf = mddev->private; 5569 struct stripe_head *sh; 5570 sector_t max_sector = mddev->dev_sectors; 5571 sector_t sync_blocks; 5572 int still_degraded = 0; 5573 int i; 5574 5575 if (sector_nr >= max_sector) { 5576 /* just being told to finish up .. nothing much to do */ 5577 5578 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 5579 end_reshape(conf); 5580 return 0; 5581 } 5582 5583 if (mddev->curr_resync < max_sector) /* aborted */ 5584 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 5585 &sync_blocks, 1); 5586 else /* completed sync */ 5587 conf->fullsync = 0; 5588 bitmap_close_sync(mddev->bitmap); 5589 5590 return 0; 5591 } 5592 5593 /* Allow raid5_quiesce to complete */ 5594 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 5595 5596 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5597 return reshape_request(mddev, sector_nr, skipped); 5598 5599 /* No need to check resync_max as we never do more than one 5600 * stripe, and as resync_max will always be on a chunk boundary, 5601 * if the check in md_do_sync didn't fire, there is no chance 5602 * of overstepping resync_max here 5603 */ 5604 5605 /* if there is too many failed drives and we are trying 5606 * to resync, then assert that we are finished, because there is 5607 * nothing we can do. 
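 * ("Too many" means mddev->degraded >= conf->max_degraded, checked just
 * below: any failed device for RAID4/5, or two failed devices for
 * RAID6, since setup_conf() sets max_degraded to 1 or 2 accordingly.)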
5608 */ 5609 if (mddev->degraded >= conf->max_degraded && 5610 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5611 sector_t rv = mddev->dev_sectors - sector_nr; 5612 *skipped = 1; 5613 return rv; 5614 } 5615 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 5616 !conf->fullsync && 5617 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 5618 sync_blocks >= STRIPE_SECTORS) { 5619 /* we can skip this block, and probably more */ 5620 sync_blocks /= STRIPE_SECTORS; 5621 *skipped = 1; 5622 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 5623 } 5624 5625 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 5626 5627 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 5628 if (sh == NULL) { 5629 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 5630 /* make sure we don't swamp the stripe cache if someone else 5631 * is trying to get access 5632 */ 5633 schedule_timeout_uninterruptible(1); 5634 } 5635 /* Need to check if array will still be degraded after recovery/resync 5636 * Note in case of > 1 drive failures it's possible we're rebuilding 5637 * one drive while leaving another faulty drive in array. 5638 */ 5639 rcu_read_lock(); 5640 for (i = 0; i < conf->raid_disks; i++) { 5641 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); 5642 5643 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 5644 still_degraded = 1; 5645 } 5646 rcu_read_unlock(); 5647 5648 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 5649 5650 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 5651 set_bit(STRIPE_HANDLE, &sh->state); 5652 5653 release_stripe(sh); 5654 5655 return STRIPE_SECTORS; 5656 } 5657 5658 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 5659 { 5660 /* We may not be able to submit a whole bio at once as there 5661 * may not be enough stripe_heads available. 5662 * We cannot pre-allocate enough stripe_heads as we may need 5663 * more than exist in the cache (if we allow ever large chunks). 5664 * So we do one stripe head at a time and record in 5665 * ->bi_hw_segments how many have been done. 5666 * 5667 * We *know* that this entire raid_bio is in one chunk, so 5668 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
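 * (Concretely, the loop below advances logical_sector and sector in
 * lock-step by STRIPE_SECTORS per iteration, and
 * raid5_set_bi_processed_stripes() records the resume point whenever
 * get_active_stripe() or add_stripe_bio() forces the bio to be parked
 * on conf->retry_read_aligned.)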
5669 */ 5670 struct stripe_head *sh; 5671 int dd_idx; 5672 sector_t sector, logical_sector, last_sector; 5673 int scnt = 0; 5674 int remaining; 5675 int handled = 0; 5676 5677 logical_sector = raid_bio->bi_iter.bi_sector & 5678 ~((sector_t)STRIPE_SECTORS-1); 5679 sector = raid5_compute_sector(conf, logical_sector, 5680 0, &dd_idx, NULL); 5681 last_sector = bio_end_sector(raid_bio); 5682 5683 for (; logical_sector < last_sector; 5684 logical_sector += STRIPE_SECTORS, 5685 sector += STRIPE_SECTORS, 5686 scnt++) { 5687 5688 if (scnt < raid5_bi_processed_stripes(raid_bio)) 5689 /* already done this stripe */ 5690 continue; 5691 5692 sh = get_active_stripe(conf, sector, 0, 1, 1); 5693 5694 if (!sh) { 5695 /* failed to get a stripe - must wait */ 5696 raid5_set_bi_processed_stripes(raid_bio, scnt); 5697 conf->retry_read_aligned = raid_bio; 5698 return handled; 5699 } 5700 5701 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 5702 release_stripe(sh); 5703 raid5_set_bi_processed_stripes(raid_bio, scnt); 5704 conf->retry_read_aligned = raid_bio; 5705 return handled; 5706 } 5707 5708 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 5709 handle_stripe(sh); 5710 release_stripe(sh); 5711 handled++; 5712 } 5713 remaining = raid5_dec_bi_active_stripes(raid_bio); 5714 if (remaining == 0) { 5715 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 5716 raid_bio, 0); 5717 bio_endio(raid_bio, 0); 5718 } 5719 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5720 wake_up(&conf->wait_for_quiescent); 5721 return handled; 5722 } 5723 5724 static int handle_active_stripes(struct r5conf *conf, int group, 5725 struct r5worker *worker, 5726 struct list_head *temp_inactive_list) 5727 { 5728 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5729 int i, batch_size = 0, hash; 5730 bool release_inactive = false; 5731 5732 while (batch_size < MAX_STRIPE_BATCH && 5733 (sh = __get_priority_stripe(conf, group)) != NULL) 5734 batch[batch_size++] = sh; 5735 5736 if (batch_size == 0) { 5737 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5738 if (!list_empty(temp_inactive_list + i)) 5739 break; 5740 if (i == NR_STRIPE_HASH_LOCKS) 5741 return batch_size; 5742 release_inactive = true; 5743 } 5744 spin_unlock_irq(&conf->device_lock); 5745 5746 release_inactive_stripe_list(conf, temp_inactive_list, 5747 NR_STRIPE_HASH_LOCKS); 5748 5749 if (release_inactive) { 5750 spin_lock_irq(&conf->device_lock); 5751 return 0; 5752 } 5753 5754 for (i = 0; i < batch_size; i++) 5755 handle_stripe(batch[i]); 5756 5757 cond_resched(); 5758 5759 spin_lock_irq(&conf->device_lock); 5760 for (i = 0; i < batch_size; i++) { 5761 hash = batch[i]->hash_lock_index; 5762 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 5763 } 5764 return batch_size; 5765 } 5766 5767 static void raid5_do_work(struct work_struct *work) 5768 { 5769 struct r5worker *worker = container_of(work, struct r5worker, work); 5770 struct r5worker_group *group = worker->group; 5771 struct r5conf *conf = group->conf; 5772 int group_id = group - conf->worker_groups; 5773 int handled; 5774 struct blk_plug plug; 5775 5776 pr_debug("+++ raid5worker active\n"); 5777 5778 blk_start_plug(&plug); 5779 handled = 0; 5780 spin_lock_irq(&conf->device_lock); 5781 while (1) { 5782 int batch_size, released; 5783 5784 released = release_stripe_list(conf, worker->temp_inactive_list); 5785 5786 batch_size = handle_active_stripes(conf, group_id, worker, 5787 worker->temp_inactive_list); 5788 worker->working = false; 5789 if (!batch_size && !released) 5790 break; 5791 handled += batch_size; 
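                /* Tally this pass; the while(1) loop exits just above
                 * once neither handle_active_stripes() nor
                 * release_stripe_list() produced any further work.
                 */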
5792 } 5793 pr_debug("%d stripes handled\n", handled); 5794 5795 spin_unlock_irq(&conf->device_lock); 5796 blk_finish_plug(&plug); 5797 5798 pr_debug("--- raid5worker inactive\n"); 5799 } 5800 5801 /* 5802 * This is our raid5 kernel thread. 5803 * 5804 * We scan the hash table for stripes which can be handled now. 5805 * During the scan, completed stripes are saved for us by the interrupt 5806 * handler, so that they will not have to wait for our next wakeup. 5807 */ 5808 static void raid5d(struct md_thread *thread) 5809 { 5810 struct mddev *mddev = thread->mddev; 5811 struct r5conf *conf = mddev->private; 5812 int handled; 5813 struct blk_plug plug; 5814 5815 pr_debug("+++ raid5d active\n"); 5816 5817 md_check_recovery(mddev); 5818 5819 blk_start_plug(&plug); 5820 handled = 0; 5821 spin_lock_irq(&conf->device_lock); 5822 while (1) { 5823 struct bio *bio; 5824 int batch_size, released; 5825 5826 released = release_stripe_list(conf, conf->temp_inactive_list); 5827 if (released) 5828 clear_bit(R5_DID_ALLOC, &conf->cache_state); 5829 5830 if ( 5831 !list_empty(&conf->bitmap_list)) { 5832 /* Now is a good time to flush some bitmap updates */ 5833 conf->seq_flush++; 5834 spin_unlock_irq(&conf->device_lock); 5835 bitmap_unplug(mddev->bitmap); 5836 spin_lock_irq(&conf->device_lock); 5837 conf->seq_write = conf->seq_flush; 5838 activate_bit_delay(conf, conf->temp_inactive_list); 5839 } 5840 raid5_activate_delayed(conf); 5841 5842 while ((bio = remove_bio_from_retry(conf))) { 5843 int ok; 5844 spin_unlock_irq(&conf->device_lock); 5845 ok = retry_aligned_read(conf, bio); 5846 spin_lock_irq(&conf->device_lock); 5847 if (!ok) 5848 break; 5849 handled++; 5850 } 5851 5852 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 5853 conf->temp_inactive_list); 5854 if (!batch_size && !released) 5855 break; 5856 handled += batch_size; 5857 5858 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { 5859 spin_unlock_irq(&conf->device_lock); 5860 md_check_recovery(mddev); 5861 spin_lock_irq(&conf->device_lock); 5862 } 5863 } 5864 pr_debug("%d stripes handled\n", handled); 5865 5866 spin_unlock_irq(&conf->device_lock); 5867 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 5868 mutex_trylock(&conf->cache_size_mutex)) { 5869 grow_one_stripe(conf, __GFP_NOWARN); 5870 /* Set flag even if allocation failed. 
This helps 5871 * slow down allocation requests when mem is short 5872 */ 5873 set_bit(R5_DID_ALLOC, &conf->cache_state); 5874 mutex_unlock(&conf->cache_size_mutex); 5875 } 5876 5877 async_tx_issue_pending_all(); 5878 blk_finish_plug(&plug); 5879 5880 pr_debug("--- raid5d inactive\n"); 5881 } 5882 5883 static ssize_t 5884 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 5885 { 5886 struct r5conf *conf; 5887 int ret = 0; 5888 spin_lock(&mddev->lock); 5889 conf = mddev->private; 5890 if (conf) 5891 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 5892 spin_unlock(&mddev->lock); 5893 return ret; 5894 } 5895 5896 int 5897 raid5_set_cache_size(struct mddev *mddev, int size) 5898 { 5899 struct r5conf *conf = mddev->private; 5900 int err; 5901 5902 if (size <= 16 || size > 32768) 5903 return -EINVAL; 5904 5905 conf->min_nr_stripes = size; 5906 mutex_lock(&conf->cache_size_mutex); 5907 while (size < conf->max_nr_stripes && 5908 drop_one_stripe(conf)) 5909 ; 5910 mutex_unlock(&conf->cache_size_mutex); 5911 5912 5913 err = md_allow_write(mddev); 5914 if (err) 5915 return err; 5916 5917 mutex_lock(&conf->cache_size_mutex); 5918 while (size > conf->max_nr_stripes) 5919 if (!grow_one_stripe(conf, GFP_KERNEL)) 5920 break; 5921 mutex_unlock(&conf->cache_size_mutex); 5922 5923 return 0; 5924 } 5925 EXPORT_SYMBOL(raid5_set_cache_size); 5926 5927 static ssize_t 5928 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 5929 { 5930 struct r5conf *conf; 5931 unsigned long new; 5932 int err; 5933 5934 if (len >= PAGE_SIZE) 5935 return -EINVAL; 5936 if (kstrtoul(page, 10, &new)) 5937 return -EINVAL; 5938 err = mddev_lock(mddev); 5939 if (err) 5940 return err; 5941 conf = mddev->private; 5942 if (!conf) 5943 err = -ENODEV; 5944 else 5945 err = raid5_set_cache_size(mddev, new); 5946 mddev_unlock(mddev); 5947 5948 return err ?: len; 5949 } 5950 5951 static struct md_sysfs_entry 5952 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 5953 raid5_show_stripe_cache_size, 5954 raid5_store_stripe_cache_size); 5955 5956 static ssize_t 5957 raid5_show_rmw_level(struct mddev *mddev, char *page) 5958 { 5959 struct r5conf *conf = mddev->private; 5960 if (conf) 5961 return sprintf(page, "%d\n", conf->rmw_level); 5962 else 5963 return 0; 5964 } 5965 5966 static ssize_t 5967 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 5968 { 5969 struct r5conf *conf = mddev->private; 5970 unsigned long new; 5971 5972 if (!conf) 5973 return -ENODEV; 5974 5975 if (len >= PAGE_SIZE) 5976 return -EINVAL; 5977 5978 if (kstrtoul(page, 10, &new)) 5979 return -EINVAL; 5980 5981 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 5982 return -EINVAL; 5983 5984 if (new != PARITY_DISABLE_RMW && 5985 new != PARITY_ENABLE_RMW && 5986 new != PARITY_PREFER_RMW) 5987 return -EINVAL; 5988 5989 conf->rmw_level = new; 5990 return len; 5991 } 5992 5993 static struct md_sysfs_entry 5994 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 5995 raid5_show_rmw_level, 5996 raid5_store_rmw_level); 5997 5998 5999 static ssize_t 6000 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6001 { 6002 struct r5conf *conf; 6003 int ret = 0; 6004 spin_lock(&mddev->lock); 6005 conf = mddev->private; 6006 if (conf) 6007 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6008 spin_unlock(&mddev->lock); 6009 return ret; 6010 } 6011 6012 static ssize_t 6013 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6014 { 6015 struct r5conf *conf; 6016 
unsigned long new; 6017 int err; 6018 6019 if (len >= PAGE_SIZE) 6020 return -EINVAL; 6021 if (kstrtoul(page, 10, &new)) 6022 return -EINVAL; 6023 6024 err = mddev_lock(mddev); 6025 if (err) 6026 return err; 6027 conf = mddev->private; 6028 if (!conf) 6029 err = -ENODEV; 6030 else if (new > conf->min_nr_stripes) 6031 err = -EINVAL; 6032 else 6033 conf->bypass_threshold = new; 6034 mddev_unlock(mddev); 6035 return err ?: len; 6036 } 6037 6038 static struct md_sysfs_entry 6039 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6040 S_IRUGO | S_IWUSR, 6041 raid5_show_preread_threshold, 6042 raid5_store_preread_threshold); 6043 6044 static ssize_t 6045 raid5_show_skip_copy(struct mddev *mddev, char *page) 6046 { 6047 struct r5conf *conf; 6048 int ret = 0; 6049 spin_lock(&mddev->lock); 6050 conf = mddev->private; 6051 if (conf) 6052 ret = sprintf(page, "%d\n", conf->skip_copy); 6053 spin_unlock(&mddev->lock); 6054 return ret; 6055 } 6056 6057 static ssize_t 6058 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6059 { 6060 struct r5conf *conf; 6061 unsigned long new; 6062 int err; 6063 6064 if (len >= PAGE_SIZE) 6065 return -EINVAL; 6066 if (kstrtoul(page, 10, &new)) 6067 return -EINVAL; 6068 new = !!new; 6069 6070 err = mddev_lock(mddev); 6071 if (err) 6072 return err; 6073 conf = mddev->private; 6074 if (!conf) 6075 err = -ENODEV; 6076 else if (new != conf->skip_copy) { 6077 mddev_suspend(mddev); 6078 conf->skip_copy = new; 6079 if (new) 6080 mddev->queue->backing_dev_info.capabilities |= 6081 BDI_CAP_STABLE_WRITES; 6082 else 6083 mddev->queue->backing_dev_info.capabilities &= 6084 ~BDI_CAP_STABLE_WRITES; 6085 mddev_resume(mddev); 6086 } 6087 mddev_unlock(mddev); 6088 return err ?: len; 6089 } 6090 6091 static struct md_sysfs_entry 6092 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6093 raid5_show_skip_copy, 6094 raid5_store_skip_copy); 6095 6096 static ssize_t 6097 stripe_cache_active_show(struct mddev *mddev, char *page) 6098 { 6099 struct r5conf *conf = mddev->private; 6100 if (conf) 6101 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6102 else 6103 return 0; 6104 } 6105 6106 static struct md_sysfs_entry 6107 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6108 6109 static ssize_t 6110 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6111 { 6112 struct r5conf *conf; 6113 int ret = 0; 6114 spin_lock(&mddev->lock); 6115 conf = mddev->private; 6116 if (conf) 6117 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6118 spin_unlock(&mddev->lock); 6119 return ret; 6120 } 6121 6122 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6123 int *group_cnt, 6124 int *worker_cnt_per_group, 6125 struct r5worker_group **worker_groups); 6126 static ssize_t 6127 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6128 { 6129 struct r5conf *conf; 6130 unsigned long new; 6131 int err; 6132 struct r5worker_group *new_groups, *old_groups; 6133 int group_cnt, worker_cnt_per_group; 6134 6135 if (len >= PAGE_SIZE) 6136 return -EINVAL; 6137 if (kstrtoul(page, 10, &new)) 6138 return -EINVAL; 6139 6140 err = mddev_lock(mddev); 6141 if (err) 6142 return err; 6143 conf = mddev->private; 6144 if (!conf) 6145 err = -ENODEV; 6146 else if (new != conf->worker_cnt_per_group) { 6147 mddev_suspend(mddev); 6148 6149 old_groups = conf->worker_groups; 6150 if (old_groups) 6151 flush_workqueue(raid5_wq); 6152 6153 err = alloc_thread_groups(conf, new, 6154 &group_cnt, &worker_cnt_per_group, 6155 
&new_groups); 6156 if (!err) { 6157 spin_lock_irq(&conf->device_lock); 6158 conf->group_cnt = group_cnt; 6159 conf->worker_cnt_per_group = worker_cnt_per_group; 6160 conf->worker_groups = new_groups; 6161 spin_unlock_irq(&conf->device_lock); 6162 6163 if (old_groups) 6164 kfree(old_groups[0].workers); 6165 kfree(old_groups); 6166 } 6167 mddev_resume(mddev); 6168 } 6169 mddev_unlock(mddev); 6170 6171 return err ?: len; 6172 } 6173 6174 static struct md_sysfs_entry 6175 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6176 raid5_show_group_thread_cnt, 6177 raid5_store_group_thread_cnt); 6178 6179 static struct attribute *raid5_attrs[] = { 6180 &raid5_stripecache_size.attr, 6181 &raid5_stripecache_active.attr, 6182 &raid5_preread_bypass_threshold.attr, 6183 &raid5_group_thread_cnt.attr, 6184 &raid5_skip_copy.attr, 6185 &raid5_rmw_level.attr, 6186 NULL, 6187 }; 6188 static struct attribute_group raid5_attrs_group = { 6189 .name = NULL, 6190 .attrs = raid5_attrs, 6191 }; 6192 6193 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6194 int *group_cnt, 6195 int *worker_cnt_per_group, 6196 struct r5worker_group **worker_groups) 6197 { 6198 int i, j, k; 6199 ssize_t size; 6200 struct r5worker *workers; 6201 6202 *worker_cnt_per_group = cnt; 6203 if (cnt == 0) { 6204 *group_cnt = 0; 6205 *worker_groups = NULL; 6206 return 0; 6207 } 6208 *group_cnt = num_possible_nodes(); 6209 size = sizeof(struct r5worker) * cnt; 6210 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6211 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6212 *group_cnt, GFP_NOIO); 6213 if (!*worker_groups || !workers) { 6214 kfree(workers); 6215 kfree(*worker_groups); 6216 return -ENOMEM; 6217 } 6218 6219 for (i = 0; i < *group_cnt; i++) { 6220 struct r5worker_group *group; 6221 6222 group = &(*worker_groups)[i]; 6223 INIT_LIST_HEAD(&group->handle_list); 6224 group->conf = conf; 6225 group->workers = workers + i * cnt; 6226 6227 for (j = 0; j < cnt; j++) { 6228 struct r5worker *worker = group->workers + j; 6229 worker->group = group; 6230 INIT_WORK(&worker->work, raid5_do_work); 6231 6232 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6233 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6234 } 6235 } 6236 6237 return 0; 6238 } 6239 6240 static void free_thread_groups(struct r5conf *conf) 6241 { 6242 if (conf->worker_groups) 6243 kfree(conf->worker_groups[0].workers); 6244 kfree(conf->worker_groups); 6245 conf->worker_groups = NULL; 6246 } 6247 6248 static sector_t 6249 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6250 { 6251 struct r5conf *conf = mddev->private; 6252 6253 if (!sectors) 6254 sectors = mddev->dev_sectors; 6255 if (!raid_disks) 6256 /* size is defined by the smallest of previous and new size */ 6257 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6258 6259 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 6260 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 6261 return sectors * (raid_disks - conf->max_degraded); 6262 } 6263 6264 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6265 { 6266 safe_put_page(percpu->spare_page); 6267 if (percpu->scribble) 6268 flex_array_free(percpu->scribble); 6269 percpu->spare_page = NULL; 6270 percpu->scribble = NULL; 6271 } 6272 6273 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6274 { 6275 if (conf->level == 6 && !percpu->spare_page) 6276 percpu->spare_page = alloc_page(GFP_KERNEL); 6277 if (!percpu->scribble) 6278 percpu->scribble = 
scribble_alloc(max(conf->raid_disks, 6279 conf->previous_raid_disks), 6280 max(conf->chunk_sectors, 6281 conf->prev_chunk_sectors) 6282 / STRIPE_SECTORS, 6283 GFP_KERNEL); 6284 6285 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6286 free_scratch_buffer(conf, percpu); 6287 return -ENOMEM; 6288 } 6289 6290 return 0; 6291 } 6292 6293 static void raid5_free_percpu(struct r5conf *conf) 6294 { 6295 unsigned long cpu; 6296 6297 if (!conf->percpu) 6298 return; 6299 6300 #ifdef CONFIG_HOTPLUG_CPU 6301 unregister_cpu_notifier(&conf->cpu_notify); 6302 #endif 6303 6304 get_online_cpus(); 6305 for_each_possible_cpu(cpu) 6306 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6307 put_online_cpus(); 6308 6309 free_percpu(conf->percpu); 6310 } 6311 6312 static void free_conf(struct r5conf *conf) 6313 { 6314 if (conf->shrinker.seeks) 6315 unregister_shrinker(&conf->shrinker); 6316 free_thread_groups(conf); 6317 shrink_stripes(conf); 6318 raid5_free_percpu(conf); 6319 kfree(conf->disks); 6320 kfree(conf->stripe_hashtbl); 6321 kfree(conf); 6322 } 6323 6324 #ifdef CONFIG_HOTPLUG_CPU 6325 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 6326 void *hcpu) 6327 { 6328 struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); 6329 long cpu = (long)hcpu; 6330 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6331 6332 switch (action) { 6333 case CPU_UP_PREPARE: 6334 case CPU_UP_PREPARE_FROZEN: 6335 if (alloc_scratch_buffer(conf, percpu)) { 6336 pr_err("%s: failed memory allocation for cpu%ld\n", 6337 __func__, cpu); 6338 return notifier_from_errno(-ENOMEM); 6339 } 6340 break; 6341 case CPU_DEAD: 6342 case CPU_DEAD_FROZEN: 6343 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6344 break; 6345 default: 6346 break; 6347 } 6348 return NOTIFY_OK; 6349 } 6350 #endif 6351 6352 static int raid5_alloc_percpu(struct r5conf *conf) 6353 { 6354 unsigned long cpu; 6355 int err = 0; 6356 6357 conf->percpu = alloc_percpu(struct raid5_percpu); 6358 if (!conf->percpu) 6359 return -ENOMEM; 6360 6361 #ifdef CONFIG_HOTPLUG_CPU 6362 conf->cpu_notify.notifier_call = raid456_cpu_notify; 6363 conf->cpu_notify.priority = 0; 6364 err = register_cpu_notifier(&conf->cpu_notify); 6365 if (err) 6366 return err; 6367 #endif 6368 6369 get_online_cpus(); 6370 for_each_present_cpu(cpu) { 6371 err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6372 if (err) { 6373 pr_err("%s: failed memory allocation for cpu%ld\n", 6374 __func__, cpu); 6375 break; 6376 } 6377 } 6378 put_online_cpus(); 6379 6380 return err; 6381 } 6382 6383 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6384 struct shrink_control *sc) 6385 { 6386 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6387 unsigned long ret = SHRINK_STOP; 6388 6389 if (mutex_trylock(&conf->cache_size_mutex)) { 6390 ret= 0; 6391 while (ret < sc->nr_to_scan && 6392 conf->max_nr_stripes > conf->min_nr_stripes) { 6393 if (drop_one_stripe(conf) == 0) { 6394 ret = SHRINK_STOP; 6395 break; 6396 } 6397 ret++; 6398 } 6399 mutex_unlock(&conf->cache_size_mutex); 6400 } 6401 return ret; 6402 } 6403 6404 static unsigned long raid5_cache_count(struct shrinker *shrink, 6405 struct shrink_control *sc) 6406 { 6407 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6408 6409 if (conf->max_nr_stripes < conf->min_nr_stripes) 6410 /* unlikely, but not impossible */ 6411 return 0; 6412 return conf->max_nr_stripes - conf->min_nr_stripes; 6413 } 6414 6415 static struct 
r5conf *setup_conf(struct mddev *mddev) 6416 { 6417 struct r5conf *conf; 6418 int raid_disk, memory, max_disks; 6419 struct md_rdev *rdev; 6420 struct disk_info *disk; 6421 char pers_name[6]; 6422 int i; 6423 int group_cnt, worker_cnt_per_group; 6424 struct r5worker_group *new_group; 6425 6426 if (mddev->new_level != 5 6427 && mddev->new_level != 4 6428 && mddev->new_level != 6) { 6429 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6430 mdname(mddev), mddev->new_level); 6431 return ERR_PTR(-EIO); 6432 } 6433 if ((mddev->new_level == 5 6434 && !algorithm_valid_raid5(mddev->new_layout)) || 6435 (mddev->new_level == 6 6436 && !algorithm_valid_raid6(mddev->new_layout))) { 6437 printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 6438 mdname(mddev), mddev->new_layout); 6439 return ERR_PTR(-EIO); 6440 } 6441 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6442 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6443 mdname(mddev), mddev->raid_disks); 6444 return ERR_PTR(-EINVAL); 6445 } 6446 6447 if (!mddev->new_chunk_sectors || 6448 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6449 !is_power_of_2(mddev->new_chunk_sectors)) { 6450 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 6451 mdname(mddev), mddev->new_chunk_sectors << 9); 6452 return ERR_PTR(-EINVAL); 6453 } 6454 6455 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6456 if (conf == NULL) 6457 goto abort; 6458 /* Don't enable multi-threading by default*/ 6459 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6460 &new_group)) { 6461 conf->group_cnt = group_cnt; 6462 conf->worker_cnt_per_group = worker_cnt_per_group; 6463 conf->worker_groups = new_group; 6464 } else 6465 goto abort; 6466 spin_lock_init(&conf->device_lock); 6467 seqcount_init(&conf->gen_lock); 6468 mutex_init(&conf->cache_size_mutex); 6469 init_waitqueue_head(&conf->wait_for_quiescent); 6470 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { 6471 init_waitqueue_head(&conf->wait_for_stripe[i]); 6472 } 6473 init_waitqueue_head(&conf->wait_for_overlap); 6474 INIT_LIST_HEAD(&conf->handle_list); 6475 INIT_LIST_HEAD(&conf->hold_list); 6476 INIT_LIST_HEAD(&conf->delayed_list); 6477 INIT_LIST_HEAD(&conf->bitmap_list); 6478 init_llist_head(&conf->released_stripes); 6479 atomic_set(&conf->active_stripes, 0); 6480 atomic_set(&conf->preread_active_stripes, 0); 6481 atomic_set(&conf->active_aligned_reads, 0); 6482 conf->bypass_threshold = BYPASS_THRESHOLD; 6483 conf->recovery_disabled = mddev->recovery_disabled - 1; 6484 6485 conf->raid_disks = mddev->raid_disks; 6486 if (mddev->reshape_position == MaxSector) 6487 conf->previous_raid_disks = mddev->raid_disks; 6488 else 6489 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6490 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6491 6492 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6493 GFP_KERNEL); 6494 if (!conf->disks) 6495 goto abort; 6496 6497 conf->mddev = mddev; 6498 6499 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6500 goto abort; 6501 6502 /* We init hash_locks[0] separately to that it can be used 6503 * as the reference lock in the spin_lock_nest_lock() call 6504 * in lock_all_device_hash_locks_irq in order to convince 6505 * lockdep that we know what we are doing. 
6506 */ 6507 spin_lock_init(conf->hash_locks); 6508 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6509 spin_lock_init(conf->hash_locks + i); 6510 6511 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6512 INIT_LIST_HEAD(conf->inactive_list + i); 6513 6514 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6515 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6516 6517 conf->level = mddev->new_level; 6518 conf->chunk_sectors = mddev->new_chunk_sectors; 6519 if (raid5_alloc_percpu(conf) != 0) 6520 goto abort; 6521 6522 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6523 6524 rdev_for_each(rdev, mddev) { 6525 raid_disk = rdev->raid_disk; 6526 if (raid_disk >= max_disks 6527 || raid_disk < 0) 6528 continue; 6529 disk = conf->disks + raid_disk; 6530 6531 if (test_bit(Replacement, &rdev->flags)) { 6532 if (disk->replacement) 6533 goto abort; 6534 disk->replacement = rdev; 6535 } else { 6536 if (disk->rdev) 6537 goto abort; 6538 disk->rdev = rdev; 6539 } 6540 6541 if (test_bit(In_sync, &rdev->flags)) { 6542 char b[BDEVNAME_SIZE]; 6543 printk(KERN_INFO "md/raid:%s: device %s operational as raid" 6544 " disk %d\n", 6545 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 6546 } else if (rdev->saved_raid_disk != raid_disk) 6547 /* Cannot rely on bitmap to complete recovery */ 6548 conf->fullsync = 1; 6549 } 6550 6551 conf->level = mddev->new_level; 6552 if (conf->level == 6) { 6553 conf->max_degraded = 2; 6554 if (raid6_call.xor_syndrome) 6555 conf->rmw_level = PARITY_ENABLE_RMW; 6556 else 6557 conf->rmw_level = PARITY_DISABLE_RMW; 6558 } else { 6559 conf->max_degraded = 1; 6560 conf->rmw_level = PARITY_ENABLE_RMW; 6561 } 6562 conf->algorithm = mddev->new_layout; 6563 conf->reshape_progress = mddev->reshape_position; 6564 if (conf->reshape_progress != MaxSector) { 6565 conf->prev_chunk_sectors = mddev->chunk_sectors; 6566 conf->prev_algo = mddev->layout; 6567 } 6568 6569 conf->min_nr_stripes = NR_STRIPES; 6570 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 6571 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 6572 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 6573 if (grow_stripes(conf, conf->min_nr_stripes)) { 6574 printk(KERN_ERR 6575 "md/raid:%s: couldn't allocate %dkB for buffers\n", 6576 mdname(mddev), memory); 6577 goto abort; 6578 } else 6579 printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 6580 mdname(mddev), memory); 6581 /* 6582 * Losing a stripe head costs more than the time to refill it, 6583 * it reduces the queue depth and so can hurt throughput. 6584 * So set it rather large, scaled by number of devices. 
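 * (Concretely, the code below sets shrinker.seeks to
 * DEFAULT_SEEKS * raid_disks * 4, which is intended to make memory
 * reclaim treat cached stripe heads as relatively expensive to
 * recreate and so shrink the cache only reluctantly.)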
 */
        conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
        conf->shrinker.scan_objects = raid5_cache_scan;
        conf->shrinker.count_objects = raid5_cache_count;
        conf->shrinker.batch = 128;
        conf->shrinker.flags = 0;
        register_shrinker(&conf->shrinker);

        sprintf(pers_name, "raid%d", mddev->new_level);
        conf->thread = md_register_thread(raid5d, mddev, pers_name);
        if (!conf->thread) {
                printk(KERN_ERR
                       "md/raid:%s: couldn't allocate thread.\n",
                       mdname(mddev));
                goto abort;
        }

        return conf;

abort:
        if (conf) {
                free_conf(conf);
                return ERR_PTR(-EIO);
        } else
                return ERR_PTR(-ENOMEM);
}

static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
{
        switch (algo) {
        case ALGORITHM_PARITY_0:
                if (raid_disk < max_degraded)
                        return 1;
                break;
        case ALGORITHM_PARITY_N:
                if (raid_disk >= raid_disks - max_degraded)
                        return 1;
                break;
        case ALGORITHM_PARITY_0_6:
                if (raid_disk == 0 ||
                    raid_disk == raid_disks - 1)
                        return 1;
                break;
        case ALGORITHM_LEFT_ASYMMETRIC_6:
        case ALGORITHM_RIGHT_ASYMMETRIC_6:
        case ALGORITHM_LEFT_SYMMETRIC_6:
        case ALGORITHM_RIGHT_SYMMETRIC_6:
                if (raid_disk == raid_disks - 1)
                        return 1;
        }
        return 0;
}

static int run(struct mddev *mddev)
{
        struct r5conf *conf;
        int working_disks = 0;
        int dirty_parity_disks = 0;
        struct md_rdev *rdev;
        sector_t reshape_offset = 0;
        int i;
        long long min_offset_diff = 0;
        int first = 1;

        if (mddev->recovery_cp != MaxSector)
                printk(KERN_NOTICE "md/raid:%s: not clean"
                       " -- starting background reconstruction\n",
                       mdname(mddev));

        rdev_for_each(rdev, mddev) {
                long long diff;
                if (rdev->raid_disk < 0)
                        continue;
                diff = (rdev->new_data_offset - rdev->data_offset);
                if (first) {
                        min_offset_diff = diff;
                        first = 0;
                } else if (mddev->reshape_backwards &&
                           diff < min_offset_diff)
                        min_offset_diff = diff;
                else if (!mddev->reshape_backwards &&
                         diff > min_offset_diff)
                        min_offset_diff = diff;
        }

        if (mddev->reshape_position != MaxSector) {
                /* Check that we can continue the reshape.
                 * Difficulties arise if the stripe we would write to
                 * next is at or after the stripe we would read from next.
                 * For a reshape that changes the number of devices, this
                 * is only possible for a very short time, and mdadm makes
                 * sure that time appears to have passed before assembling
                 * the array.  So we fail if that time hasn't passed.
                 * For a reshape that keeps the number of devices the same,
                 * mdadm must be monitoring the reshape and keeping the
                 * critical areas read-only and backed up.  It will start
                 * the array in read-only mode, so we check for that.
                 */
                sector_t here_new, here_old;
                int old_disks;
                int max_degraded = (mddev->level == 6 ? 2 : 1);

                if (mddev->new_level != mddev->level) {
                        printk(KERN_ERR "md/raid:%s: unsupported reshape "
                               "required - aborting.\n",
                               mdname(mddev));
                        return -EINVAL;
                }
                old_disks = mddev->raid_disks - mddev->delta_disks;
                /* reshape_position must be on a new-stripe boundary, and one
                 * further up in new geometry must map after here in old
                 * geometry.
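                 * (The divisions below use the full data-stripe width in
                 * sectors: new_chunk_sectors * (raid_disks - max_degraded)
                 * for the new layout and chunk_sectors * (old_disks -
                 * max_degraded) for the old one.  With a hypothetical 128k
                 * chunk (256 sectors) and 4 data disks that width is 1024
                 * sectors, so reshape_position must be a multiple of 1024.)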
6697 */ 6698 here_new = mddev->reshape_position; 6699 if (sector_div(here_new, mddev->new_chunk_sectors * 6700 (mddev->raid_disks - max_degraded))) { 6701 printk(KERN_ERR "md/raid:%s: reshape_position not " 6702 "on a stripe boundary\n", mdname(mddev)); 6703 return -EINVAL; 6704 } 6705 reshape_offset = here_new * mddev->new_chunk_sectors; 6706 /* here_new is the stripe we will write to */ 6707 here_old = mddev->reshape_position; 6708 sector_div(here_old, mddev->chunk_sectors * 6709 (old_disks-max_degraded)); 6710 /* here_old is the first stripe that we might need to read 6711 * from */ 6712 if (mddev->delta_disks == 0) { 6713 if ((here_new * mddev->new_chunk_sectors != 6714 here_old * mddev->chunk_sectors)) { 6715 printk(KERN_ERR "md/raid:%s: reshape position is" 6716 " confused - aborting\n", mdname(mddev)); 6717 return -EINVAL; 6718 } 6719 /* We cannot be sure it is safe to start an in-place 6720 * reshape. It is only safe if user-space is monitoring 6721 * and taking constant backups. 6722 * mdadm always starts a situation like this in 6723 * readonly mode so it can take control before 6724 * allowing any writes. So just check for that. 6725 */ 6726 if (abs(min_offset_diff) >= mddev->chunk_sectors && 6727 abs(min_offset_diff) >= mddev->new_chunk_sectors) 6728 /* not really in-place - so OK */; 6729 else if (mddev->ro == 0) { 6730 printk(KERN_ERR "md/raid:%s: in-place reshape " 6731 "must be started in read-only mode " 6732 "- aborting\n", 6733 mdname(mddev)); 6734 return -EINVAL; 6735 } 6736 } else if (mddev->reshape_backwards 6737 ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= 6738 here_old * mddev->chunk_sectors) 6739 : (here_new * mddev->new_chunk_sectors >= 6740 here_old * mddev->chunk_sectors + (-min_offset_diff))) { 6741 /* Reading from the same stripe as writing to - bad */ 6742 printk(KERN_ERR "md/raid:%s: reshape_position too early for " 6743 "auto-recovery - aborting.\n", 6744 mdname(mddev)); 6745 return -EINVAL; 6746 } 6747 printk(KERN_INFO "md/raid:%s: reshape will continue\n", 6748 mdname(mddev)); 6749 /* OK, we should be able to continue; */ 6750 } else { 6751 BUG_ON(mddev->level != mddev->new_level); 6752 BUG_ON(mddev->layout != mddev->new_layout); 6753 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 6754 BUG_ON(mddev->delta_disks != 0); 6755 } 6756 6757 if (mddev->private == NULL) 6758 conf = setup_conf(mddev); 6759 else 6760 conf = mddev->private; 6761 6762 if (IS_ERR(conf)) 6763 return PTR_ERR(conf); 6764 6765 conf->min_offset_diff = min_offset_diff; 6766 mddev->thread = conf->thread; 6767 conf->thread = NULL; 6768 mddev->private = conf; 6769 6770 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 6771 i++) { 6772 rdev = conf->disks[i].rdev; 6773 if (!rdev && conf->disks[i].replacement) { 6774 /* The replacement is all we have yet */ 6775 rdev = conf->disks[i].replacement; 6776 conf->disks[i].replacement = NULL; 6777 clear_bit(Replacement, &rdev->flags); 6778 conf->disks[i].rdev = rdev; 6779 } 6780 if (!rdev) 6781 continue; 6782 if (conf->disks[i].replacement && 6783 conf->reshape_progress != MaxSector) { 6784 /* replacements and reshape simply do not mix. */ 6785 printk(KERN_ERR "md: cannot handle concurrent " 6786 "replacement and reshape.\n"); 6787 goto abort; 6788 } 6789 if (test_bit(In_sync, &rdev->flags)) { 6790 working_disks++; 6791 continue; 6792 } 6793 /* This disc is not fully in-sync. 
However if it
                 * just stored parity (beyond the recovery_offset),
                 * then we don't need to be concerned about the
                 * array being dirty.
                 * When reshape goes 'backwards', we never have
                 * partially completed devices, so we only need
                 * to worry about reshape going forwards.
                 */
                /* Hack because v0.91 doesn't store recovery_offset properly. */
                if (mddev->major_version == 0 &&
                    mddev->minor_version > 90)
                        rdev->recovery_offset = reshape_offset;

                if (rdev->recovery_offset < reshape_offset) {
                        /* We need to check old and new layout */
                        if (!only_parity(rdev->raid_disk,
                                         conf->algorithm,
                                         conf->raid_disks,
                                         conf->max_degraded))
                                continue;
                }
                if (!only_parity(rdev->raid_disk,
                                 conf->prev_algo,
                                 conf->previous_raid_disks,
                                 conf->max_degraded))
                        continue;
                dirty_parity_disks++;
        }

        /*
         * 0 for a fully functional array, 1 or 2 for a degraded array.
         */
        mddev->degraded = calc_degraded(conf);

        if (has_failed(conf)) {
                printk(KERN_ERR "md/raid:%s: not enough operational devices"
                       " (%d/%d failed)\n",
                       mdname(mddev), mddev->degraded, conf->raid_disks);
                goto abort;
        }

        /* device size must be a multiple of chunk size */
        mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
        mddev->resync_max_sectors = mddev->dev_sectors;

        if (mddev->degraded > dirty_parity_disks &&
            mddev->recovery_cp != MaxSector) {
                if (mddev->ok_start_degraded)
                        printk(KERN_WARNING
                               "md/raid:%s: starting dirty degraded array"
                               " - data corruption possible.\n",
                               mdname(mddev));
                else {
                        printk(KERN_ERR
                               "md/raid:%s: cannot start dirty degraded array.\n",
                               mdname(mddev));
                        goto abort;
                }
        }

        if (mddev->degraded == 0)
                printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
                       " devices, algorithm %d\n", mdname(mddev), conf->level,
                       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
                       mddev->new_layout);
        else
                printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
                       " out of %d devices, algorithm %d\n",
                       mdname(mddev), conf->level,
                       mddev->raid_disks - mddev->degraded,
                       mddev->raid_disks, mddev->new_layout);

        print_raid5_conf(conf);

        if (conf->reshape_progress != MaxSector) {
                conf->reshape_safe = conf->reshape_progress;
                atomic_set(&conf->reshape_stripes, 0);
                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
                clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                                                        "reshape");
        }

        /* Ok, everything is just fine now */
        if (mddev->to_remove == &raid5_attrs_group)
                mddev->to_remove = NULL;
        else if (mddev->kobj.sd &&
            sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
                printk(KERN_WARNING
                       "raid5: failed to create sysfs attributes for %s\n",
                       mdname(mddev));
        md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));

        if (mddev->queue) {
                int chunk_size;
                bool discard_supported = true;
                /* read-ahead size must cover two whole stripes, which
                 * is 2 * (datadisks) * chunksize, where 'datadisks' is
                 * the number of data (non-parity) devices
                 */
                int data_disks = conf->previous_raid_disks - conf->max_degraded;
                int stripe = data_disks *
                        ((mddev->chunk_sectors << 9) / PAGE_SIZE);
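                /* Example with made-up numbers: a 512k chunk, 4 data disks
                 * and 4k pages give stripe = 4 * (524288 / 4096) = 512
                 * pages, so the check below raises read-ahead to at least
                 * 1024 pages (4MiB).
                 */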
6898 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 6899 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 6900 6901 chunk_size = mddev->chunk_sectors << 9; 6902 blk_queue_io_min(mddev->queue, chunk_size); 6903 blk_queue_io_opt(mddev->queue, chunk_size * 6904 (conf->raid_disks - conf->max_degraded)); 6905 mddev->queue->limits.raid_partial_stripes_expensive = 1; 6906 /* 6907 * We can only discard a whole stripe. It doesn't make sense to 6908 * discard data disk but write parity disk 6909 */ 6910 stripe = stripe * PAGE_SIZE; 6911 /* Round up to power of 2, as discard handling 6912 * currently assumes that */ 6913 while ((stripe-1) & stripe) 6914 stripe = (stripe | (stripe-1)) + 1; 6915 mddev->queue->limits.discard_alignment = stripe; 6916 mddev->queue->limits.discard_granularity = stripe; 6917 /* 6918 * unaligned part of discard request will be ignored, so can't 6919 * guarantee discard_zeroes_data 6920 */ 6921 mddev->queue->limits.discard_zeroes_data = 0; 6922 6923 blk_queue_max_write_same_sectors(mddev->queue, 0); 6924 6925 rdev_for_each(rdev, mddev) { 6926 disk_stack_limits(mddev->gendisk, rdev->bdev, 6927 rdev->data_offset << 9); 6928 disk_stack_limits(mddev->gendisk, rdev->bdev, 6929 rdev->new_data_offset << 9); 6930 /* 6931 * discard_zeroes_data is required, otherwise data 6932 * could be lost. Consider a scenario: discard a stripe 6933 * (the stripe could be inconsistent if 6934 * discard_zeroes_data is 0); write one disk of the 6935 * stripe (the stripe could be inconsistent again 6936 * depending on which disks are used to calculate 6937 * parity); the disk is broken; The stripe data of this 6938 * disk is lost. 6939 */ 6940 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 6941 !bdev_get_queue(rdev->bdev)-> 6942 limits.discard_zeroes_data) 6943 discard_supported = false; 6944 /* Unfortunately, discard_zeroes_data is not currently 6945 * a guarantee - just a hint. So we only allow DISCARD 6946 * if the sysadmin has confirmed that only safe devices 6947 * are in use by setting a module parameter. 
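                         * (That parameter is raid456.devices_handle_discard_safely,
                         * exactly as the messages below spell out; unless it is
                         * set, discard_supported is forced off here.)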
6948 */ 6949 if (!devices_handle_discard_safely) { 6950 if (discard_supported) { 6951 pr_info("md/raid456: discard support disabled due to uncertainty.\n"); 6952 pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); 6953 } 6954 discard_supported = false; 6955 } 6956 } 6957 6958 if (discard_supported && 6959 mddev->queue->limits.max_discard_sectors >= stripe && 6960 mddev->queue->limits.discard_granularity >= stripe) 6961 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 6962 mddev->queue); 6963 else 6964 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 6965 mddev->queue); 6966 } 6967 6968 return 0; 6969 abort: 6970 md_unregister_thread(&mddev->thread); 6971 print_raid5_conf(conf); 6972 free_conf(conf); 6973 mddev->private = NULL; 6974 printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 6975 return -EIO; 6976 } 6977 6978 static void raid5_free(struct mddev *mddev, void *priv) 6979 { 6980 struct r5conf *conf = priv; 6981 6982 free_conf(conf); 6983 mddev->to_remove = &raid5_attrs_group; 6984 } 6985 6986 static void status(struct seq_file *seq, struct mddev *mddev) 6987 { 6988 struct r5conf *conf = mddev->private; 6989 int i; 6990 6991 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 6992 mddev->chunk_sectors / 2, mddev->layout); 6993 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 6994 for (i = 0; i < conf->raid_disks; i++) 6995 seq_printf (seq, "%s", 6996 conf->disks[i].rdev && 6997 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 6998 seq_printf (seq, "]"); 6999 } 7000 7001 static void print_raid5_conf (struct r5conf *conf) 7002 { 7003 int i; 7004 struct disk_info *tmp; 7005 7006 printk(KERN_DEBUG "RAID conf printout:\n"); 7007 if (!conf) { 7008 printk("(conf==NULL)\n"); 7009 return; 7010 } 7011 printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 7012 conf->raid_disks, 7013 conf->raid_disks - conf->mddev->degraded); 7014 7015 for (i = 0; i < conf->raid_disks; i++) { 7016 char b[BDEVNAME_SIZE]; 7017 tmp = conf->disks + i; 7018 if (tmp->rdev) 7019 printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 7020 i, !test_bit(Faulty, &tmp->rdev->flags), 7021 bdevname(tmp->rdev->bdev, b)); 7022 } 7023 } 7024 7025 static int raid5_spare_active(struct mddev *mddev) 7026 { 7027 int i; 7028 struct r5conf *conf = mddev->private; 7029 struct disk_info *tmp; 7030 int count = 0; 7031 unsigned long flags; 7032 7033 for (i = 0; i < conf->raid_disks; i++) { 7034 tmp = conf->disks + i; 7035 if (tmp->replacement 7036 && tmp->replacement->recovery_offset == MaxSector 7037 && !test_bit(Faulty, &tmp->replacement->flags) 7038 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7039 /* Replacement has just become active. */ 7040 if (!tmp->rdev 7041 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7042 count++; 7043 if (tmp->rdev) { 7044 /* Replaced device not technically faulty, 7045 * but we need to be sure it gets removed 7046 * and never re-added. 
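                         * Setting Faulty on it below, and poking its sysfs
                         * state, arranges exactly that.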
7047 */ 7048 set_bit(Faulty, &tmp->rdev->flags); 7049 sysfs_notify_dirent_safe( 7050 tmp->rdev->sysfs_state); 7051 } 7052 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7053 } else if (tmp->rdev 7054 && tmp->rdev->recovery_offset == MaxSector 7055 && !test_bit(Faulty, &tmp->rdev->flags) 7056 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7057 count++; 7058 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7059 } 7060 } 7061 spin_lock_irqsave(&conf->device_lock, flags); 7062 mddev->degraded = calc_degraded(conf); 7063 spin_unlock_irqrestore(&conf->device_lock, flags); 7064 print_raid5_conf(conf); 7065 return count; 7066 } 7067 7068 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7069 { 7070 struct r5conf *conf = mddev->private; 7071 int err = 0; 7072 int number = rdev->raid_disk; 7073 struct md_rdev **rdevp; 7074 struct disk_info *p = conf->disks + number; 7075 7076 print_raid5_conf(conf); 7077 if (rdev == p->rdev) 7078 rdevp = &p->rdev; 7079 else if (rdev == p->replacement) 7080 rdevp = &p->replacement; 7081 else 7082 return 0; 7083 7084 if (number >= conf->raid_disks && 7085 conf->reshape_progress == MaxSector) 7086 clear_bit(In_sync, &rdev->flags); 7087 7088 if (test_bit(In_sync, &rdev->flags) || 7089 atomic_read(&rdev->nr_pending)) { 7090 err = -EBUSY; 7091 goto abort; 7092 } 7093 /* Only remove non-faulty devices if recovery 7094 * isn't possible. 7095 */ 7096 if (!test_bit(Faulty, &rdev->flags) && 7097 mddev->recovery_disabled != conf->recovery_disabled && 7098 !has_failed(conf) && 7099 (!p->replacement || p->replacement == rdev) && 7100 number < conf->raid_disks) { 7101 err = -EBUSY; 7102 goto abort; 7103 } 7104 *rdevp = NULL; 7105 synchronize_rcu(); 7106 if (atomic_read(&rdev->nr_pending)) { 7107 /* lost the race, try later */ 7108 err = -EBUSY; 7109 *rdevp = rdev; 7110 } else if (p->replacement) { 7111 /* We must have just cleared 'rdev' */ 7112 p->rdev = p->replacement; 7113 clear_bit(Replacement, &p->replacement->flags); 7114 smp_mb(); /* Make sure other CPUs may see both as identical 7115 * but will never see neither - if they are careful 7116 */ 7117 p->replacement = NULL; 7118 clear_bit(WantReplacement, &rdev->flags); 7119 } else 7120 /* We might have just removed the Replacement as faulty- 7121 * clear the bit just in case 7122 */ 7123 clear_bit(WantReplacement, &rdev->flags); 7124 abort: 7125 7126 print_raid5_conf(conf); 7127 return err; 7128 } 7129 7130 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7131 { 7132 struct r5conf *conf = mddev->private; 7133 int err = -EEXIST; 7134 int disk; 7135 struct disk_info *p; 7136 int first = 0; 7137 int last = conf->raid_disks - 1; 7138 7139 if (mddev->recovery_disabled == conf->recovery_disabled) 7140 return -EBUSY; 7141 7142 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7143 /* no point adding a device */ 7144 return -EINVAL; 7145 7146 if (rdev->raid_disk >= 0) 7147 first = last = rdev->raid_disk; 7148 7149 /* 7150 * find the disk ... but prefer rdev->saved_raid_disk 7151 * if possible. 
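         * (Re-using the saved slot matters: placing the device anywhere
         * else sets conf->fullsync = 1 further down and forces a full
         * resync instead of letting the bitmap limit the work.)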
7152 */ 7153 if (rdev->saved_raid_disk >= 0 && 7154 rdev->saved_raid_disk >= first && 7155 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7156 first = rdev->saved_raid_disk; 7157 7158 for (disk = first; disk <= last; disk++) { 7159 p = conf->disks + disk; 7160 if (p->rdev == NULL) { 7161 clear_bit(In_sync, &rdev->flags); 7162 rdev->raid_disk = disk; 7163 err = 0; 7164 if (rdev->saved_raid_disk != disk) 7165 conf->fullsync = 1; 7166 rcu_assign_pointer(p->rdev, rdev); 7167 goto out; 7168 } 7169 } 7170 for (disk = first; disk <= last; disk++) { 7171 p = conf->disks + disk; 7172 if (test_bit(WantReplacement, &p->rdev->flags) && 7173 p->replacement == NULL) { 7174 clear_bit(In_sync, &rdev->flags); 7175 set_bit(Replacement, &rdev->flags); 7176 rdev->raid_disk = disk; 7177 err = 0; 7178 conf->fullsync = 1; 7179 rcu_assign_pointer(p->replacement, rdev); 7180 break; 7181 } 7182 } 7183 out: 7184 print_raid5_conf(conf); 7185 return err; 7186 } 7187 7188 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7189 { 7190 /* no resync is happening, and there is enough space 7191 * on all devices, so we can resize. 7192 * We need to make sure resync covers any new space. 7193 * If the array is shrinking we should possibly wait until 7194 * any io in the removed space completes, but it hardly seems 7195 * worth it. 7196 */ 7197 sector_t newsize; 7198 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 7199 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7200 if (mddev->external_size && 7201 mddev->array_sectors > newsize) 7202 return -EINVAL; 7203 if (mddev->bitmap) { 7204 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 7205 if (ret) 7206 return ret; 7207 } 7208 md_set_array_sectors(mddev, newsize); 7209 set_capacity(mddev->gendisk, mddev->array_sectors); 7210 revalidate_disk(mddev->gendisk); 7211 if (sectors > mddev->dev_sectors && 7212 mddev->recovery_cp > mddev->dev_sectors) { 7213 mddev->recovery_cp = mddev->dev_sectors; 7214 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7215 } 7216 mddev->dev_sectors = sectors; 7217 mddev->resync_max_sectors = sectors; 7218 return 0; 7219 } 7220 7221 static int check_stripe_cache(struct mddev *mddev) 7222 { 7223 /* Can only proceed if there are plenty of stripe_heads. 7224 * We need a minimum of one full stripe,, and for sensible progress 7225 * it is best to have about 4 times that. 7226 * If we require 4 times, then the default 256 4K stripe_heads will 7227 * allow for chunk sizes up to 256K, which is probably OK. 7228 * If the chunk size is greater, user-space should request more 7229 * stripe_heads first. 7230 */ 7231 struct r5conf *conf = mddev->private; 7232 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7233 > conf->min_nr_stripes || 7234 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7235 > conf->min_nr_stripes) { 7236 printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7237 mdname(mddev), 7238 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7239 / STRIPE_SIZE)*4); 7240 return 0; 7241 } 7242 return 1; 7243 } 7244 7245 static int check_reshape(struct mddev *mddev) 7246 { 7247 struct r5conf *conf = mddev->private; 7248 7249 if (mddev->delta_disks == 0 && 7250 mddev->new_layout == mddev->layout && 7251 mddev->new_chunk_sectors == mddev->chunk_sectors) 7252 return 0; /* nothing to do */ 7253 if (has_failed(conf)) 7254 return -EINVAL; 7255 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7256 /* We might be able to shrink, but the devices must 7257 * be made bigger first. 
7258 * For raid6, 4 is the minimum size. 7259 * Otherwise 2 is the minimum 7260 */ 7261 int min = 2; 7262 if (mddev->level == 6) 7263 min = 4; 7264 if (mddev->raid_disks + mddev->delta_disks < min) 7265 return -EINVAL; 7266 } 7267 7268 if (!check_stripe_cache(mddev)) 7269 return -ENOSPC; 7270 7271 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7272 mddev->delta_disks > 0) 7273 if (resize_chunks(conf, 7274 conf->previous_raid_disks 7275 + max(0, mddev->delta_disks), 7276 max(mddev->new_chunk_sectors, 7277 mddev->chunk_sectors) 7278 ) < 0) 7279 return -ENOMEM; 7280 return resize_stripes(conf, (conf->previous_raid_disks 7281 + mddev->delta_disks)); 7282 } 7283 7284 static int raid5_start_reshape(struct mddev *mddev) 7285 { 7286 struct r5conf *conf = mddev->private; 7287 struct md_rdev *rdev; 7288 int spares = 0; 7289 unsigned long flags; 7290 7291 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7292 return -EBUSY; 7293 7294 if (!check_stripe_cache(mddev)) 7295 return -ENOSPC; 7296 7297 if (has_failed(conf)) 7298 return -EINVAL; 7299 7300 rdev_for_each(rdev, mddev) { 7301 if (!test_bit(In_sync, &rdev->flags) 7302 && !test_bit(Faulty, &rdev->flags)) 7303 spares++; 7304 } 7305 7306 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7307 /* Not enough devices even to make a degraded array 7308 * of that size 7309 */ 7310 return -EINVAL; 7311 7312 /* Refuse to reduce size of the array. Any reductions in 7313 * array size must be through explicit setting of array_size 7314 * attribute. 7315 */ 7316 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7317 < mddev->array_sectors) { 7318 printk(KERN_ERR "md/raid:%s: array size must be reduced " 7319 "before number of disks\n", mdname(mddev)); 7320 return -EINVAL; 7321 } 7322 7323 atomic_set(&conf->reshape_stripes, 0); 7324 spin_lock_irq(&conf->device_lock); 7325 write_seqcount_begin(&conf->gen_lock); 7326 conf->previous_raid_disks = conf->raid_disks; 7327 conf->raid_disks += mddev->delta_disks; 7328 conf->prev_chunk_sectors = conf->chunk_sectors; 7329 conf->chunk_sectors = mddev->new_chunk_sectors; 7330 conf->prev_algo = conf->algorithm; 7331 conf->algorithm = mddev->new_layout; 7332 conf->generation++; 7333 /* Code that selects data_offset needs to see the generation update 7334 * if reshape_progress has been set - so a memory barrier needed. 7335 */ 7336 smp_mb(); 7337 if (mddev->reshape_backwards) 7338 conf->reshape_progress = raid5_size(mddev, 0, 0); 7339 else 7340 conf->reshape_progress = 0; 7341 conf->reshape_safe = conf->reshape_progress; 7342 write_seqcount_end(&conf->gen_lock); 7343 spin_unlock_irq(&conf->device_lock); 7344 7345 /* Now make sure any requests that proceeded on the assumption 7346 * the reshape wasn't running - like Discard or Read - have 7347 * completed. 7348 */ 7349 mddev_suspend(mddev); 7350 mddev_resume(mddev); 7351 7352 /* Add some new drives, as many as will fit. 7353 * We know there are enough to make the newly sized array work. 7354 * Don't add devices if we are reducing the number of 7355 * devices in the array. This is because it is not possible 7356 * to correctly record the "partially reconstructed" state of 7357 * such devices during the reshape and confusion could result. 
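         * Hence the delta_disks >= 0 test below: spare devices are only
         * wired in when the device count is growing or staying the same.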
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		write_seqcount_begin(&conf->gen_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		mddev->new_chunk_sectors =
			conf->chunk_sectors = conf->prev_chunk_sectors;
		mddev->new_layout = conf->algorithm = conf->prev_algo;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->generation--;
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		write_seqcount_end(&conf->gen_lock);
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{
	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (number of data disks) * chunksize, where the number
		 * of data disks is raid_disks - max_degraded.
		 */
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
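 * When the array grew, that means publishing the new array size and
 * gendisk capacity; when it shrank, it means recalculating ->degraded and
 * clearing In_sync on the devices that are no longer part of the array.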
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch (state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		conf->quiesce = 2;
		wait_event_cmd(conf->wait_for_quiescent,
			       atomic_read(&conf->active_stripes) == 0 &&
			       atomic_read(&conf->active_aligned_reads) == 0,
			       unlock_all_device_hash_locks_irq(conf),
			       lock_all_device_hash_locks_irq(conf));
		conf->quiesce = 1;
		unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		lock_all_device_hash_locks_irq(conf);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_quiescent);
		wake_up(&conf->wait_for_overlap);
		unlock_all_device_hash_locks_irq(conf);
		break;
	}
}

static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
		       mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices?
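	 *
	 * The search below starts from the 64K default and keeps halving the
	 * chunk size until it evenly divides the array size; e.g. (assuming
	 * 4K pages) an array whose sector count is a multiple of 128 keeps
	 * the 64K chunk, while an odd sector count bottoms out at one sector
	 * and is rejected because that is smaller than STRIPE_SIZE.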
	 */

	chunksect = 64 * 2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect - 1)))
		chunksect >>= 1;

	if ((chunksect << 9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	return setup_conf(mddev);
}

static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}

static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately, as no restriping is needed.
	 * For larger arrays we record the new value - after validation -
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk - 1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk - 1))
			/* not a factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}

static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.free		= raid5_free,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
	.congested	= raid5_congested,
	.mergeable_bvec	= raid5_mergeable_bvec,
};

static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.free		= raid5_free,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
	.congested	= raid5_congested,
	.mergeable_bvec	= raid5_mergeable_bvec,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.free		= raid5_free,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
	.congested	= raid5_congested,
	.mergeable_bvec	= raid5_mergeable_bvec,
};

static int __init raid5_init(void)
{
	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");
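
/*
 * A rough worked example of the stripe-cache rule in check_stripe_cache(),
 * assuming 4K pages (so STRIPE_SIZE is 4K) and the default 256 stripe_heads:
 * a 512K chunk needs (512K / 4K) * 4 == 512 stripe_heads, so a reshape to or
 * from such a chunk size is refused with -ENOSPC until user-space enlarges
 * the cache (normally via the md stripe_cache_size sysfs attribute).
 */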