1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <linux/nodemask.h> 57 #include <linux/flex_array.h> 58 #include <linux/sched/signal.h> 59 60 #include <trace/events/block.h> 61 #include <linux/list_sort.h> 62 63 #include "md.h" 64 #include "raid5.h" 65 #include "raid0.h" 66 #include "bitmap.h" 67 #include "raid5-log.h" 68 69 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) 70 71 #define cpu_to_group(cpu) cpu_to_node(cpu) 72 #define ANY_GROUP NUMA_NO_NODE 73 74 static bool devices_handle_discard_safely = false; 75 module_param(devices_handle_discard_safely, bool, 0644); 76 MODULE_PARM_DESC(devices_handle_discard_safely, 77 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 78 static struct workqueue_struct *raid5_wq; 79 80 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 81 { 82 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 83 return &conf->stripe_hashtbl[hash]; 84 } 85 86 static inline int stripe_hash_locks_hash(sector_t sect) 87 { 88 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; 89 } 90 91 static inline void lock_device_hash_lock(struct r5conf *conf, int hash) 92 { 93 spin_lock_irq(conf->hash_locks + hash); 94 spin_lock(&conf->device_lock); 95 } 96 97 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) 98 { 99 spin_unlock(&conf->device_lock); 100 spin_unlock_irq(conf->hash_locks + hash); 101 } 102 103 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) 104 { 105 int i; 106 local_irq_disable(); 107 spin_lock(conf->hash_locks); 108 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 109 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); 110 spin_lock(&conf->device_lock); 111 } 112 113 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) 114 { 115 int i; 116 spin_unlock(&conf->device_lock); 117 for (i = NR_STRIPE_HASH_LOCKS; i; i--) 118 spin_unlock(conf->hash_locks + i - 1); 119 local_irq_enable(); 120 } 121 122 /* Find first data disk in a raid6 stripe */ 123 static inline int raid6_d0(struct stripe_head *sh) 124 { 125 if (sh->ddf_layout) 126 /* ddf always start from first device */ 127 return 0; 128 /* md starts just after Q block */ 129 if (sh->qd_idx == sh->disks - 1) 130 return 0; 131 else 132 return sh->qd_idx + 1; 133 } 134 static inline int raid6_next_disk(int disk, int raid_disks) 135 { 136 disk++; 137 return (disk < raid_disks) ? disk : 0; 138 } 139 140 /* When walking through the disks in a raid5, starting at raid6_d0, 141 * We need to map each disk to a 'slot', where the data disks are slot 142 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 143 * is raid_disks-1. This help does that mapping. 144 */ 145 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 146 int *count, int syndrome_disks) 147 { 148 int slot = *count; 149 150 if (sh->ddf_layout) 151 (*count)++; 152 if (idx == sh->pd_idx) 153 return syndrome_disks; 154 if (idx == sh->qd_idx) 155 return syndrome_disks + 1; 156 if (!sh->ddf_layout) 157 (*count)++; 158 return slot; 159 } 160 161 static void return_io(struct bio_list *return_bi) 162 { 163 struct bio *bi; 164 while ((bi = bio_list_pop(return_bi)) != NULL) { 165 bi->bi_iter.bi_size = 0; 166 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 167 bi, 0); 168 bio_endio(bi); 169 } 170 } 171 172 static void print_raid5_conf (struct r5conf *conf); 173 174 static int stripe_operations_active(struct stripe_head *sh) 175 { 176 return sh->check_state || sh->reconstruct_state || 177 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 178 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 179 } 180 181 static bool stripe_is_lowprio(struct stripe_head *sh) 182 { 183 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || 184 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && 185 !test_bit(STRIPE_R5C_CACHING, &sh->state); 186 } 187 188 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 189 { 190 struct r5conf *conf = sh->raid_conf; 191 struct r5worker_group *group; 192 int thread_cnt; 193 int i, cpu = sh->cpu; 194 195 if (!cpu_online(cpu)) { 196 cpu = cpumask_any(cpu_online_mask); 197 sh->cpu = cpu; 198 } 199 200 if (list_empty(&sh->lru)) { 201 struct r5worker_group *group; 202 group = conf->worker_groups + cpu_to_group(cpu); 203 if (stripe_is_lowprio(sh)) 204 list_add_tail(&sh->lru, &group->loprio_list); 205 else 206 list_add_tail(&sh->lru, &group->handle_list); 207 group->stripes_cnt++; 208 sh->group = group; 209 } 210 211 if (conf->worker_cnt_per_group == 0) { 212 md_wakeup_thread(conf->mddev->thread); 213 return; 214 } 215 216 group = conf->worker_groups + cpu_to_group(sh->cpu); 217 218 group->workers[0].working = true; 219 /* at least one worker should run to avoid race */ 220 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 221 222 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 223 /* wakeup more workers */ 224 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 225 if (group->workers[i].working == false) { 226 group->workers[i].working = true; 227 queue_work_on(sh->cpu, raid5_wq, 228 &group->workers[i].work); 229 thread_cnt--; 230 } 231 } 232 } 233 234 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 235 struct list_head *temp_inactive_list) 236 { 237 int i; 238 int injournal = 0; /* number of date pages with R5_InJournal */ 239 240 BUG_ON(!list_empty(&sh->lru)); 241 BUG_ON(atomic_read(&conf->active_stripes)==0); 242 243 if (r5c_is_writeback(conf->log)) 244 for (i = sh->disks; i--; ) 245 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 246 injournal++; 247 /* 248 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with 249 * data in journal, so they are not released to cached lists 250 */ 251 if (conf->quiesce && r5c_is_writeback(conf->log) && 252 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { 253 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 254 r5c_make_stripe_write_out(sh); 255 set_bit(STRIPE_HANDLE, &sh->state); 256 } 257 258 if (test_bit(STRIPE_HANDLE, &sh->state)) { 259 if (test_bit(STRIPE_DELAYED, &sh->state) && 260 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 261 list_add_tail(&sh->lru, &conf->delayed_list); 262 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 263 sh->bm_seq - conf->seq_write > 0) 264 list_add_tail(&sh->lru, &conf->bitmap_list); 265 else { 266 clear_bit(STRIPE_DELAYED, &sh->state); 267 clear_bit(STRIPE_BIT_DELAY, &sh->state); 268 if (conf->worker_cnt_per_group == 0) { 269 if (stripe_is_lowprio(sh)) 270 list_add_tail(&sh->lru, 271 &conf->loprio_list); 272 else 273 list_add_tail(&sh->lru, 274 &conf->handle_list); 275 } else { 276 raid5_wakeup_stripe_thread(sh); 277 return; 278 } 279 } 280 md_wakeup_thread(conf->mddev->thread); 281 } else { 282 BUG_ON(stripe_operations_active(sh)); 283 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 284 if (atomic_dec_return(&conf->preread_active_stripes) 285 < IO_THRESHOLD) 286 md_wakeup_thread(conf->mddev->thread); 287 atomic_dec(&conf->active_stripes); 288 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 289 if (!r5c_is_writeback(conf->log)) 290 list_add_tail(&sh->lru, temp_inactive_list); 291 else { 292 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); 293 if (injournal == 0) 294 list_add_tail(&sh->lru, temp_inactive_list); 295 else if (injournal == conf->raid_disks - conf->max_degraded) { 296 /* full stripe */ 297 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) 298 atomic_inc(&conf->r5c_cached_full_stripes); 299 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 300 atomic_dec(&conf->r5c_cached_partial_stripes); 301 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); 302 r5c_check_cached_full_stripe(conf); 303 } else 304 /* 305 * STRIPE_R5C_PARTIAL_STRIPE is set in 306 * r5c_try_caching_write(). No need to 307 * set it again. 308 */ 309 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); 310 } 311 } 312 } 313 } 314 315 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, 316 struct list_head *temp_inactive_list) 317 { 318 if (atomic_dec_and_test(&sh->count)) 319 do_release_stripe(conf, sh, temp_inactive_list); 320 } 321 322 /* 323 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list 324 * 325 * Be careful: Only one task can add/delete stripes from temp_inactive_list at 326 * given time. Adding stripes only takes device lock, while deleting stripes 327 * only takes hash lock. 328 */ 329 static void release_inactive_stripe_list(struct r5conf *conf, 330 struct list_head *temp_inactive_list, 331 int hash) 332 { 333 int size; 334 bool do_wakeup = false; 335 unsigned long flags; 336 337 if (hash == NR_STRIPE_HASH_LOCKS) { 338 size = NR_STRIPE_HASH_LOCKS; 339 hash = NR_STRIPE_HASH_LOCKS - 1; 340 } else 341 size = 1; 342 while (size) { 343 struct list_head *list = &temp_inactive_list[size - 1]; 344 345 /* 346 * We don't hold any lock here yet, raid5_get_active_stripe() might 347 * remove stripes from the list 348 */ 349 if (!list_empty_careful(list)) { 350 spin_lock_irqsave(conf->hash_locks + hash, flags); 351 if (list_empty(conf->inactive_list + hash) && 352 !list_empty(list)) 353 atomic_dec(&conf->empty_inactive_list_nr); 354 list_splice_tail_init(list, conf->inactive_list + hash); 355 do_wakeup = true; 356 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 357 } 358 size--; 359 hash--; 360 } 361 362 if (do_wakeup) { 363 wake_up(&conf->wait_for_stripe); 364 if (atomic_read(&conf->active_stripes) == 0) 365 wake_up(&conf->wait_for_quiescent); 366 if (conf->retry_read_aligned) 367 md_wakeup_thread(conf->mddev->thread); 368 } 369 } 370 371 /* should hold conf->device_lock already */ 372 static int release_stripe_list(struct r5conf *conf, 373 struct list_head *temp_inactive_list) 374 { 375 struct stripe_head *sh, *t; 376 int count = 0; 377 struct llist_node *head; 378 379 head = llist_del_all(&conf->released_stripes); 380 head = llist_reverse_order(head); 381 llist_for_each_entry_safe(sh, t, head, release_list) { 382 int hash; 383 384 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 385 smp_mb(); 386 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 387 /* 388 * Don't worry the bit is set here, because if the bit is set 389 * again, the count is always > 1. This is true for 390 * STRIPE_ON_UNPLUG_LIST bit too. 391 */ 392 hash = sh->hash_lock_index; 393 __release_stripe(conf, sh, &temp_inactive_list[hash]); 394 count++; 395 } 396 397 return count; 398 } 399 400 void raid5_release_stripe(struct stripe_head *sh) 401 { 402 struct r5conf *conf = sh->raid_conf; 403 unsigned long flags; 404 struct list_head list; 405 int hash; 406 bool wakeup; 407 408 /* Avoid release_list until the last reference. 409 */ 410 if (atomic_add_unless(&sh->count, -1, 1)) 411 return; 412 413 if (unlikely(!conf->mddev->thread) || 414 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 415 goto slow_path; 416 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 417 if (wakeup) 418 md_wakeup_thread(conf->mddev->thread); 419 return; 420 slow_path: 421 local_irq_save(flags); 422 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 423 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 424 INIT_LIST_HEAD(&list); 425 hash = sh->hash_lock_index; 426 do_release_stripe(conf, sh, &list); 427 spin_unlock(&conf->device_lock); 428 release_inactive_stripe_list(conf, &list, hash); 429 } 430 local_irq_restore(flags); 431 } 432 433 static inline void remove_hash(struct stripe_head *sh) 434 { 435 pr_debug("remove_hash(), stripe %llu\n", 436 (unsigned long long)sh->sector); 437 438 hlist_del_init(&sh->hash); 439 } 440 441 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 442 { 443 struct hlist_head *hp = stripe_hash(conf, sh->sector); 444 445 pr_debug("insert_hash(), stripe %llu\n", 446 (unsigned long long)sh->sector); 447 448 hlist_add_head(&sh->hash, hp); 449 } 450 451 /* find an idle stripe, make sure it is unhashed, and return it. */ 452 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 453 { 454 struct stripe_head *sh = NULL; 455 struct list_head *first; 456 457 if (list_empty(conf->inactive_list + hash)) 458 goto out; 459 first = (conf->inactive_list + hash)->next; 460 sh = list_entry(first, struct stripe_head, lru); 461 list_del_init(first); 462 remove_hash(sh); 463 atomic_inc(&conf->active_stripes); 464 BUG_ON(hash != sh->hash_lock_index); 465 if (list_empty(conf->inactive_list + hash)) 466 atomic_inc(&conf->empty_inactive_list_nr); 467 out: 468 return sh; 469 } 470 471 static void shrink_buffers(struct stripe_head *sh) 472 { 473 struct page *p; 474 int i; 475 int num = sh->raid_conf->pool_size; 476 477 for (i = 0; i < num ; i++) { 478 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 479 p = sh->dev[i].page; 480 if (!p) 481 continue; 482 sh->dev[i].page = NULL; 483 put_page(p); 484 } 485 486 if (sh->ppl_page) { 487 put_page(sh->ppl_page); 488 sh->ppl_page = NULL; 489 } 490 } 491 492 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 493 { 494 int i; 495 int num = sh->raid_conf->pool_size; 496 497 for (i = 0; i < num; i++) { 498 struct page *page; 499 500 if (!(page = alloc_page(gfp))) { 501 return 1; 502 } 503 sh->dev[i].page = page; 504 sh->dev[i].orig_page = page; 505 } 506 507 if (raid5_has_ppl(sh->raid_conf)) { 508 sh->ppl_page = alloc_page(gfp); 509 if (!sh->ppl_page) 510 return 1; 511 } 512 513 return 0; 514 } 515 516 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 517 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 518 struct stripe_head *sh); 519 520 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 521 { 522 struct r5conf *conf = sh->raid_conf; 523 int i, seq; 524 525 BUG_ON(atomic_read(&sh->count) != 0); 526 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 527 BUG_ON(stripe_operations_active(sh)); 528 BUG_ON(sh->batch_head); 529 530 pr_debug("init_stripe called, stripe %llu\n", 531 (unsigned long long)sector); 532 retry: 533 seq = read_seqcount_begin(&conf->gen_lock); 534 sh->generation = conf->generation - previous; 535 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 536 sh->sector = sector; 537 stripe_set_idx(sector, conf, previous, sh); 538 sh->state = 0; 539 540 for (i = sh->disks; i--; ) { 541 struct r5dev *dev = &sh->dev[i]; 542 543 if (dev->toread || dev->read || dev->towrite || dev->written || 544 test_bit(R5_LOCKED, &dev->flags)) { 545 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 546 (unsigned long long)sh->sector, i, dev->toread, 547 dev->read, dev->towrite, dev->written, 548 test_bit(R5_LOCKED, &dev->flags)); 549 WARN_ON(1); 550 } 551 dev->flags = 0; 552 raid5_build_block(sh, i, previous); 553 } 554 if (read_seqcount_retry(&conf->gen_lock, seq)) 555 goto retry; 556 sh->overwrite_disks = 0; 557 insert_hash(conf, sh); 558 sh->cpu = smp_processor_id(); 559 set_bit(STRIPE_BATCH_READY, &sh->state); 560 } 561 562 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 563 short generation) 564 { 565 struct stripe_head *sh; 566 567 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 568 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 569 if (sh->sector == sector && sh->generation == generation) 570 return sh; 571 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 572 return NULL; 573 } 574 575 /* 576 * Need to check if array has failed when deciding whether to: 577 * - start an array 578 * - remove non-faulty devices 579 * - add a spare 580 * - allow a reshape 581 * This determination is simple when no reshape is happening. 582 * However if there is a reshape, we need to carefully check 583 * both the before and after sections. 584 * This is because some failed devices may only affect one 585 * of the two sections, and some non-in_sync devices may 586 * be insync in the section most affected by failed devices. 587 */ 588 int raid5_calc_degraded(struct r5conf *conf) 589 { 590 int degraded, degraded2; 591 int i; 592 593 rcu_read_lock(); 594 degraded = 0; 595 for (i = 0; i < conf->previous_raid_disks; i++) { 596 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 597 if (rdev && test_bit(Faulty, &rdev->flags)) 598 rdev = rcu_dereference(conf->disks[i].replacement); 599 if (!rdev || test_bit(Faulty, &rdev->flags)) 600 degraded++; 601 else if (test_bit(In_sync, &rdev->flags)) 602 ; 603 else 604 /* not in-sync or faulty. 605 * If the reshape increases the number of devices, 606 * this is being recovered by the reshape, so 607 * this 'previous' section is not in_sync. 608 * If the number of devices is being reduced however, 609 * the device can only be part of the array if 610 * we are reverting a reshape, so this section will 611 * be in-sync. 612 */ 613 if (conf->raid_disks >= conf->previous_raid_disks) 614 degraded++; 615 } 616 rcu_read_unlock(); 617 if (conf->raid_disks == conf->previous_raid_disks) 618 return degraded; 619 rcu_read_lock(); 620 degraded2 = 0; 621 for (i = 0; i < conf->raid_disks; i++) { 622 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 623 if (rdev && test_bit(Faulty, &rdev->flags)) 624 rdev = rcu_dereference(conf->disks[i].replacement); 625 if (!rdev || test_bit(Faulty, &rdev->flags)) 626 degraded2++; 627 else if (test_bit(In_sync, &rdev->flags)) 628 ; 629 else 630 /* not in-sync or faulty. 631 * If reshape increases the number of devices, this 632 * section has already been recovered, else it 633 * almost certainly hasn't. 634 */ 635 if (conf->raid_disks <= conf->previous_raid_disks) 636 degraded2++; 637 } 638 rcu_read_unlock(); 639 if (degraded2 > degraded) 640 return degraded2; 641 return degraded; 642 } 643 644 static int has_failed(struct r5conf *conf) 645 { 646 int degraded; 647 648 if (conf->mddev->reshape_position == MaxSector) 649 return conf->mddev->degraded > conf->max_degraded; 650 651 degraded = raid5_calc_degraded(conf); 652 if (degraded > conf->max_degraded) 653 return 1; 654 return 0; 655 } 656 657 struct stripe_head * 658 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 659 int previous, int noblock, int noquiesce) 660 { 661 struct stripe_head *sh; 662 int hash = stripe_hash_locks_hash(sector); 663 int inc_empty_inactive_list_flag; 664 665 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 666 667 spin_lock_irq(conf->hash_locks + hash); 668 669 do { 670 wait_event_lock_irq(conf->wait_for_quiescent, 671 conf->quiesce == 0 || noquiesce, 672 *(conf->hash_locks + hash)); 673 sh = __find_stripe(conf, sector, conf->generation - previous); 674 if (!sh) { 675 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 676 sh = get_free_stripe(conf, hash); 677 if (!sh && !test_bit(R5_DID_ALLOC, 678 &conf->cache_state)) 679 set_bit(R5_ALLOC_MORE, 680 &conf->cache_state); 681 } 682 if (noblock && sh == NULL) 683 break; 684 685 r5c_check_stripe_cache_usage(conf); 686 if (!sh) { 687 set_bit(R5_INACTIVE_BLOCKED, 688 &conf->cache_state); 689 r5l_wake_reclaim(conf->log, 0); 690 wait_event_lock_irq( 691 conf->wait_for_stripe, 692 !list_empty(conf->inactive_list + hash) && 693 (atomic_read(&conf->active_stripes) 694 < (conf->max_nr_stripes * 3 / 4) 695 || !test_bit(R5_INACTIVE_BLOCKED, 696 &conf->cache_state)), 697 *(conf->hash_locks + hash)); 698 clear_bit(R5_INACTIVE_BLOCKED, 699 &conf->cache_state); 700 } else { 701 init_stripe(sh, sector, previous); 702 atomic_inc(&sh->count); 703 } 704 } else if (!atomic_inc_not_zero(&sh->count)) { 705 spin_lock(&conf->device_lock); 706 if (!atomic_read(&sh->count)) { 707 if (!test_bit(STRIPE_HANDLE, &sh->state)) 708 atomic_inc(&conf->active_stripes); 709 BUG_ON(list_empty(&sh->lru) && 710 !test_bit(STRIPE_EXPANDING, &sh->state)); 711 inc_empty_inactive_list_flag = 0; 712 if (!list_empty(conf->inactive_list + hash)) 713 inc_empty_inactive_list_flag = 1; 714 list_del_init(&sh->lru); 715 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 716 atomic_inc(&conf->empty_inactive_list_nr); 717 if (sh->group) { 718 sh->group->stripes_cnt--; 719 sh->group = NULL; 720 } 721 } 722 atomic_inc(&sh->count); 723 spin_unlock(&conf->device_lock); 724 } 725 } while (sh == NULL); 726 727 spin_unlock_irq(conf->hash_locks + hash); 728 return sh; 729 } 730 731 static bool is_full_stripe_write(struct stripe_head *sh) 732 { 733 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 734 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 735 } 736 737 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 738 { 739 local_irq_disable(); 740 if (sh1 > sh2) { 741 spin_lock(&sh2->stripe_lock); 742 spin_lock_nested(&sh1->stripe_lock, 1); 743 } else { 744 spin_lock(&sh1->stripe_lock); 745 spin_lock_nested(&sh2->stripe_lock, 1); 746 } 747 } 748 749 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 750 { 751 spin_unlock(&sh1->stripe_lock); 752 spin_unlock(&sh2->stripe_lock); 753 local_irq_enable(); 754 } 755 756 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 757 static bool stripe_can_batch(struct stripe_head *sh) 758 { 759 struct r5conf *conf = sh->raid_conf; 760 761 if (conf->log || raid5_has_ppl(conf)) 762 return false; 763 return test_bit(STRIPE_BATCH_READY, &sh->state) && 764 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 765 is_full_stripe_write(sh); 766 } 767 768 /* we only do back search */ 769 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 770 { 771 struct stripe_head *head; 772 sector_t head_sector, tmp_sec; 773 int hash; 774 int dd_idx; 775 int inc_empty_inactive_list_flag; 776 777 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 778 tmp_sec = sh->sector; 779 if (!sector_div(tmp_sec, conf->chunk_sectors)) 780 return; 781 head_sector = sh->sector - STRIPE_SECTORS; 782 783 hash = stripe_hash_locks_hash(head_sector); 784 spin_lock_irq(conf->hash_locks + hash); 785 head = __find_stripe(conf, head_sector, conf->generation); 786 if (head && !atomic_inc_not_zero(&head->count)) { 787 spin_lock(&conf->device_lock); 788 if (!atomic_read(&head->count)) { 789 if (!test_bit(STRIPE_HANDLE, &head->state)) 790 atomic_inc(&conf->active_stripes); 791 BUG_ON(list_empty(&head->lru) && 792 !test_bit(STRIPE_EXPANDING, &head->state)); 793 inc_empty_inactive_list_flag = 0; 794 if (!list_empty(conf->inactive_list + hash)) 795 inc_empty_inactive_list_flag = 1; 796 list_del_init(&head->lru); 797 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 798 atomic_inc(&conf->empty_inactive_list_nr); 799 if (head->group) { 800 head->group->stripes_cnt--; 801 head->group = NULL; 802 } 803 } 804 atomic_inc(&head->count); 805 spin_unlock(&conf->device_lock); 806 } 807 spin_unlock_irq(conf->hash_locks + hash); 808 809 if (!head) 810 return; 811 if (!stripe_can_batch(head)) 812 goto out; 813 814 lock_two_stripes(head, sh); 815 /* clear_batch_ready clear the flag */ 816 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 817 goto unlock_out; 818 819 if (sh->batch_head) 820 goto unlock_out; 821 822 dd_idx = 0; 823 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 824 dd_idx++; 825 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 826 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 827 goto unlock_out; 828 829 if (head->batch_head) { 830 spin_lock(&head->batch_head->batch_lock); 831 /* This batch list is already running */ 832 if (!stripe_can_batch(head)) { 833 spin_unlock(&head->batch_head->batch_lock); 834 goto unlock_out; 835 } 836 837 /* 838 * at this point, head's BATCH_READY could be cleared, but we 839 * can still add the stripe to batch list 840 */ 841 list_add(&sh->batch_list, &head->batch_list); 842 spin_unlock(&head->batch_head->batch_lock); 843 844 sh->batch_head = head->batch_head; 845 } else { 846 head->batch_head = head; 847 sh->batch_head = head->batch_head; 848 spin_lock(&head->batch_lock); 849 list_add_tail(&sh->batch_list, &head->batch_list); 850 spin_unlock(&head->batch_lock); 851 } 852 853 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 854 if (atomic_dec_return(&conf->preread_active_stripes) 855 < IO_THRESHOLD) 856 md_wakeup_thread(conf->mddev->thread); 857 858 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 859 int seq = sh->bm_seq; 860 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 861 sh->batch_head->bm_seq > seq) 862 seq = sh->batch_head->bm_seq; 863 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 864 sh->batch_head->bm_seq = seq; 865 } 866 867 atomic_inc(&sh->count); 868 unlock_out: 869 unlock_two_stripes(head, sh); 870 out: 871 raid5_release_stripe(head); 872 } 873 874 /* Determine if 'data_offset' or 'new_data_offset' should be used 875 * in this stripe_head. 876 */ 877 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 878 { 879 sector_t progress = conf->reshape_progress; 880 /* Need a memory barrier to make sure we see the value 881 * of conf->generation, or ->data_offset that was set before 882 * reshape_progress was updated. 883 */ 884 smp_rmb(); 885 if (progress == MaxSector) 886 return 0; 887 if (sh->generation == conf->generation - 1) 888 return 0; 889 /* We are in a reshape, and this is a new-generation stripe, 890 * so use new_data_offset. 891 */ 892 return 1; 893 } 894 895 static void dispatch_bio_list(struct bio_list *tmp) 896 { 897 struct bio *bio; 898 899 while ((bio = bio_list_pop(tmp))) 900 generic_make_request(bio); 901 } 902 903 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) 904 { 905 const struct r5pending_data *da = list_entry(a, 906 struct r5pending_data, sibling); 907 const struct r5pending_data *db = list_entry(b, 908 struct r5pending_data, sibling); 909 if (da->sector > db->sector) 910 return 1; 911 if (da->sector < db->sector) 912 return -1; 913 return 0; 914 } 915 916 static void dispatch_defer_bios(struct r5conf *conf, int target, 917 struct bio_list *list) 918 { 919 struct r5pending_data *data; 920 struct list_head *first, *next = NULL; 921 int cnt = 0; 922 923 if (conf->pending_data_cnt == 0) 924 return; 925 926 list_sort(NULL, &conf->pending_list, cmp_stripe); 927 928 first = conf->pending_list.next; 929 930 /* temporarily move the head */ 931 if (conf->next_pending_data) 932 list_move_tail(&conf->pending_list, 933 &conf->next_pending_data->sibling); 934 935 while (!list_empty(&conf->pending_list)) { 936 data = list_first_entry(&conf->pending_list, 937 struct r5pending_data, sibling); 938 if (&data->sibling == first) 939 first = data->sibling.next; 940 next = data->sibling.next; 941 942 bio_list_merge(list, &data->bios); 943 list_move(&data->sibling, &conf->free_list); 944 cnt++; 945 if (cnt >= target) 946 break; 947 } 948 conf->pending_data_cnt -= cnt; 949 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 950 951 if (next != &conf->pending_list) 952 conf->next_pending_data = list_entry(next, 953 struct r5pending_data, sibling); 954 else 955 conf->next_pending_data = NULL; 956 /* list isn't empty */ 957 if (first != &conf->pending_list) 958 list_move_tail(&conf->pending_list, first); 959 } 960 961 static void flush_deferred_bios(struct r5conf *conf) 962 { 963 struct bio_list tmp = BIO_EMPTY_LIST; 964 965 if (conf->pending_data_cnt == 0) 966 return; 967 968 spin_lock(&conf->pending_bios_lock); 969 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 970 BUG_ON(conf->pending_data_cnt != 0); 971 spin_unlock(&conf->pending_bios_lock); 972 973 dispatch_bio_list(&tmp); 974 } 975 976 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 977 struct bio_list *bios) 978 { 979 struct bio_list tmp = BIO_EMPTY_LIST; 980 struct r5pending_data *ent; 981 982 spin_lock(&conf->pending_bios_lock); 983 ent = list_first_entry(&conf->free_list, struct r5pending_data, 984 sibling); 985 list_move_tail(&ent->sibling, &conf->pending_list); 986 ent->sector = sector; 987 bio_list_init(&ent->bios); 988 bio_list_merge(&ent->bios, bios); 989 conf->pending_data_cnt++; 990 if (conf->pending_data_cnt >= PENDING_IO_MAX) 991 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 992 993 spin_unlock(&conf->pending_bios_lock); 994 995 dispatch_bio_list(&tmp); 996 } 997 998 static void 999 raid5_end_read_request(struct bio *bi); 1000 static void 1001 raid5_end_write_request(struct bio *bi); 1002 1003 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 1004 { 1005 struct r5conf *conf = sh->raid_conf; 1006 int i, disks = sh->disks; 1007 struct stripe_head *head_sh = sh; 1008 struct bio_list pending_bios = BIO_EMPTY_LIST; 1009 bool should_defer; 1010 1011 might_sleep(); 1012 1013 if (log_stripe(sh, s) == 0) 1014 return; 1015 1016 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 1017 1018 for (i = disks; i--; ) { 1019 int op, op_flags = 0; 1020 int replace_only = 0; 1021 struct bio *bi, *rbi; 1022 struct md_rdev *rdev, *rrdev = NULL; 1023 1024 sh = head_sh; 1025 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1026 op = REQ_OP_WRITE; 1027 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1028 op_flags = REQ_FUA; 1029 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1030 op = REQ_OP_DISCARD; 1031 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1032 op = REQ_OP_READ; 1033 else if (test_and_clear_bit(R5_WantReplace, 1034 &sh->dev[i].flags)) { 1035 op = REQ_OP_WRITE; 1036 replace_only = 1; 1037 } else 1038 continue; 1039 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1040 op_flags |= REQ_SYNC; 1041 1042 again: 1043 bi = &sh->dev[i].req; 1044 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 1045 1046 rcu_read_lock(); 1047 rrdev = rcu_dereference(conf->disks[i].replacement); 1048 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1049 rdev = rcu_dereference(conf->disks[i].rdev); 1050 if (!rdev) { 1051 rdev = rrdev; 1052 rrdev = NULL; 1053 } 1054 if (op_is_write(op)) { 1055 if (replace_only) 1056 rdev = NULL; 1057 if (rdev == rrdev) 1058 /* We raced and saw duplicates */ 1059 rrdev = NULL; 1060 } else { 1061 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1062 rdev = rrdev; 1063 rrdev = NULL; 1064 } 1065 1066 if (rdev && test_bit(Faulty, &rdev->flags)) 1067 rdev = NULL; 1068 if (rdev) 1069 atomic_inc(&rdev->nr_pending); 1070 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1071 rrdev = NULL; 1072 if (rrdev) 1073 atomic_inc(&rrdev->nr_pending); 1074 rcu_read_unlock(); 1075 1076 /* We have already checked bad blocks for reads. Now 1077 * need to check for writes. We never accept write errors 1078 * on the replacement, so we don't to check rrdev. 1079 */ 1080 while (op_is_write(op) && rdev && 1081 test_bit(WriteErrorSeen, &rdev->flags)) { 1082 sector_t first_bad; 1083 int bad_sectors; 1084 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 1085 &first_bad, &bad_sectors); 1086 if (!bad) 1087 break; 1088 1089 if (bad < 0) { 1090 set_bit(BlockedBadBlocks, &rdev->flags); 1091 if (!conf->mddev->external && 1092 conf->mddev->sb_flags) { 1093 /* It is very unlikely, but we might 1094 * still need to write out the 1095 * bad block log - better give it 1096 * a chance*/ 1097 md_check_recovery(conf->mddev); 1098 } 1099 /* 1100 * Because md_wait_for_blocked_rdev 1101 * will dec nr_pending, we must 1102 * increment it first. 1103 */ 1104 atomic_inc(&rdev->nr_pending); 1105 md_wait_for_blocked_rdev(rdev, conf->mddev); 1106 } else { 1107 /* Acknowledged bad block - skip the write */ 1108 rdev_dec_pending(rdev, conf->mddev); 1109 rdev = NULL; 1110 } 1111 } 1112 1113 if (rdev) { 1114 if (s->syncing || s->expanding || s->expanded 1115 || s->replacing) 1116 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1117 1118 set_bit(STRIPE_IO_STARTED, &sh->state); 1119 1120 bi->bi_bdev = rdev->bdev; 1121 bio_set_op_attrs(bi, op, op_flags); 1122 bi->bi_end_io = op_is_write(op) 1123 ? raid5_end_write_request 1124 : raid5_end_read_request; 1125 bi->bi_private = sh; 1126 1127 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1128 __func__, (unsigned long long)sh->sector, 1129 bi->bi_opf, i); 1130 atomic_inc(&sh->count); 1131 if (sh != head_sh) 1132 atomic_inc(&head_sh->count); 1133 if (use_new_offset(conf, sh)) 1134 bi->bi_iter.bi_sector = (sh->sector 1135 + rdev->new_data_offset); 1136 else 1137 bi->bi_iter.bi_sector = (sh->sector 1138 + rdev->data_offset); 1139 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1140 bi->bi_opf |= REQ_NOMERGE; 1141 1142 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1143 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1144 1145 if (!op_is_write(op) && 1146 test_bit(R5_InJournal, &sh->dev[i].flags)) 1147 /* 1148 * issuing read for a page in journal, this 1149 * must be preparing for prexor in rmw; read 1150 * the data into orig_page 1151 */ 1152 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1153 else 1154 sh->dev[i].vec.bv_page = sh->dev[i].page; 1155 bi->bi_vcnt = 1; 1156 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1157 bi->bi_io_vec[0].bv_offset = 0; 1158 bi->bi_iter.bi_size = STRIPE_SIZE; 1159 /* 1160 * If this is discard request, set bi_vcnt 0. We don't 1161 * want to confuse SCSI because SCSI will replace payload 1162 */ 1163 if (op == REQ_OP_DISCARD) 1164 bi->bi_vcnt = 0; 1165 if (rrdev) 1166 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1167 1168 if (conf->mddev->gendisk) 1169 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 1170 bi, disk_devt(conf->mddev->gendisk), 1171 sh->dev[i].sector); 1172 if (should_defer && op_is_write(op)) 1173 bio_list_add(&pending_bios, bi); 1174 else 1175 generic_make_request(bi); 1176 } 1177 if (rrdev) { 1178 if (s->syncing || s->expanding || s->expanded 1179 || s->replacing) 1180 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1181 1182 set_bit(STRIPE_IO_STARTED, &sh->state); 1183 1184 rbi->bi_bdev = rrdev->bdev; 1185 bio_set_op_attrs(rbi, op, op_flags); 1186 BUG_ON(!op_is_write(op)); 1187 rbi->bi_end_io = raid5_end_write_request; 1188 rbi->bi_private = sh; 1189 1190 pr_debug("%s: for %llu schedule op %d on " 1191 "replacement disc %d\n", 1192 __func__, (unsigned long long)sh->sector, 1193 rbi->bi_opf, i); 1194 atomic_inc(&sh->count); 1195 if (sh != head_sh) 1196 atomic_inc(&head_sh->count); 1197 if (use_new_offset(conf, sh)) 1198 rbi->bi_iter.bi_sector = (sh->sector 1199 + rrdev->new_data_offset); 1200 else 1201 rbi->bi_iter.bi_sector = (sh->sector 1202 + rrdev->data_offset); 1203 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1204 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1205 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1206 rbi->bi_vcnt = 1; 1207 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1208 rbi->bi_io_vec[0].bv_offset = 0; 1209 rbi->bi_iter.bi_size = STRIPE_SIZE; 1210 /* 1211 * If this is discard request, set bi_vcnt 0. We don't 1212 * want to confuse SCSI because SCSI will replace payload 1213 */ 1214 if (op == REQ_OP_DISCARD) 1215 rbi->bi_vcnt = 0; 1216 if (conf->mddev->gendisk) 1217 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 1218 rbi, disk_devt(conf->mddev->gendisk), 1219 sh->dev[i].sector); 1220 if (should_defer && op_is_write(op)) 1221 bio_list_add(&pending_bios, rbi); 1222 else 1223 generic_make_request(rbi); 1224 } 1225 if (!rdev && !rrdev) { 1226 if (op_is_write(op)) 1227 set_bit(STRIPE_DEGRADED, &sh->state); 1228 pr_debug("skip op %d on disc %d for sector %llu\n", 1229 bi->bi_opf, i, (unsigned long long)sh->sector); 1230 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1231 set_bit(STRIPE_HANDLE, &sh->state); 1232 } 1233 1234 if (!head_sh->batch_head) 1235 continue; 1236 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1237 batch_list); 1238 if (sh != head_sh) 1239 goto again; 1240 } 1241 1242 if (should_defer && !bio_list_empty(&pending_bios)) 1243 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1244 } 1245 1246 static struct dma_async_tx_descriptor * 1247 async_copy_data(int frombio, struct bio *bio, struct page **page, 1248 sector_t sector, struct dma_async_tx_descriptor *tx, 1249 struct stripe_head *sh, int no_skipcopy) 1250 { 1251 struct bio_vec bvl; 1252 struct bvec_iter iter; 1253 struct page *bio_page; 1254 int page_offset; 1255 struct async_submit_ctl submit; 1256 enum async_tx_flags flags = 0; 1257 1258 if (bio->bi_iter.bi_sector >= sector) 1259 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1260 else 1261 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1262 1263 if (frombio) 1264 flags |= ASYNC_TX_FENCE; 1265 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1266 1267 bio_for_each_segment(bvl, bio, iter) { 1268 int len = bvl.bv_len; 1269 int clen; 1270 int b_offset = 0; 1271 1272 if (page_offset < 0) { 1273 b_offset = -page_offset; 1274 page_offset += b_offset; 1275 len -= b_offset; 1276 } 1277 1278 if (len > 0 && page_offset + len > STRIPE_SIZE) 1279 clen = STRIPE_SIZE - page_offset; 1280 else 1281 clen = len; 1282 1283 if (clen > 0) { 1284 b_offset += bvl.bv_offset; 1285 bio_page = bvl.bv_page; 1286 if (frombio) { 1287 if (sh->raid_conf->skip_copy && 1288 b_offset == 0 && page_offset == 0 && 1289 clen == STRIPE_SIZE && 1290 !no_skipcopy) 1291 *page = bio_page; 1292 else 1293 tx = async_memcpy(*page, bio_page, page_offset, 1294 b_offset, clen, &submit); 1295 } else 1296 tx = async_memcpy(bio_page, *page, b_offset, 1297 page_offset, clen, &submit); 1298 } 1299 /* chain the operations */ 1300 submit.depend_tx = tx; 1301 1302 if (clen < len) /* hit end of page */ 1303 break; 1304 page_offset += len; 1305 } 1306 1307 return tx; 1308 } 1309 1310 static void ops_complete_biofill(void *stripe_head_ref) 1311 { 1312 struct stripe_head *sh = stripe_head_ref; 1313 struct bio_list return_bi = BIO_EMPTY_LIST; 1314 int i; 1315 1316 pr_debug("%s: stripe %llu\n", __func__, 1317 (unsigned long long)sh->sector); 1318 1319 /* clear completed biofills */ 1320 for (i = sh->disks; i--; ) { 1321 struct r5dev *dev = &sh->dev[i]; 1322 1323 /* acknowledge completion of a biofill operation */ 1324 /* and check if we need to reply to a read request, 1325 * new R5_Wantfill requests are held off until 1326 * !STRIPE_BIOFILL_RUN 1327 */ 1328 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1329 struct bio *rbi, *rbi2; 1330 1331 BUG_ON(!dev->read); 1332 rbi = dev->read; 1333 dev->read = NULL; 1334 while (rbi && rbi->bi_iter.bi_sector < 1335 dev->sector + STRIPE_SECTORS) { 1336 rbi2 = r5_next_bio(rbi, dev->sector); 1337 if (!raid5_dec_bi_active_stripes(rbi)) 1338 bio_list_add(&return_bi, rbi); 1339 rbi = rbi2; 1340 } 1341 } 1342 } 1343 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1344 1345 return_io(&return_bi); 1346 1347 set_bit(STRIPE_HANDLE, &sh->state); 1348 raid5_release_stripe(sh); 1349 } 1350 1351 static void ops_run_biofill(struct stripe_head *sh) 1352 { 1353 struct dma_async_tx_descriptor *tx = NULL; 1354 struct async_submit_ctl submit; 1355 int i; 1356 1357 BUG_ON(sh->batch_head); 1358 pr_debug("%s: stripe %llu\n", __func__, 1359 (unsigned long long)sh->sector); 1360 1361 for (i = sh->disks; i--; ) { 1362 struct r5dev *dev = &sh->dev[i]; 1363 if (test_bit(R5_Wantfill, &dev->flags)) { 1364 struct bio *rbi; 1365 spin_lock_irq(&sh->stripe_lock); 1366 dev->read = rbi = dev->toread; 1367 dev->toread = NULL; 1368 spin_unlock_irq(&sh->stripe_lock); 1369 while (rbi && rbi->bi_iter.bi_sector < 1370 dev->sector + STRIPE_SECTORS) { 1371 tx = async_copy_data(0, rbi, &dev->page, 1372 dev->sector, tx, sh, 0); 1373 rbi = r5_next_bio(rbi, dev->sector); 1374 } 1375 } 1376 } 1377 1378 atomic_inc(&sh->count); 1379 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1380 async_trigger_callback(&submit); 1381 } 1382 1383 static void mark_target_uptodate(struct stripe_head *sh, int target) 1384 { 1385 struct r5dev *tgt; 1386 1387 if (target < 0) 1388 return; 1389 1390 tgt = &sh->dev[target]; 1391 set_bit(R5_UPTODATE, &tgt->flags); 1392 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1393 clear_bit(R5_Wantcompute, &tgt->flags); 1394 } 1395 1396 static void ops_complete_compute(void *stripe_head_ref) 1397 { 1398 struct stripe_head *sh = stripe_head_ref; 1399 1400 pr_debug("%s: stripe %llu\n", __func__, 1401 (unsigned long long)sh->sector); 1402 1403 /* mark the computed target(s) as uptodate */ 1404 mark_target_uptodate(sh, sh->ops.target); 1405 mark_target_uptodate(sh, sh->ops.target2); 1406 1407 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1408 if (sh->check_state == check_state_compute_run) 1409 sh->check_state = check_state_compute_result; 1410 set_bit(STRIPE_HANDLE, &sh->state); 1411 raid5_release_stripe(sh); 1412 } 1413 1414 /* return a pointer to the address conversion region of the scribble buffer */ 1415 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1416 struct raid5_percpu *percpu, int i) 1417 { 1418 void *addr; 1419 1420 addr = flex_array_get(percpu->scribble, i); 1421 return addr + sizeof(struct page *) * (sh->disks + 2); 1422 } 1423 1424 /* return a pointer to the address conversion region of the scribble buffer */ 1425 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1426 { 1427 void *addr; 1428 1429 addr = flex_array_get(percpu->scribble, i); 1430 return addr; 1431 } 1432 1433 static struct dma_async_tx_descriptor * 1434 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1435 { 1436 int disks = sh->disks; 1437 struct page **xor_srcs = to_addr_page(percpu, 0); 1438 int target = sh->ops.target; 1439 struct r5dev *tgt = &sh->dev[target]; 1440 struct page *xor_dest = tgt->page; 1441 int count = 0; 1442 struct dma_async_tx_descriptor *tx; 1443 struct async_submit_ctl submit; 1444 int i; 1445 1446 BUG_ON(sh->batch_head); 1447 1448 pr_debug("%s: stripe %llu block: %d\n", 1449 __func__, (unsigned long long)sh->sector, target); 1450 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1451 1452 for (i = disks; i--; ) 1453 if (i != target) 1454 xor_srcs[count++] = sh->dev[i].page; 1455 1456 atomic_inc(&sh->count); 1457 1458 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1459 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1460 if (unlikely(count == 1)) 1461 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1462 else 1463 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1464 1465 return tx; 1466 } 1467 1468 /* set_syndrome_sources - populate source buffers for gen_syndrome 1469 * @srcs - (struct page *) array of size sh->disks 1470 * @sh - stripe_head to parse 1471 * 1472 * Populates srcs in proper layout order for the stripe and returns the 1473 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1474 * destination buffer is recorded in srcs[count] and the Q destination 1475 * is recorded in srcs[count+1]]. 1476 */ 1477 static int set_syndrome_sources(struct page **srcs, 1478 struct stripe_head *sh, 1479 int srctype) 1480 { 1481 int disks = sh->disks; 1482 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1483 int d0_idx = raid6_d0(sh); 1484 int count; 1485 int i; 1486 1487 for (i = 0; i < disks; i++) 1488 srcs[i] = NULL; 1489 1490 count = 0; 1491 i = d0_idx; 1492 do { 1493 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1494 struct r5dev *dev = &sh->dev[i]; 1495 1496 if (i == sh->qd_idx || i == sh->pd_idx || 1497 (srctype == SYNDROME_SRC_ALL) || 1498 (srctype == SYNDROME_SRC_WANT_DRAIN && 1499 (test_bit(R5_Wantdrain, &dev->flags) || 1500 test_bit(R5_InJournal, &dev->flags))) || 1501 (srctype == SYNDROME_SRC_WRITTEN && 1502 (dev->written || 1503 test_bit(R5_InJournal, &dev->flags)))) { 1504 if (test_bit(R5_InJournal, &dev->flags)) 1505 srcs[slot] = sh->dev[i].orig_page; 1506 else 1507 srcs[slot] = sh->dev[i].page; 1508 } 1509 i = raid6_next_disk(i, disks); 1510 } while (i != d0_idx); 1511 1512 return syndrome_disks; 1513 } 1514 1515 static struct dma_async_tx_descriptor * 1516 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1517 { 1518 int disks = sh->disks; 1519 struct page **blocks = to_addr_page(percpu, 0); 1520 int target; 1521 int qd_idx = sh->qd_idx; 1522 struct dma_async_tx_descriptor *tx; 1523 struct async_submit_ctl submit; 1524 struct r5dev *tgt; 1525 struct page *dest; 1526 int i; 1527 int count; 1528 1529 BUG_ON(sh->batch_head); 1530 if (sh->ops.target < 0) 1531 target = sh->ops.target2; 1532 else if (sh->ops.target2 < 0) 1533 target = sh->ops.target; 1534 else 1535 /* we should only have one valid target */ 1536 BUG(); 1537 BUG_ON(target < 0); 1538 pr_debug("%s: stripe %llu block: %d\n", 1539 __func__, (unsigned long long)sh->sector, target); 1540 1541 tgt = &sh->dev[target]; 1542 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1543 dest = tgt->page; 1544 1545 atomic_inc(&sh->count); 1546 1547 if (target == qd_idx) { 1548 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1549 blocks[count] = NULL; /* regenerating p is not necessary */ 1550 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1551 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1552 ops_complete_compute, sh, 1553 to_addr_conv(sh, percpu, 0)); 1554 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1555 } else { 1556 /* Compute any data- or p-drive using XOR */ 1557 count = 0; 1558 for (i = disks; i-- ; ) { 1559 if (i == target || i == qd_idx) 1560 continue; 1561 blocks[count++] = sh->dev[i].page; 1562 } 1563 1564 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1565 NULL, ops_complete_compute, sh, 1566 to_addr_conv(sh, percpu, 0)); 1567 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1568 } 1569 1570 return tx; 1571 } 1572 1573 static struct dma_async_tx_descriptor * 1574 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1575 { 1576 int i, count, disks = sh->disks; 1577 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1578 int d0_idx = raid6_d0(sh); 1579 int faila = -1, failb = -1; 1580 int target = sh->ops.target; 1581 int target2 = sh->ops.target2; 1582 struct r5dev *tgt = &sh->dev[target]; 1583 struct r5dev *tgt2 = &sh->dev[target2]; 1584 struct dma_async_tx_descriptor *tx; 1585 struct page **blocks = to_addr_page(percpu, 0); 1586 struct async_submit_ctl submit; 1587 1588 BUG_ON(sh->batch_head); 1589 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1590 __func__, (unsigned long long)sh->sector, target, target2); 1591 BUG_ON(target < 0 || target2 < 0); 1592 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1593 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1594 1595 /* we need to open-code set_syndrome_sources to handle the 1596 * slot number conversion for 'faila' and 'failb' 1597 */ 1598 for (i = 0; i < disks ; i++) 1599 blocks[i] = NULL; 1600 count = 0; 1601 i = d0_idx; 1602 do { 1603 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1604 1605 blocks[slot] = sh->dev[i].page; 1606 1607 if (i == target) 1608 faila = slot; 1609 if (i == target2) 1610 failb = slot; 1611 i = raid6_next_disk(i, disks); 1612 } while (i != d0_idx); 1613 1614 BUG_ON(faila == failb); 1615 if (failb < faila) 1616 swap(faila, failb); 1617 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1618 __func__, (unsigned long long)sh->sector, faila, failb); 1619 1620 atomic_inc(&sh->count); 1621 1622 if (failb == syndrome_disks+1) { 1623 /* Q disk is one of the missing disks */ 1624 if (faila == syndrome_disks) { 1625 /* Missing P+Q, just recompute */ 1626 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1627 ops_complete_compute, sh, 1628 to_addr_conv(sh, percpu, 0)); 1629 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1630 STRIPE_SIZE, &submit); 1631 } else { 1632 struct page *dest; 1633 int data_target; 1634 int qd_idx = sh->qd_idx; 1635 1636 /* Missing D+Q: recompute D from P, then recompute Q */ 1637 if (target == qd_idx) 1638 data_target = target2; 1639 else 1640 data_target = target; 1641 1642 count = 0; 1643 for (i = disks; i-- ; ) { 1644 if (i == data_target || i == qd_idx) 1645 continue; 1646 blocks[count++] = sh->dev[i].page; 1647 } 1648 dest = sh->dev[data_target].page; 1649 init_async_submit(&submit, 1650 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1651 NULL, NULL, NULL, 1652 to_addr_conv(sh, percpu, 0)); 1653 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1654 &submit); 1655 1656 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1657 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1658 ops_complete_compute, sh, 1659 to_addr_conv(sh, percpu, 0)); 1660 return async_gen_syndrome(blocks, 0, count+2, 1661 STRIPE_SIZE, &submit); 1662 } 1663 } else { 1664 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1665 ops_complete_compute, sh, 1666 to_addr_conv(sh, percpu, 0)); 1667 if (failb == syndrome_disks) { 1668 /* We're missing D+P. */ 1669 return async_raid6_datap_recov(syndrome_disks+2, 1670 STRIPE_SIZE, faila, 1671 blocks, &submit); 1672 } else { 1673 /* We're missing D+D. */ 1674 return async_raid6_2data_recov(syndrome_disks+2, 1675 STRIPE_SIZE, faila, failb, 1676 blocks, &submit); 1677 } 1678 } 1679 } 1680 1681 static void ops_complete_prexor(void *stripe_head_ref) 1682 { 1683 struct stripe_head *sh = stripe_head_ref; 1684 1685 pr_debug("%s: stripe %llu\n", __func__, 1686 (unsigned long long)sh->sector); 1687 1688 if (r5c_is_writeback(sh->raid_conf->log)) 1689 /* 1690 * raid5-cache write back uses orig_page during prexor. 1691 * After prexor, it is time to free orig_page 1692 */ 1693 r5c_release_extra_page(sh); 1694 } 1695 1696 static struct dma_async_tx_descriptor * 1697 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1698 struct dma_async_tx_descriptor *tx) 1699 { 1700 int disks = sh->disks; 1701 struct page **xor_srcs = to_addr_page(percpu, 0); 1702 int count = 0, pd_idx = sh->pd_idx, i; 1703 struct async_submit_ctl submit; 1704 1705 /* existing parity data subtracted */ 1706 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1707 1708 BUG_ON(sh->batch_head); 1709 pr_debug("%s: stripe %llu\n", __func__, 1710 (unsigned long long)sh->sector); 1711 1712 for (i = disks; i--; ) { 1713 struct r5dev *dev = &sh->dev[i]; 1714 /* Only process blocks that are known to be uptodate */ 1715 if (test_bit(R5_InJournal, &dev->flags)) 1716 xor_srcs[count++] = dev->orig_page; 1717 else if (test_bit(R5_Wantdrain, &dev->flags)) 1718 xor_srcs[count++] = dev->page; 1719 } 1720 1721 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1722 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1723 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1724 1725 return tx; 1726 } 1727 1728 static struct dma_async_tx_descriptor * 1729 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1730 struct dma_async_tx_descriptor *tx) 1731 { 1732 struct page **blocks = to_addr_page(percpu, 0); 1733 int count; 1734 struct async_submit_ctl submit; 1735 1736 pr_debug("%s: stripe %llu\n", __func__, 1737 (unsigned long long)sh->sector); 1738 1739 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1740 1741 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1742 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1743 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1744 1745 return tx; 1746 } 1747 1748 static struct dma_async_tx_descriptor * 1749 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1750 { 1751 struct r5conf *conf = sh->raid_conf; 1752 int disks = sh->disks; 1753 int i; 1754 struct stripe_head *head_sh = sh; 1755 1756 pr_debug("%s: stripe %llu\n", __func__, 1757 (unsigned long long)sh->sector); 1758 1759 for (i = disks; i--; ) { 1760 struct r5dev *dev; 1761 struct bio *chosen; 1762 1763 sh = head_sh; 1764 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1765 struct bio *wbi; 1766 1767 again: 1768 dev = &sh->dev[i]; 1769 /* 1770 * clear R5_InJournal, so when rewriting a page in 1771 * journal, it is not skipped by r5l_log_stripe() 1772 */ 1773 clear_bit(R5_InJournal, &dev->flags); 1774 spin_lock_irq(&sh->stripe_lock); 1775 chosen = dev->towrite; 1776 dev->towrite = NULL; 1777 sh->overwrite_disks = 0; 1778 BUG_ON(dev->written); 1779 wbi = dev->written = chosen; 1780 spin_unlock_irq(&sh->stripe_lock); 1781 WARN_ON(dev->page != dev->orig_page); 1782 1783 while (wbi && wbi->bi_iter.bi_sector < 1784 dev->sector + STRIPE_SECTORS) { 1785 if (wbi->bi_opf & REQ_FUA) 1786 set_bit(R5_WantFUA, &dev->flags); 1787 if (wbi->bi_opf & REQ_SYNC) 1788 set_bit(R5_SyncIO, &dev->flags); 1789 if (bio_op(wbi) == REQ_OP_DISCARD) 1790 set_bit(R5_Discard, &dev->flags); 1791 else { 1792 tx = async_copy_data(1, wbi, &dev->page, 1793 dev->sector, tx, sh, 1794 r5c_is_writeback(conf->log)); 1795 if (dev->page != dev->orig_page && 1796 !r5c_is_writeback(conf->log)) { 1797 set_bit(R5_SkipCopy, &dev->flags); 1798 clear_bit(R5_UPTODATE, &dev->flags); 1799 clear_bit(R5_OVERWRITE, &dev->flags); 1800 } 1801 } 1802 wbi = r5_next_bio(wbi, dev->sector); 1803 } 1804 1805 if (head_sh->batch_head) { 1806 sh = list_first_entry(&sh->batch_list, 1807 struct stripe_head, 1808 batch_list); 1809 if (sh == head_sh) 1810 continue; 1811 goto again; 1812 } 1813 } 1814 } 1815 1816 return tx; 1817 } 1818 1819 static void ops_complete_reconstruct(void *stripe_head_ref) 1820 { 1821 struct stripe_head *sh = stripe_head_ref; 1822 int disks = sh->disks; 1823 int pd_idx = sh->pd_idx; 1824 int qd_idx = sh->qd_idx; 1825 int i; 1826 bool fua = false, sync = false, discard = false; 1827 1828 pr_debug("%s: stripe %llu\n", __func__, 1829 (unsigned long long)sh->sector); 1830 1831 for (i = disks; i--; ) { 1832 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1833 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1834 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1835 } 1836 1837 for (i = disks; i--; ) { 1838 struct r5dev *dev = &sh->dev[i]; 1839 1840 if (dev->written || i == pd_idx || i == qd_idx) { 1841 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1842 set_bit(R5_UPTODATE, &dev->flags); 1843 if (fua) 1844 set_bit(R5_WantFUA, &dev->flags); 1845 if (sync) 1846 set_bit(R5_SyncIO, &dev->flags); 1847 } 1848 } 1849 1850 if (sh->reconstruct_state == reconstruct_state_drain_run) 1851 sh->reconstruct_state = reconstruct_state_drain_result; 1852 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1853 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1854 else { 1855 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1856 sh->reconstruct_state = reconstruct_state_result; 1857 } 1858 1859 set_bit(STRIPE_HANDLE, &sh->state); 1860 raid5_release_stripe(sh); 1861 } 1862 1863 static void 1864 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1865 struct dma_async_tx_descriptor *tx) 1866 { 1867 int disks = sh->disks; 1868 struct page **xor_srcs; 1869 struct async_submit_ctl submit; 1870 int count, pd_idx = sh->pd_idx, i; 1871 struct page *xor_dest; 1872 int prexor = 0; 1873 unsigned long flags; 1874 int j = 0; 1875 struct stripe_head *head_sh = sh; 1876 int last_stripe; 1877 1878 pr_debug("%s: stripe %llu\n", __func__, 1879 (unsigned long long)sh->sector); 1880 1881 for (i = 0; i < sh->disks; i++) { 1882 if (pd_idx == i) 1883 continue; 1884 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1885 break; 1886 } 1887 if (i >= sh->disks) { 1888 atomic_inc(&sh->count); 1889 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1890 ops_complete_reconstruct(sh); 1891 return; 1892 } 1893 again: 1894 count = 0; 1895 xor_srcs = to_addr_page(percpu, j); 1896 /* check if prexor is active which means only process blocks 1897 * that are part of a read-modify-write (written) 1898 */ 1899 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1900 prexor = 1; 1901 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1902 for (i = disks; i--; ) { 1903 struct r5dev *dev = &sh->dev[i]; 1904 if (head_sh->dev[i].written || 1905 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1906 xor_srcs[count++] = dev->page; 1907 } 1908 } else { 1909 xor_dest = sh->dev[pd_idx].page; 1910 for (i = disks; i--; ) { 1911 struct r5dev *dev = &sh->dev[i]; 1912 if (i != pd_idx) 1913 xor_srcs[count++] = dev->page; 1914 } 1915 } 1916 1917 /* 1/ if we prexor'd then the dest is reused as a source 1918 * 2/ if we did not prexor then we are redoing the parity 1919 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1920 * for the synchronous xor case 1921 */ 1922 last_stripe = !head_sh->batch_head || 1923 list_first_entry(&sh->batch_list, 1924 struct stripe_head, batch_list) == head_sh; 1925 if (last_stripe) { 1926 flags = ASYNC_TX_ACK | 1927 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1928 1929 atomic_inc(&head_sh->count); 1930 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1931 to_addr_conv(sh, percpu, j)); 1932 } else { 1933 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1934 init_async_submit(&submit, flags, tx, NULL, NULL, 1935 to_addr_conv(sh, percpu, j)); 1936 } 1937 1938 if (unlikely(count == 1)) 1939 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1940 else 1941 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1942 if (!last_stripe) { 1943 j++; 1944 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1945 batch_list); 1946 goto again; 1947 } 1948 } 1949 1950 static void 1951 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1952 struct dma_async_tx_descriptor *tx) 1953 { 1954 struct async_submit_ctl submit; 1955 struct page **blocks; 1956 int count, i, j = 0; 1957 struct stripe_head *head_sh = sh; 1958 int last_stripe; 1959 int synflags; 1960 unsigned long txflags; 1961 1962 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1963 1964 for (i = 0; i < sh->disks; i++) { 1965 if (sh->pd_idx == i || sh->qd_idx == i) 1966 continue; 1967 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1968 break; 1969 } 1970 if (i >= sh->disks) { 1971 atomic_inc(&sh->count); 1972 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1973 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1974 ops_complete_reconstruct(sh); 1975 return; 1976 } 1977 1978 again: 1979 blocks = to_addr_page(percpu, j); 1980 1981 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1982 synflags = SYNDROME_SRC_WRITTEN; 1983 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1984 } else { 1985 synflags = SYNDROME_SRC_ALL; 1986 txflags = ASYNC_TX_ACK; 1987 } 1988 1989 count = set_syndrome_sources(blocks, sh, synflags); 1990 last_stripe = !head_sh->batch_head || 1991 list_first_entry(&sh->batch_list, 1992 struct stripe_head, batch_list) == head_sh; 1993 1994 if (last_stripe) { 1995 atomic_inc(&head_sh->count); 1996 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1997 head_sh, to_addr_conv(sh, percpu, j)); 1998 } else 1999 init_async_submit(&submit, 0, tx, NULL, NULL, 2000 to_addr_conv(sh, percpu, j)); 2001 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 2002 if (!last_stripe) { 2003 j++; 2004 sh = list_first_entry(&sh->batch_list, struct stripe_head, 2005 batch_list); 2006 goto again; 2007 } 2008 } 2009 2010 static void ops_complete_check(void *stripe_head_ref) 2011 { 2012 struct stripe_head *sh = stripe_head_ref; 2013 2014 pr_debug("%s: stripe %llu\n", __func__, 2015 (unsigned long long)sh->sector); 2016 2017 sh->check_state = check_state_check_result; 2018 set_bit(STRIPE_HANDLE, &sh->state); 2019 raid5_release_stripe(sh); 2020 } 2021 2022 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2023 { 2024 int disks = sh->disks; 2025 int pd_idx = sh->pd_idx; 2026 int qd_idx = sh->qd_idx; 2027 struct page *xor_dest; 2028 struct page **xor_srcs = to_addr_page(percpu, 0); 2029 struct dma_async_tx_descriptor *tx; 2030 struct async_submit_ctl submit; 2031 int count; 2032 int i; 2033 2034 pr_debug("%s: stripe %llu\n", __func__, 2035 (unsigned long long)sh->sector); 2036 2037 BUG_ON(sh->batch_head); 2038 count = 0; 2039 xor_dest = sh->dev[pd_idx].page; 2040 xor_srcs[count++] = xor_dest; 2041 for (i = disks; i--; ) { 2042 if (i == pd_idx || i == qd_idx) 2043 continue; 2044 xor_srcs[count++] = sh->dev[i].page; 2045 } 2046 2047 init_async_submit(&submit, 0, NULL, NULL, NULL, 2048 to_addr_conv(sh, percpu, 0)); 2049 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 2050 &sh->ops.zero_sum_result, &submit); 2051 2052 atomic_inc(&sh->count); 2053 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2054 tx = async_trigger_callback(&submit); 2055 } 2056 2057 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2058 { 2059 struct page **srcs = to_addr_page(percpu, 0); 2060 struct async_submit_ctl submit; 2061 int count; 2062 2063 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2064 (unsigned long long)sh->sector, checkp); 2065 2066 BUG_ON(sh->batch_head); 2067 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 2068 if (!checkp) 2069 srcs[count] = NULL; 2070 2071 atomic_inc(&sh->count); 2072 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2073 sh, to_addr_conv(sh, percpu, 0)); 2074 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 2075 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 2076 } 2077 2078 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2079 { 2080 int overlap_clear = 0, i, disks = sh->disks; 2081 struct dma_async_tx_descriptor *tx = NULL; 2082 struct r5conf *conf = sh->raid_conf; 2083 int level = conf->level; 2084 struct raid5_percpu *percpu; 2085 unsigned long cpu; 2086 2087 cpu = get_cpu(); 2088 percpu = per_cpu_ptr(conf->percpu, cpu); 2089 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2090 ops_run_biofill(sh); 2091 overlap_clear++; 2092 } 2093 2094 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2095 if (level < 6) 2096 tx = ops_run_compute5(sh, percpu); 2097 else { 2098 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2099 tx = ops_run_compute6_1(sh, percpu); 2100 else 2101 tx = ops_run_compute6_2(sh, percpu); 2102 } 2103 /* terminate the chain if reconstruct is not set to be run */ 2104 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2105 async_tx_ack(tx); 2106 } 2107 2108 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2109 tx = ops_run_partial_parity(sh, percpu, tx); 2110 2111 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2112 if (level < 6) 2113 tx = ops_run_prexor5(sh, percpu, tx); 2114 else 2115 tx = ops_run_prexor6(sh, percpu, tx); 2116 } 2117 2118 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2119 tx = ops_run_biodrain(sh, tx); 2120 overlap_clear++; 2121 } 2122 2123 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2124 if (level < 6) 2125 ops_run_reconstruct5(sh, percpu, tx); 2126 else 2127 ops_run_reconstruct6(sh, percpu, tx); 2128 } 2129 2130 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2131 if (sh->check_state == check_state_run) 2132 ops_run_check_p(sh, percpu); 2133 else if (sh->check_state == check_state_run_q) 2134 ops_run_check_pq(sh, percpu, 0); 2135 else if (sh->check_state == check_state_run_pq) 2136 ops_run_check_pq(sh, percpu, 1); 2137 else 2138 BUG(); 2139 } 2140 2141 if (overlap_clear && !sh->batch_head) 2142 for (i = disks; i--; ) { 2143 struct r5dev *dev = &sh->dev[i]; 2144 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2145 wake_up(&sh->raid_conf->wait_for_overlap); 2146 } 2147 put_cpu(); 2148 } 2149 2150 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2151 int disks) 2152 { 2153 struct stripe_head *sh; 2154 int i; 2155 2156 sh = kmem_cache_zalloc(sc, gfp); 2157 if (sh) { 2158 spin_lock_init(&sh->stripe_lock); 2159 spin_lock_init(&sh->batch_lock); 2160 INIT_LIST_HEAD(&sh->batch_list); 2161 INIT_LIST_HEAD(&sh->lru); 2162 INIT_LIST_HEAD(&sh->r5c); 2163 INIT_LIST_HEAD(&sh->log_list); 2164 atomic_set(&sh->count, 1); 2165 sh->log_start = MaxSector; 2166 for (i = 0; i < disks; i++) { 2167 struct r5dev *dev = &sh->dev[i]; 2168 2169 bio_init(&dev->req, &dev->vec, 1); 2170 bio_init(&dev->rreq, &dev->rvec, 1); 2171 } 2172 } 2173 return sh; 2174 } 2175 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2176 { 2177 struct stripe_head *sh; 2178 2179 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); 2180 if (!sh) 2181 return 0; 2182 2183 sh->raid_conf = conf; 2184 2185 if (grow_buffers(sh, gfp)) { 2186 shrink_buffers(sh); 2187 kmem_cache_free(conf->slab_cache, sh); 2188 return 0; 2189 } 2190 sh->hash_lock_index = 2191 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2192 /* we just created an active stripe so... */ 2193 atomic_inc(&conf->active_stripes); 2194 2195 raid5_release_stripe(sh); 2196 conf->max_nr_stripes++; 2197 return 1; 2198 } 2199 2200 static int grow_stripes(struct r5conf *conf, int num) 2201 { 2202 struct kmem_cache *sc; 2203 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2204 2205 if (conf->mddev->gendisk) 2206 sprintf(conf->cache_name[0], 2207 "raid%d-%s", conf->level, mdname(conf->mddev)); 2208 else 2209 sprintf(conf->cache_name[0], 2210 "raid%d-%p", conf->level, conf->mddev); 2211 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2212 2213 conf->active_name = 0; 2214 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2215 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2216 0, 0, NULL); 2217 if (!sc) 2218 return 1; 2219 conf->slab_cache = sc; 2220 conf->pool_size = devs; 2221 while (num--) 2222 if (!grow_one_stripe(conf, GFP_KERNEL)) 2223 return 1; 2224 2225 return 0; 2226 } 2227 2228 /** 2229 * scribble_len - return the required size of the scribble region 2230 * @num - total number of disks in the array 2231 * 2232 * The size must be enough to contain: 2233 * 1/ a struct page pointer for each device in the array +2 2234 * 2/ room to convert each entry in (1) to its corresponding dma 2235 * (dma_map_page()) or page (page_address()) address. 2236 * 2237 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2238 * calculate over all devices (not just the data blocks), using zeros in place 2239 * of the P and Q blocks. 2240 */ 2241 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2242 { 2243 struct flex_array *ret; 2244 size_t len; 2245 2246 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2247 ret = flex_array_alloc(len, cnt, flags); 2248 if (!ret) 2249 return NULL; 2250 /* always prealloc all elements, so no locking is required */ 2251 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2252 flex_array_free(ret); 2253 return NULL; 2254 } 2255 return ret; 2256 } 2257 2258 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2259 { 2260 unsigned long cpu; 2261 int err = 0; 2262 2263 /* 2264 * Never shrink. And mddev_suspend() could deadlock if this is called 2265 * from raid5d. In that case, scribble_disks and scribble_sectors 2266 * should equal to new_disks and new_sectors 2267 */ 2268 if (conf->scribble_disks >= new_disks && 2269 conf->scribble_sectors >= new_sectors) 2270 return 0; 2271 mddev_suspend(conf->mddev); 2272 get_online_cpus(); 2273 for_each_present_cpu(cpu) { 2274 struct raid5_percpu *percpu; 2275 struct flex_array *scribble; 2276 2277 percpu = per_cpu_ptr(conf->percpu, cpu); 2278 scribble = scribble_alloc(new_disks, 2279 new_sectors / STRIPE_SECTORS, 2280 GFP_NOIO); 2281 2282 if (scribble) { 2283 flex_array_free(percpu->scribble); 2284 percpu->scribble = scribble; 2285 } else { 2286 err = -ENOMEM; 2287 break; 2288 } 2289 } 2290 put_online_cpus(); 2291 mddev_resume(conf->mddev); 2292 if (!err) { 2293 conf->scribble_disks = new_disks; 2294 conf->scribble_sectors = new_sectors; 2295 } 2296 return err; 2297 } 2298 2299 static int resize_stripes(struct r5conf *conf, int newsize) 2300 { 2301 /* Make all the stripes able to hold 'newsize' devices. 2302 * New slots in each stripe get 'page' set to a new page. 2303 * 2304 * This happens in stages: 2305 * 1/ create a new kmem_cache and allocate the required number of 2306 * stripe_heads. 2307 * 2/ gather all the old stripe_heads and transfer the pages across 2308 * to the new stripe_heads. This will have the side effect of 2309 * freezing the array as once all stripe_heads have been collected, 2310 * no IO will be possible. Old stripe heads are freed once their 2311 * pages have been transferred over, and the old kmem_cache is 2312 * freed when all stripes are done. 2313 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 2314 * we simple return a failre status - no need to clean anything up. 2315 * 4/ allocate new pages for the new slots in the new stripe_heads. 2316 * If this fails, we don't bother trying the shrink the 2317 * stripe_heads down again, we just leave them as they are. 2318 * As each stripe_head is processed the new one is released into 2319 * active service. 2320 * 2321 * Once step2 is started, we cannot afford to wait for a write, 2322 * so we use GFP_NOIO allocations. 2323 */ 2324 struct stripe_head *osh, *nsh; 2325 LIST_HEAD(newstripes); 2326 struct disk_info *ndisks; 2327 int err; 2328 struct kmem_cache *sc; 2329 int i; 2330 int hash, cnt; 2331 2332 if (newsize <= conf->pool_size) 2333 return 0; /* never bother to shrink */ 2334 2335 err = md_allow_write(conf->mddev); 2336 if (err) 2337 return err; 2338 2339 /* Step 1 */ 2340 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2341 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2342 0, 0, NULL); 2343 if (!sc) 2344 return -ENOMEM; 2345 2346 /* Need to ensure auto-resizing doesn't interfere */ 2347 mutex_lock(&conf->cache_size_mutex); 2348 2349 for (i = conf->max_nr_stripes; i; i--) { 2350 nsh = alloc_stripe(sc, GFP_KERNEL, newsize); 2351 if (!nsh) 2352 break; 2353 2354 nsh->raid_conf = conf; 2355 list_add(&nsh->lru, &newstripes); 2356 } 2357 if (i) { 2358 /* didn't get enough, give up */ 2359 while (!list_empty(&newstripes)) { 2360 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2361 list_del(&nsh->lru); 2362 kmem_cache_free(sc, nsh); 2363 } 2364 kmem_cache_destroy(sc); 2365 mutex_unlock(&conf->cache_size_mutex); 2366 return -ENOMEM; 2367 } 2368 /* Step 2 - Must use GFP_NOIO now. 2369 * OK, we have enough stripes, start collecting inactive 2370 * stripes and copying them over 2371 */ 2372 hash = 0; 2373 cnt = 0; 2374 list_for_each_entry(nsh, &newstripes, lru) { 2375 lock_device_hash_lock(conf, hash); 2376 wait_event_cmd(conf->wait_for_stripe, 2377 !list_empty(conf->inactive_list + hash), 2378 unlock_device_hash_lock(conf, hash), 2379 lock_device_hash_lock(conf, hash)); 2380 osh = get_free_stripe(conf, hash); 2381 unlock_device_hash_lock(conf, hash); 2382 2383 for(i=0; i<conf->pool_size; i++) { 2384 nsh->dev[i].page = osh->dev[i].page; 2385 nsh->dev[i].orig_page = osh->dev[i].page; 2386 } 2387 nsh->hash_lock_index = hash; 2388 kmem_cache_free(conf->slab_cache, osh); 2389 cnt++; 2390 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2391 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2392 hash++; 2393 cnt = 0; 2394 } 2395 } 2396 kmem_cache_destroy(conf->slab_cache); 2397 2398 /* Step 3. 2399 * At this point, we are holding all the stripes so the array 2400 * is completely stalled, so now is a good time to resize 2401 * conf->disks and the scribble region 2402 */ 2403 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2404 if (ndisks) { 2405 for (i = 0; i < conf->pool_size; i++) 2406 ndisks[i] = conf->disks[i]; 2407 2408 for (i = conf->pool_size; i < newsize; i++) { 2409 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2410 if (!ndisks[i].extra_page) 2411 err = -ENOMEM; 2412 } 2413 2414 if (err) { 2415 for (i = conf->pool_size; i < newsize; i++) 2416 if (ndisks[i].extra_page) 2417 put_page(ndisks[i].extra_page); 2418 kfree(ndisks); 2419 } else { 2420 kfree(conf->disks); 2421 conf->disks = ndisks; 2422 } 2423 } else 2424 err = -ENOMEM; 2425 2426 mutex_unlock(&conf->cache_size_mutex); 2427 /* Step 4, return new stripes to service */ 2428 while(!list_empty(&newstripes)) { 2429 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2430 list_del_init(&nsh->lru); 2431 2432 for (i=conf->raid_disks; i < newsize; i++) 2433 if (nsh->dev[i].page == NULL) { 2434 struct page *p = alloc_page(GFP_NOIO); 2435 nsh->dev[i].page = p; 2436 nsh->dev[i].orig_page = p; 2437 if (!p) 2438 err = -ENOMEM; 2439 } 2440 raid5_release_stripe(nsh); 2441 } 2442 /* critical section pass, GFP_NOIO no longer needed */ 2443 2444 conf->slab_cache = sc; 2445 conf->active_name = 1-conf->active_name; 2446 if (!err) 2447 conf->pool_size = newsize; 2448 return err; 2449 } 2450 2451 static int drop_one_stripe(struct r5conf *conf) 2452 { 2453 struct stripe_head *sh; 2454 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2455 2456 spin_lock_irq(conf->hash_locks + hash); 2457 sh = get_free_stripe(conf, hash); 2458 spin_unlock_irq(conf->hash_locks + hash); 2459 if (!sh) 2460 return 0; 2461 BUG_ON(atomic_read(&sh->count)); 2462 shrink_buffers(sh); 2463 kmem_cache_free(conf->slab_cache, sh); 2464 atomic_dec(&conf->active_stripes); 2465 conf->max_nr_stripes--; 2466 return 1; 2467 } 2468 2469 static void shrink_stripes(struct r5conf *conf) 2470 { 2471 while (conf->max_nr_stripes && 2472 drop_one_stripe(conf)) 2473 ; 2474 2475 kmem_cache_destroy(conf->slab_cache); 2476 conf->slab_cache = NULL; 2477 } 2478 2479 static void raid5_end_read_request(struct bio * bi) 2480 { 2481 struct stripe_head *sh = bi->bi_private; 2482 struct r5conf *conf = sh->raid_conf; 2483 int disks = sh->disks, i; 2484 char b[BDEVNAME_SIZE]; 2485 struct md_rdev *rdev = NULL; 2486 sector_t s; 2487 2488 for (i=0 ; i<disks; i++) 2489 if (bi == &sh->dev[i].req) 2490 break; 2491 2492 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2493 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2494 bi->bi_error); 2495 if (i == disks) { 2496 bio_reset(bi); 2497 BUG(); 2498 return; 2499 } 2500 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2501 /* If replacement finished while this request was outstanding, 2502 * 'replacement' might be NULL already. 2503 * In that case it moved down to 'rdev'. 2504 * rdev is not removed until all requests are finished. 2505 */ 2506 rdev = conf->disks[i].replacement; 2507 if (!rdev) 2508 rdev = conf->disks[i].rdev; 2509 2510 if (use_new_offset(conf, sh)) 2511 s = sh->sector + rdev->new_data_offset; 2512 else 2513 s = sh->sector + rdev->data_offset; 2514 if (!bi->bi_error) { 2515 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2516 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2517 /* Note that this cannot happen on a 2518 * replacement device. We just fail those on 2519 * any error 2520 */ 2521 pr_info_ratelimited( 2522 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2523 mdname(conf->mddev), STRIPE_SECTORS, 2524 (unsigned long long)s, 2525 bdevname(rdev->bdev, b)); 2526 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2527 clear_bit(R5_ReadError, &sh->dev[i].flags); 2528 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2529 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2530 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2531 2532 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2533 /* 2534 * end read for a page in journal, this 2535 * must be preparing for prexor in rmw 2536 */ 2537 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2538 2539 if (atomic_read(&rdev->read_errors)) 2540 atomic_set(&rdev->read_errors, 0); 2541 } else { 2542 const char *bdn = bdevname(rdev->bdev, b); 2543 int retry = 0; 2544 int set_bad = 0; 2545 2546 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2547 atomic_inc(&rdev->read_errors); 2548 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2549 pr_warn_ratelimited( 2550 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2551 mdname(conf->mddev), 2552 (unsigned long long)s, 2553 bdn); 2554 else if (conf->mddev->degraded >= conf->max_degraded) { 2555 set_bad = 1; 2556 pr_warn_ratelimited( 2557 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2558 mdname(conf->mddev), 2559 (unsigned long long)s, 2560 bdn); 2561 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2562 /* Oh, no!!! */ 2563 set_bad = 1; 2564 pr_warn_ratelimited( 2565 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", 2566 mdname(conf->mddev), 2567 (unsigned long long)s, 2568 bdn); 2569 } else if (atomic_read(&rdev->read_errors) 2570 > conf->max_nr_stripes) 2571 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2572 mdname(conf->mddev), bdn); 2573 else 2574 retry = 1; 2575 if (set_bad && test_bit(In_sync, &rdev->flags) 2576 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2577 retry = 1; 2578 if (retry) 2579 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2580 set_bit(R5_ReadError, &sh->dev[i].flags); 2581 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2582 } else 2583 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2584 else { 2585 clear_bit(R5_ReadError, &sh->dev[i].flags); 2586 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2587 if (!(set_bad 2588 && test_bit(In_sync, &rdev->flags) 2589 && rdev_set_badblocks( 2590 rdev, sh->sector, STRIPE_SECTORS, 0))) 2591 md_error(conf->mddev, rdev); 2592 } 2593 } 2594 rdev_dec_pending(rdev, conf->mddev); 2595 bio_reset(bi); 2596 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2597 set_bit(STRIPE_HANDLE, &sh->state); 2598 raid5_release_stripe(sh); 2599 } 2600 2601 static void raid5_end_write_request(struct bio *bi) 2602 { 2603 struct stripe_head *sh = bi->bi_private; 2604 struct r5conf *conf = sh->raid_conf; 2605 int disks = sh->disks, i; 2606 struct md_rdev *uninitialized_var(rdev); 2607 sector_t first_bad; 2608 int bad_sectors; 2609 int replacement = 0; 2610 2611 for (i = 0 ; i < disks; i++) { 2612 if (bi == &sh->dev[i].req) { 2613 rdev = conf->disks[i].rdev; 2614 break; 2615 } 2616 if (bi == &sh->dev[i].rreq) { 2617 rdev = conf->disks[i].replacement; 2618 if (rdev) 2619 replacement = 1; 2620 else 2621 /* rdev was removed and 'replacement' 2622 * replaced it. rdev is not removed 2623 * until all requests are finished. 2624 */ 2625 rdev = conf->disks[i].rdev; 2626 break; 2627 } 2628 } 2629 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2630 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2631 bi->bi_error); 2632 if (i == disks) { 2633 bio_reset(bi); 2634 BUG(); 2635 return; 2636 } 2637 2638 if (replacement) { 2639 if (bi->bi_error) 2640 md_error(conf->mddev, rdev); 2641 else if (is_badblock(rdev, sh->sector, 2642 STRIPE_SECTORS, 2643 &first_bad, &bad_sectors)) 2644 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2645 } else { 2646 if (bi->bi_error) { 2647 set_bit(STRIPE_DEGRADED, &sh->state); 2648 set_bit(WriteErrorSeen, &rdev->flags); 2649 set_bit(R5_WriteError, &sh->dev[i].flags); 2650 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2651 set_bit(MD_RECOVERY_NEEDED, 2652 &rdev->mddev->recovery); 2653 } else if (is_badblock(rdev, sh->sector, 2654 STRIPE_SECTORS, 2655 &first_bad, &bad_sectors)) { 2656 set_bit(R5_MadeGood, &sh->dev[i].flags); 2657 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2658 /* That was a successful write so make 2659 * sure it looks like we already did 2660 * a re-write. 2661 */ 2662 set_bit(R5_ReWrite, &sh->dev[i].flags); 2663 } 2664 } 2665 rdev_dec_pending(rdev, conf->mddev); 2666 2667 if (sh->batch_head && bi->bi_error && !replacement) 2668 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2669 2670 bio_reset(bi); 2671 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2672 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2673 set_bit(STRIPE_HANDLE, &sh->state); 2674 raid5_release_stripe(sh); 2675 2676 if (sh->batch_head && sh != sh->batch_head) 2677 raid5_release_stripe(sh->batch_head); 2678 } 2679 2680 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2681 { 2682 struct r5dev *dev = &sh->dev[i]; 2683 2684 dev->flags = 0; 2685 dev->sector = raid5_compute_blocknr(sh, i, previous); 2686 } 2687 2688 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2689 { 2690 char b[BDEVNAME_SIZE]; 2691 struct r5conf *conf = mddev->private; 2692 unsigned long flags; 2693 pr_debug("raid456: error called\n"); 2694 2695 spin_lock_irqsave(&conf->device_lock, flags); 2696 clear_bit(In_sync, &rdev->flags); 2697 mddev->degraded = raid5_calc_degraded(conf); 2698 spin_unlock_irqrestore(&conf->device_lock, flags); 2699 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2700 2701 set_bit(Blocked, &rdev->flags); 2702 set_bit(Faulty, &rdev->flags); 2703 set_mask_bits(&mddev->sb_flags, 0, 2704 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2705 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2706 "md/raid:%s: Operation continuing on %d devices.\n", 2707 mdname(mddev), 2708 bdevname(rdev->bdev, b), 2709 mdname(mddev), 2710 conf->raid_disks - mddev->degraded); 2711 r5c_update_on_rdev_error(mddev); 2712 } 2713 2714 /* 2715 * Input: a 'big' sector number, 2716 * Output: index of the data and parity disk, and the sector # in them. 2717 */ 2718 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2719 int previous, int *dd_idx, 2720 struct stripe_head *sh) 2721 { 2722 sector_t stripe, stripe2; 2723 sector_t chunk_number; 2724 unsigned int chunk_offset; 2725 int pd_idx, qd_idx; 2726 int ddf_layout = 0; 2727 sector_t new_sector; 2728 int algorithm = previous ? conf->prev_algo 2729 : conf->algorithm; 2730 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2731 : conf->chunk_sectors; 2732 int raid_disks = previous ? conf->previous_raid_disks 2733 : conf->raid_disks; 2734 int data_disks = raid_disks - conf->max_degraded; 2735 2736 /* First compute the information on this sector */ 2737 2738 /* 2739 * Compute the chunk number and the sector offset inside the chunk 2740 */ 2741 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2742 chunk_number = r_sector; 2743 2744 /* 2745 * Compute the stripe number 2746 */ 2747 stripe = chunk_number; 2748 *dd_idx = sector_div(stripe, data_disks); 2749 stripe2 = stripe; 2750 /* 2751 * Select the parity disk based on the user selected algorithm. 2752 */ 2753 pd_idx = qd_idx = -1; 2754 switch(conf->level) { 2755 case 4: 2756 pd_idx = data_disks; 2757 break; 2758 case 5: 2759 switch (algorithm) { 2760 case ALGORITHM_LEFT_ASYMMETRIC: 2761 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2762 if (*dd_idx >= pd_idx) 2763 (*dd_idx)++; 2764 break; 2765 case ALGORITHM_RIGHT_ASYMMETRIC: 2766 pd_idx = sector_div(stripe2, raid_disks); 2767 if (*dd_idx >= pd_idx) 2768 (*dd_idx)++; 2769 break; 2770 case ALGORITHM_LEFT_SYMMETRIC: 2771 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2772 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2773 break; 2774 case ALGORITHM_RIGHT_SYMMETRIC: 2775 pd_idx = sector_div(stripe2, raid_disks); 2776 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2777 break; 2778 case ALGORITHM_PARITY_0: 2779 pd_idx = 0; 2780 (*dd_idx)++; 2781 break; 2782 case ALGORITHM_PARITY_N: 2783 pd_idx = data_disks; 2784 break; 2785 default: 2786 BUG(); 2787 } 2788 break; 2789 case 6: 2790 2791 switch (algorithm) { 2792 case ALGORITHM_LEFT_ASYMMETRIC: 2793 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2794 qd_idx = pd_idx + 1; 2795 if (pd_idx == raid_disks-1) { 2796 (*dd_idx)++; /* Q D D D P */ 2797 qd_idx = 0; 2798 } else if (*dd_idx >= pd_idx) 2799 (*dd_idx) += 2; /* D D P Q D */ 2800 break; 2801 case ALGORITHM_RIGHT_ASYMMETRIC: 2802 pd_idx = sector_div(stripe2, raid_disks); 2803 qd_idx = pd_idx + 1; 2804 if (pd_idx == raid_disks-1) { 2805 (*dd_idx)++; /* Q D D D P */ 2806 qd_idx = 0; 2807 } else if (*dd_idx >= pd_idx) 2808 (*dd_idx) += 2; /* D D P Q D */ 2809 break; 2810 case ALGORITHM_LEFT_SYMMETRIC: 2811 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2812 qd_idx = (pd_idx + 1) % raid_disks; 2813 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2814 break; 2815 case ALGORITHM_RIGHT_SYMMETRIC: 2816 pd_idx = sector_div(stripe2, raid_disks); 2817 qd_idx = (pd_idx + 1) % raid_disks; 2818 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2819 break; 2820 2821 case ALGORITHM_PARITY_0: 2822 pd_idx = 0; 2823 qd_idx = 1; 2824 (*dd_idx) += 2; 2825 break; 2826 case ALGORITHM_PARITY_N: 2827 pd_idx = data_disks; 2828 qd_idx = data_disks + 1; 2829 break; 2830 2831 case ALGORITHM_ROTATING_ZERO_RESTART: 2832 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2833 * of blocks for computing Q is different. 2834 */ 2835 pd_idx = sector_div(stripe2, raid_disks); 2836 qd_idx = pd_idx + 1; 2837 if (pd_idx == raid_disks-1) { 2838 (*dd_idx)++; /* Q D D D P */ 2839 qd_idx = 0; 2840 } else if (*dd_idx >= pd_idx) 2841 (*dd_idx) += 2; /* D D P Q D */ 2842 ddf_layout = 1; 2843 break; 2844 2845 case ALGORITHM_ROTATING_N_RESTART: 2846 /* Same a left_asymmetric, by first stripe is 2847 * D D D P Q rather than 2848 * Q D D D P 2849 */ 2850 stripe2 += 1; 2851 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2852 qd_idx = pd_idx + 1; 2853 if (pd_idx == raid_disks-1) { 2854 (*dd_idx)++; /* Q D D D P */ 2855 qd_idx = 0; 2856 } else if (*dd_idx >= pd_idx) 2857 (*dd_idx) += 2; /* D D P Q D */ 2858 ddf_layout = 1; 2859 break; 2860 2861 case ALGORITHM_ROTATING_N_CONTINUE: 2862 /* Same as left_symmetric but Q is before P */ 2863 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2864 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2865 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2866 ddf_layout = 1; 2867 break; 2868 2869 case ALGORITHM_LEFT_ASYMMETRIC_6: 2870 /* RAID5 left_asymmetric, with Q on last device */ 2871 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2872 if (*dd_idx >= pd_idx) 2873 (*dd_idx)++; 2874 qd_idx = raid_disks - 1; 2875 break; 2876 2877 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2878 pd_idx = sector_div(stripe2, raid_disks-1); 2879 if (*dd_idx >= pd_idx) 2880 (*dd_idx)++; 2881 qd_idx = raid_disks - 1; 2882 break; 2883 2884 case ALGORITHM_LEFT_SYMMETRIC_6: 2885 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2886 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2887 qd_idx = raid_disks - 1; 2888 break; 2889 2890 case ALGORITHM_RIGHT_SYMMETRIC_6: 2891 pd_idx = sector_div(stripe2, raid_disks-1); 2892 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2893 qd_idx = raid_disks - 1; 2894 break; 2895 2896 case ALGORITHM_PARITY_0_6: 2897 pd_idx = 0; 2898 (*dd_idx)++; 2899 qd_idx = raid_disks - 1; 2900 break; 2901 2902 default: 2903 BUG(); 2904 } 2905 break; 2906 } 2907 2908 if (sh) { 2909 sh->pd_idx = pd_idx; 2910 sh->qd_idx = qd_idx; 2911 sh->ddf_layout = ddf_layout; 2912 } 2913 /* 2914 * Finally, compute the new sector number 2915 */ 2916 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2917 return new_sector; 2918 } 2919 2920 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2921 { 2922 struct r5conf *conf = sh->raid_conf; 2923 int raid_disks = sh->disks; 2924 int data_disks = raid_disks - conf->max_degraded; 2925 sector_t new_sector = sh->sector, check; 2926 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2927 : conf->chunk_sectors; 2928 int algorithm = previous ? conf->prev_algo 2929 : conf->algorithm; 2930 sector_t stripe; 2931 int chunk_offset; 2932 sector_t chunk_number; 2933 int dummy1, dd_idx = i; 2934 sector_t r_sector; 2935 struct stripe_head sh2; 2936 2937 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2938 stripe = new_sector; 2939 2940 if (i == sh->pd_idx) 2941 return 0; 2942 switch(conf->level) { 2943 case 4: break; 2944 case 5: 2945 switch (algorithm) { 2946 case ALGORITHM_LEFT_ASYMMETRIC: 2947 case ALGORITHM_RIGHT_ASYMMETRIC: 2948 if (i > sh->pd_idx) 2949 i--; 2950 break; 2951 case ALGORITHM_LEFT_SYMMETRIC: 2952 case ALGORITHM_RIGHT_SYMMETRIC: 2953 if (i < sh->pd_idx) 2954 i += raid_disks; 2955 i -= (sh->pd_idx + 1); 2956 break; 2957 case ALGORITHM_PARITY_0: 2958 i -= 1; 2959 break; 2960 case ALGORITHM_PARITY_N: 2961 break; 2962 default: 2963 BUG(); 2964 } 2965 break; 2966 case 6: 2967 if (i == sh->qd_idx) 2968 return 0; /* It is the Q disk */ 2969 switch (algorithm) { 2970 case ALGORITHM_LEFT_ASYMMETRIC: 2971 case ALGORITHM_RIGHT_ASYMMETRIC: 2972 case ALGORITHM_ROTATING_ZERO_RESTART: 2973 case ALGORITHM_ROTATING_N_RESTART: 2974 if (sh->pd_idx == raid_disks-1) 2975 i--; /* Q D D D P */ 2976 else if (i > sh->pd_idx) 2977 i -= 2; /* D D P Q D */ 2978 break; 2979 case ALGORITHM_LEFT_SYMMETRIC: 2980 case ALGORITHM_RIGHT_SYMMETRIC: 2981 if (sh->pd_idx == raid_disks-1) 2982 i--; /* Q D D D P */ 2983 else { 2984 /* D D P Q D */ 2985 if (i < sh->pd_idx) 2986 i += raid_disks; 2987 i -= (sh->pd_idx + 2); 2988 } 2989 break; 2990 case ALGORITHM_PARITY_0: 2991 i -= 2; 2992 break; 2993 case ALGORITHM_PARITY_N: 2994 break; 2995 case ALGORITHM_ROTATING_N_CONTINUE: 2996 /* Like left_symmetric, but P is before Q */ 2997 if (sh->pd_idx == 0) 2998 i--; /* P D D D Q */ 2999 else { 3000 /* D D Q P D */ 3001 if (i < sh->pd_idx) 3002 i += raid_disks; 3003 i -= (sh->pd_idx + 1); 3004 } 3005 break; 3006 case ALGORITHM_LEFT_ASYMMETRIC_6: 3007 case ALGORITHM_RIGHT_ASYMMETRIC_6: 3008 if (i > sh->pd_idx) 3009 i--; 3010 break; 3011 case ALGORITHM_LEFT_SYMMETRIC_6: 3012 case ALGORITHM_RIGHT_SYMMETRIC_6: 3013 if (i < sh->pd_idx) 3014 i += data_disks + 1; 3015 i -= (sh->pd_idx + 1); 3016 break; 3017 case ALGORITHM_PARITY_0_6: 3018 i -= 1; 3019 break; 3020 default: 3021 BUG(); 3022 } 3023 break; 3024 } 3025 3026 chunk_number = stripe * data_disks + i; 3027 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3028 3029 check = raid5_compute_sector(conf, r_sector, 3030 previous, &dummy1, &sh2); 3031 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3032 || sh2.qd_idx != sh->qd_idx) { 3033 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3034 mdname(conf->mddev)); 3035 return 0; 3036 } 3037 return r_sector; 3038 } 3039 3040 /* 3041 * There are cases where we want handle_stripe_dirtying() and 3042 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3043 * 3044 * This function checks whether we want to delay the towrite. Specifically, 3045 * we delay the towrite when: 3046 * 3047 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3048 * stripe has data in journal (for other devices). 3049 * 3050 * In this case, when reading data for the non-overwrite dev, it is 3051 * necessary to handle complex rmw of write back cache (prexor with 3052 * orig_page, and xor with page). To keep read path simple, we would 3053 * like to flush data in journal to RAID disks first, so complex rmw 3054 * is handled in the write patch (handle_stripe_dirtying). 3055 * 3056 * 2. when journal space is critical (R5C_LOG_CRITICAL=1) 3057 * 3058 * It is important to be able to flush all stripes in raid5-cache. 3059 * Therefore, we need reserve some space on the journal device for 3060 * these flushes. If flush operation includes pending writes to the 3061 * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe 3062 * for the flush out. If we exclude these pending writes from flush 3063 * operation, we only need (conf->max_degraded + 1) pages per stripe. 3064 * Therefore, excluding pending writes in these cases enables more 3065 * efficient use of the journal device. 3066 * 3067 * Note: To make sure the stripe makes progress, we only delay 3068 * towrite for stripes with data already in journal (injournal > 0). 3069 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3070 * no_space_stripes list. 3071 * 3072 */ 3073 static inline bool delay_towrite(struct r5conf *conf, 3074 struct r5dev *dev, 3075 struct stripe_head_state *s) 3076 { 3077 /* case 1 above */ 3078 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3079 !test_bit(R5_Insync, &dev->flags) && s->injournal) 3080 return true; 3081 /* case 2 above */ 3082 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3083 s->injournal > 0) 3084 return true; 3085 return false; 3086 } 3087 3088 static void 3089 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 3090 int rcw, int expand) 3091 { 3092 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 3093 struct r5conf *conf = sh->raid_conf; 3094 int level = conf->level; 3095 3096 if (rcw) { 3097 /* 3098 * In some cases, handle_stripe_dirtying initially decided to 3099 * run rmw and allocates extra page for prexor. However, rcw is 3100 * cheaper later on. We need to free the extra page now, 3101 * because we won't be able to do that in ops_complete_prexor(). 3102 */ 3103 r5c_release_extra_page(sh); 3104 3105 for (i = disks; i--; ) { 3106 struct r5dev *dev = &sh->dev[i]; 3107 3108 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3109 set_bit(R5_LOCKED, &dev->flags); 3110 set_bit(R5_Wantdrain, &dev->flags); 3111 if (!expand) 3112 clear_bit(R5_UPTODATE, &dev->flags); 3113 s->locked++; 3114 } else if (test_bit(R5_InJournal, &dev->flags)) { 3115 set_bit(R5_LOCKED, &dev->flags); 3116 s->locked++; 3117 } 3118 } 3119 /* if we are not expanding this is a proper write request, and 3120 * there will be bios with new data to be drained into the 3121 * stripe cache 3122 */ 3123 if (!expand) { 3124 if (!s->locked) 3125 /* False alarm, nothing to do */ 3126 return; 3127 sh->reconstruct_state = reconstruct_state_drain_run; 3128 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3129 } else 3130 sh->reconstruct_state = reconstruct_state_run; 3131 3132 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3133 3134 if (s->locked + conf->max_degraded == disks) 3135 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3136 atomic_inc(&conf->pending_full_writes); 3137 } else { 3138 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3139 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3140 BUG_ON(level == 6 && 3141 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3142 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3143 3144 for (i = disks; i--; ) { 3145 struct r5dev *dev = &sh->dev[i]; 3146 if (i == pd_idx || i == qd_idx) 3147 continue; 3148 3149 if (dev->towrite && 3150 (test_bit(R5_UPTODATE, &dev->flags) || 3151 test_bit(R5_Wantcompute, &dev->flags))) { 3152 set_bit(R5_Wantdrain, &dev->flags); 3153 set_bit(R5_LOCKED, &dev->flags); 3154 clear_bit(R5_UPTODATE, &dev->flags); 3155 s->locked++; 3156 } else if (test_bit(R5_InJournal, &dev->flags)) { 3157 set_bit(R5_LOCKED, &dev->flags); 3158 s->locked++; 3159 } 3160 } 3161 if (!s->locked) 3162 /* False alarm - nothing to do */ 3163 return; 3164 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3165 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3166 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3167 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3168 } 3169 3170 /* keep the parity disk(s) locked while asynchronous operations 3171 * are in flight 3172 */ 3173 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3174 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3175 s->locked++; 3176 3177 if (level == 6) { 3178 int qd_idx = sh->qd_idx; 3179 struct r5dev *dev = &sh->dev[qd_idx]; 3180 3181 set_bit(R5_LOCKED, &dev->flags); 3182 clear_bit(R5_UPTODATE, &dev->flags); 3183 s->locked++; 3184 } 3185 3186 if (raid5_has_ppl(sh->raid_conf) && 3187 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3188 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3189 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3190 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3191 3192 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3193 __func__, (unsigned long long)sh->sector, 3194 s->locked, s->ops_request); 3195 } 3196 3197 /* 3198 * Each stripe/dev can have one or more bion attached. 3199 * toread/towrite point to the first in a chain. 3200 * The bi_next chain must be in order. 3201 */ 3202 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3203 int forwrite, int previous) 3204 { 3205 struct bio **bip; 3206 struct r5conf *conf = sh->raid_conf; 3207 int firstwrite=0; 3208 3209 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3210 (unsigned long long)bi->bi_iter.bi_sector, 3211 (unsigned long long)sh->sector); 3212 3213 /* 3214 * If several bio share a stripe. The bio bi_phys_segments acts as a 3215 * reference count to avoid race. The reference count should already be 3216 * increased before this function is called (for example, in 3217 * raid5_make_request()), so other bio sharing this stripe will not free the 3218 * stripe. If a stripe is owned by one stripe, the stripe lock will 3219 * protect it. 3220 */ 3221 spin_lock_irq(&sh->stripe_lock); 3222 /* Don't allow new IO added to stripes in batch list */ 3223 if (sh->batch_head) 3224 goto overlap; 3225 if (forwrite) { 3226 bip = &sh->dev[dd_idx].towrite; 3227 if (*bip == NULL) 3228 firstwrite = 1; 3229 } else 3230 bip = &sh->dev[dd_idx].toread; 3231 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3232 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3233 goto overlap; 3234 bip = & (*bip)->bi_next; 3235 } 3236 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3237 goto overlap; 3238 3239 if (forwrite && raid5_has_ppl(conf)) { 3240 /* 3241 * With PPL only writes to consecutive data chunks within a 3242 * stripe are allowed because for a single stripe_head we can 3243 * only have one PPL entry at a time, which describes one data 3244 * range. Not really an overlap, but wait_for_overlap can be 3245 * used to handle this. 3246 */ 3247 sector_t sector; 3248 sector_t first = 0; 3249 sector_t last = 0; 3250 int count = 0; 3251 int i; 3252 3253 for (i = 0; i < sh->disks; i++) { 3254 if (i != sh->pd_idx && 3255 (i == dd_idx || sh->dev[i].towrite)) { 3256 sector = sh->dev[i].sector; 3257 if (count == 0 || sector < first) 3258 first = sector; 3259 if (sector > last) 3260 last = sector; 3261 count++; 3262 } 3263 } 3264 3265 if (first + conf->chunk_sectors * (count - 1) != last) 3266 goto overlap; 3267 } 3268 3269 if (!forwrite || previous) 3270 clear_bit(STRIPE_BATCH_READY, &sh->state); 3271 3272 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3273 if (*bip) 3274 bi->bi_next = *bip; 3275 *bip = bi; 3276 raid5_inc_bi_active_stripes(bi); 3277 3278 if (forwrite) { 3279 /* check if page is covered */ 3280 sector_t sector = sh->dev[dd_idx].sector; 3281 for (bi=sh->dev[dd_idx].towrite; 3282 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3283 bi && bi->bi_iter.bi_sector <= sector; 3284 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3285 if (bio_end_sector(bi) >= sector) 3286 sector = bio_end_sector(bi); 3287 } 3288 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3289 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3290 sh->overwrite_disks++; 3291 } 3292 3293 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3294 (unsigned long long)(*bip)->bi_iter.bi_sector, 3295 (unsigned long long)sh->sector, dd_idx); 3296 3297 if (conf->mddev->bitmap && firstwrite) { 3298 /* Cannot hold spinlock over bitmap_startwrite, 3299 * but must ensure this isn't added to a batch until 3300 * we have added to the bitmap and set bm_seq. 3301 * So set STRIPE_BITMAP_PENDING to prevent 3302 * batching. 3303 * If multiple add_stripe_bio() calls race here they 3304 * much all set STRIPE_BITMAP_PENDING. So only the first one 3305 * to complete "bitmap_startwrite" gets to set 3306 * STRIPE_BIT_DELAY. This is important as once a stripe 3307 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3308 * any more. 3309 */ 3310 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3311 spin_unlock_irq(&sh->stripe_lock); 3312 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3313 STRIPE_SECTORS, 0); 3314 spin_lock_irq(&sh->stripe_lock); 3315 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3316 if (!sh->batch_head) { 3317 sh->bm_seq = conf->seq_flush+1; 3318 set_bit(STRIPE_BIT_DELAY, &sh->state); 3319 } 3320 } 3321 spin_unlock_irq(&sh->stripe_lock); 3322 3323 if (stripe_can_batch(sh)) 3324 stripe_add_to_batch_list(conf, sh); 3325 return 1; 3326 3327 overlap: 3328 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3329 spin_unlock_irq(&sh->stripe_lock); 3330 return 0; 3331 } 3332 3333 static void end_reshape(struct r5conf *conf); 3334 3335 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3336 struct stripe_head *sh) 3337 { 3338 int sectors_per_chunk = 3339 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3340 int dd_idx; 3341 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3342 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3343 3344 raid5_compute_sector(conf, 3345 stripe * (disks - conf->max_degraded) 3346 *sectors_per_chunk + chunk_offset, 3347 previous, 3348 &dd_idx, sh); 3349 } 3350 3351 static void 3352 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3353 struct stripe_head_state *s, int disks, 3354 struct bio_list *return_bi) 3355 { 3356 int i; 3357 BUG_ON(sh->batch_head); 3358 for (i = disks; i--; ) { 3359 struct bio *bi; 3360 int bitmap_end = 0; 3361 3362 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3363 struct md_rdev *rdev; 3364 rcu_read_lock(); 3365 rdev = rcu_dereference(conf->disks[i].rdev); 3366 if (rdev && test_bit(In_sync, &rdev->flags) && 3367 !test_bit(Faulty, &rdev->flags)) 3368 atomic_inc(&rdev->nr_pending); 3369 else 3370 rdev = NULL; 3371 rcu_read_unlock(); 3372 if (rdev) { 3373 if (!rdev_set_badblocks( 3374 rdev, 3375 sh->sector, 3376 STRIPE_SECTORS, 0)) 3377 md_error(conf->mddev, rdev); 3378 rdev_dec_pending(rdev, conf->mddev); 3379 } 3380 } 3381 spin_lock_irq(&sh->stripe_lock); 3382 /* fail all writes first */ 3383 bi = sh->dev[i].towrite; 3384 sh->dev[i].towrite = NULL; 3385 sh->overwrite_disks = 0; 3386 spin_unlock_irq(&sh->stripe_lock); 3387 if (bi) 3388 bitmap_end = 1; 3389 3390 log_stripe_write_finished(sh); 3391 3392 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3393 wake_up(&conf->wait_for_overlap); 3394 3395 while (bi && bi->bi_iter.bi_sector < 3396 sh->dev[i].sector + STRIPE_SECTORS) { 3397 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3398 3399 bi->bi_error = -EIO; 3400 if (!raid5_dec_bi_active_stripes(bi)) { 3401 md_write_end(conf->mddev); 3402 bio_list_add(return_bi, bi); 3403 } 3404 bi = nextbi; 3405 } 3406 if (bitmap_end) 3407 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3408 STRIPE_SECTORS, 0, 0); 3409 bitmap_end = 0; 3410 /* and fail all 'written' */ 3411 bi = sh->dev[i].written; 3412 sh->dev[i].written = NULL; 3413 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3414 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3415 sh->dev[i].page = sh->dev[i].orig_page; 3416 } 3417 3418 if (bi) bitmap_end = 1; 3419 while (bi && bi->bi_iter.bi_sector < 3420 sh->dev[i].sector + STRIPE_SECTORS) { 3421 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3422 3423 bi->bi_error = -EIO; 3424 if (!raid5_dec_bi_active_stripes(bi)) { 3425 md_write_end(conf->mddev); 3426 bio_list_add(return_bi, bi); 3427 } 3428 bi = bi2; 3429 } 3430 3431 /* fail any reads if this device is non-operational and 3432 * the data has not reached the cache yet. 3433 */ 3434 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3435 s->failed > conf->max_degraded && 3436 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3437 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3438 spin_lock_irq(&sh->stripe_lock); 3439 bi = sh->dev[i].toread; 3440 sh->dev[i].toread = NULL; 3441 spin_unlock_irq(&sh->stripe_lock); 3442 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3443 wake_up(&conf->wait_for_overlap); 3444 if (bi) 3445 s->to_read--; 3446 while (bi && bi->bi_iter.bi_sector < 3447 sh->dev[i].sector + STRIPE_SECTORS) { 3448 struct bio *nextbi = 3449 r5_next_bio(bi, sh->dev[i].sector); 3450 3451 bi->bi_error = -EIO; 3452 if (!raid5_dec_bi_active_stripes(bi)) 3453 bio_list_add(return_bi, bi); 3454 bi = nextbi; 3455 } 3456 } 3457 if (bitmap_end) 3458 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3459 STRIPE_SECTORS, 0, 0); 3460 /* If we were in the middle of a write the parity block might 3461 * still be locked - so just clear all R5_LOCKED flags 3462 */ 3463 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3464 } 3465 s->to_write = 0; 3466 s->written = 0; 3467 3468 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3469 if (atomic_dec_and_test(&conf->pending_full_writes)) 3470 md_wakeup_thread(conf->mddev->thread); 3471 } 3472 3473 static void 3474 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3475 struct stripe_head_state *s) 3476 { 3477 int abort = 0; 3478 int i; 3479 3480 BUG_ON(sh->batch_head); 3481 clear_bit(STRIPE_SYNCING, &sh->state); 3482 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3483 wake_up(&conf->wait_for_overlap); 3484 s->syncing = 0; 3485 s->replacing = 0; 3486 /* There is nothing more to do for sync/check/repair. 3487 * Don't even need to abort as that is handled elsewhere 3488 * if needed, and not always wanted e.g. if there is a known 3489 * bad block here. 3490 * For recover/replace we need to record a bad block on all 3491 * non-sync devices, or abort the recovery 3492 */ 3493 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3494 /* During recovery devices cannot be removed, so 3495 * locking and refcounting of rdevs is not needed 3496 */ 3497 rcu_read_lock(); 3498 for (i = 0; i < conf->raid_disks; i++) { 3499 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3500 if (rdev 3501 && !test_bit(Faulty, &rdev->flags) 3502 && !test_bit(In_sync, &rdev->flags) 3503 && !rdev_set_badblocks(rdev, sh->sector, 3504 STRIPE_SECTORS, 0)) 3505 abort = 1; 3506 rdev = rcu_dereference(conf->disks[i].replacement); 3507 if (rdev 3508 && !test_bit(Faulty, &rdev->flags) 3509 && !test_bit(In_sync, &rdev->flags) 3510 && !rdev_set_badblocks(rdev, sh->sector, 3511 STRIPE_SECTORS, 0)) 3512 abort = 1; 3513 } 3514 rcu_read_unlock(); 3515 if (abort) 3516 conf->recovery_disabled = 3517 conf->mddev->recovery_disabled; 3518 } 3519 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3520 } 3521 3522 static int want_replace(struct stripe_head *sh, int disk_idx) 3523 { 3524 struct md_rdev *rdev; 3525 int rv = 0; 3526 3527 rcu_read_lock(); 3528 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3529 if (rdev 3530 && !test_bit(Faulty, &rdev->flags) 3531 && !test_bit(In_sync, &rdev->flags) 3532 && (rdev->recovery_offset <= sh->sector 3533 || rdev->mddev->recovery_cp <= sh->sector)) 3534 rv = 1; 3535 rcu_read_unlock(); 3536 return rv; 3537 } 3538 3539 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3540 int disk_idx, int disks) 3541 { 3542 struct r5dev *dev = &sh->dev[disk_idx]; 3543 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3544 &sh->dev[s->failed_num[1]] }; 3545 int i; 3546 3547 3548 if (test_bit(R5_LOCKED, &dev->flags) || 3549 test_bit(R5_UPTODATE, &dev->flags)) 3550 /* No point reading this as we already have it or have 3551 * decided to get it. 3552 */ 3553 return 0; 3554 3555 if (dev->toread || 3556 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3557 /* We need this block to directly satisfy a request */ 3558 return 1; 3559 3560 if (s->syncing || s->expanding || 3561 (s->replacing && want_replace(sh, disk_idx))) 3562 /* When syncing, or expanding we read everything. 3563 * When replacing, we need the replaced block. 3564 */ 3565 return 1; 3566 3567 if ((s->failed >= 1 && fdev[0]->toread) || 3568 (s->failed >= 2 && fdev[1]->toread)) 3569 /* If we want to read from a failed device, then 3570 * we need to actually read every other device. 3571 */ 3572 return 1; 3573 3574 /* Sometimes neither read-modify-write nor reconstruct-write 3575 * cycles can work. In those cases we read every block we 3576 * can. Then the parity-update is certain to have enough to 3577 * work with. 3578 * This can only be a problem when we need to write something, 3579 * and some device has failed. If either of those tests 3580 * fail we need look no further. 3581 */ 3582 if (!s->failed || !s->to_write) 3583 return 0; 3584 3585 if (test_bit(R5_Insync, &dev->flags) && 3586 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3587 /* Pre-reads at not permitted until after short delay 3588 * to gather multiple requests. However if this 3589 * device is no Insync, the block could only be be computed 3590 * and there is no need to delay that. 3591 */ 3592 return 0; 3593 3594 for (i = 0; i < s->failed && i < 2; i++) { 3595 if (fdev[i]->towrite && 3596 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3597 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3598 /* If we have a partial write to a failed 3599 * device, then we will need to reconstruct 3600 * the content of that device, so all other 3601 * devices must be read. 3602 */ 3603 return 1; 3604 } 3605 3606 /* If we are forced to do a reconstruct-write, either because 3607 * the current RAID6 implementation only supports that, or 3608 * or because parity cannot be trusted and we are currently 3609 * recovering it, there is extra need to be careful. 3610 * If one of the devices that we would need to read, because 3611 * it is not being overwritten (and maybe not written at all) 3612 * is missing/faulty, then we need to read everything we can. 3613 */ 3614 if (sh->raid_conf->level != 6 && 3615 sh->sector < sh->raid_conf->mddev->recovery_cp) 3616 /* reconstruct-write isn't being forced */ 3617 return 0; 3618 for (i = 0; i < s->failed && i < 2; i++) { 3619 if (s->failed_num[i] != sh->pd_idx && 3620 s->failed_num[i] != sh->qd_idx && 3621 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3622 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3623 return 1; 3624 } 3625 3626 return 0; 3627 } 3628 3629 /* fetch_block - checks the given member device to see if its data needs 3630 * to be read or computed to satisfy a request. 3631 * 3632 * Returns 1 when no more member devices need to be checked, otherwise returns 3633 * 0 to tell the loop in handle_stripe_fill to continue 3634 */ 3635 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3636 int disk_idx, int disks) 3637 { 3638 struct r5dev *dev = &sh->dev[disk_idx]; 3639 3640 /* is the data in this block needed, and can we get it? */ 3641 if (need_this_block(sh, s, disk_idx, disks)) { 3642 /* we would like to get this block, possibly by computing it, 3643 * otherwise read it if the backing disk is insync 3644 */ 3645 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3646 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3647 BUG_ON(sh->batch_head); 3648 if ((s->uptodate == disks - 1) && 3649 (s->failed && (disk_idx == s->failed_num[0] || 3650 disk_idx == s->failed_num[1]))) { 3651 /* have disk failed, and we're requested to fetch it; 3652 * do compute it 3653 */ 3654 pr_debug("Computing stripe %llu block %d\n", 3655 (unsigned long long)sh->sector, disk_idx); 3656 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3657 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3658 set_bit(R5_Wantcompute, &dev->flags); 3659 sh->ops.target = disk_idx; 3660 sh->ops.target2 = -1; /* no 2nd target */ 3661 s->req_compute = 1; 3662 /* Careful: from this point on 'uptodate' is in the eye 3663 * of raid_run_ops which services 'compute' operations 3664 * before writes. R5_Wantcompute flags a block that will 3665 * be R5_UPTODATE by the time it is needed for a 3666 * subsequent operation. 3667 */ 3668 s->uptodate++; 3669 return 1; 3670 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3671 /* Computing 2-failure is *very* expensive; only 3672 * do it if failed >= 2 3673 */ 3674 int other; 3675 for (other = disks; other--; ) { 3676 if (other == disk_idx) 3677 continue; 3678 if (!test_bit(R5_UPTODATE, 3679 &sh->dev[other].flags)) 3680 break; 3681 } 3682 BUG_ON(other < 0); 3683 pr_debug("Computing stripe %llu blocks %d,%d\n", 3684 (unsigned long long)sh->sector, 3685 disk_idx, other); 3686 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3687 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3688 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3689 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3690 sh->ops.target = disk_idx; 3691 sh->ops.target2 = other; 3692 s->uptodate += 2; 3693 s->req_compute = 1; 3694 return 1; 3695 } else if (test_bit(R5_Insync, &dev->flags)) { 3696 set_bit(R5_LOCKED, &dev->flags); 3697 set_bit(R5_Wantread, &dev->flags); 3698 s->locked++; 3699 pr_debug("Reading block %d (sync=%d)\n", 3700 disk_idx, s->syncing); 3701 } 3702 } 3703 3704 return 0; 3705 } 3706 3707 /** 3708 * handle_stripe_fill - read or compute data to satisfy pending requests. 3709 */ 3710 static void handle_stripe_fill(struct stripe_head *sh, 3711 struct stripe_head_state *s, 3712 int disks) 3713 { 3714 int i; 3715 3716 /* look for blocks to read/compute, skip this if a compute 3717 * is already in flight, or if the stripe contents are in the 3718 * midst of changing due to a write 3719 */ 3720 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3721 !sh->reconstruct_state) { 3722 3723 /* 3724 * For degraded stripe with data in journal, do not handle 3725 * read requests yet, instead, flush the stripe to raid 3726 * disks first, this avoids handling complex rmw of write 3727 * back cache (prexor with orig_page, and then xor with 3728 * page) in the read path 3729 */ 3730 if (s->injournal && s->failed) { 3731 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3732 r5c_make_stripe_write_out(sh); 3733 goto out; 3734 } 3735 3736 for (i = disks; i--; ) 3737 if (fetch_block(sh, s, i, disks)) 3738 break; 3739 } 3740 out: 3741 set_bit(STRIPE_HANDLE, &sh->state); 3742 } 3743 3744 static void break_stripe_batch_list(struct stripe_head *head_sh, 3745 unsigned long handle_flags); 3746 /* handle_stripe_clean_event 3747 * any written block on an uptodate or failed drive can be returned. 3748 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3749 * never LOCKED, so we don't need to test 'failed' directly. 3750 */ 3751 static void handle_stripe_clean_event(struct r5conf *conf, 3752 struct stripe_head *sh, int disks, struct bio_list *return_bi) 3753 { 3754 int i; 3755 struct r5dev *dev; 3756 int discard_pending = 0; 3757 struct stripe_head *head_sh = sh; 3758 bool do_endio = false; 3759 3760 for (i = disks; i--; ) 3761 if (sh->dev[i].written) { 3762 dev = &sh->dev[i]; 3763 if (!test_bit(R5_LOCKED, &dev->flags) && 3764 (test_bit(R5_UPTODATE, &dev->flags) || 3765 test_bit(R5_Discard, &dev->flags) || 3766 test_bit(R5_SkipCopy, &dev->flags))) { 3767 /* We can return any write requests */ 3768 struct bio *wbi, *wbi2; 3769 pr_debug("Return write for disc %d\n", i); 3770 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3771 clear_bit(R5_UPTODATE, &dev->flags); 3772 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3773 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3774 } 3775 do_endio = true; 3776 3777 returnbi: 3778 dev->page = dev->orig_page; 3779 wbi = dev->written; 3780 dev->written = NULL; 3781 while (wbi && wbi->bi_iter.bi_sector < 3782 dev->sector + STRIPE_SECTORS) { 3783 wbi2 = r5_next_bio(wbi, dev->sector); 3784 if (!raid5_dec_bi_active_stripes(wbi)) { 3785 md_write_end(conf->mddev); 3786 bio_list_add(return_bi, wbi); 3787 } 3788 wbi = wbi2; 3789 } 3790 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3791 STRIPE_SECTORS, 3792 !test_bit(STRIPE_DEGRADED, &sh->state), 3793 0); 3794 if (head_sh->batch_head) { 3795 sh = list_first_entry(&sh->batch_list, 3796 struct stripe_head, 3797 batch_list); 3798 if (sh != head_sh) { 3799 dev = &sh->dev[i]; 3800 goto returnbi; 3801 } 3802 } 3803 sh = head_sh; 3804 dev = &sh->dev[i]; 3805 } else if (test_bit(R5_Discard, &dev->flags)) 3806 discard_pending = 1; 3807 } 3808 3809 log_stripe_write_finished(sh); 3810 3811 if (!discard_pending && 3812 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3813 int hash; 3814 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3815 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3816 if (sh->qd_idx >= 0) { 3817 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3818 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3819 } 3820 /* now that discard is done we can proceed with any sync */ 3821 clear_bit(STRIPE_DISCARD, &sh->state); 3822 /* 3823 * SCSI discard will change some bio fields and the stripe has 3824 * no updated data, so remove it from hash list and the stripe 3825 * will be reinitialized 3826 */ 3827 unhash: 3828 hash = sh->hash_lock_index; 3829 spin_lock_irq(conf->hash_locks + hash); 3830 remove_hash(sh); 3831 spin_unlock_irq(conf->hash_locks + hash); 3832 if (head_sh->batch_head) { 3833 sh = list_first_entry(&sh->batch_list, 3834 struct stripe_head, batch_list); 3835 if (sh != head_sh) 3836 goto unhash; 3837 } 3838 sh = head_sh; 3839 3840 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3841 set_bit(STRIPE_HANDLE, &sh->state); 3842 3843 } 3844 3845 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3846 if (atomic_dec_and_test(&conf->pending_full_writes)) 3847 md_wakeup_thread(conf->mddev->thread); 3848 3849 if (head_sh->batch_head && do_endio) 3850 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3851 } 3852 3853 /* 3854 * For RMW in write back cache, we need extra page in prexor to store the 3855 * old data. This page is stored in dev->orig_page. 3856 * 3857 * This function checks whether we have data for prexor. The exact logic 3858 * is: 3859 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3860 */ 3861 static inline bool uptodate_for_rmw(struct r5dev *dev) 3862 { 3863 return (test_bit(R5_UPTODATE, &dev->flags)) && 3864 (!test_bit(R5_InJournal, &dev->flags) || 3865 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3866 } 3867 3868 static int handle_stripe_dirtying(struct r5conf *conf, 3869 struct stripe_head *sh, 3870 struct stripe_head_state *s, 3871 int disks) 3872 { 3873 int rmw = 0, rcw = 0, i; 3874 sector_t recovery_cp = conf->mddev->recovery_cp; 3875 3876 /* Check whether resync is now happening or should start. 3877 * If yes, then the array is dirty (after unclean shutdown or 3878 * initial creation), so parity in some stripes might be inconsistent. 3879 * In this case, we need to always do reconstruct-write, to ensure 3880 * that in case of drive failure or read-error correction, we 3881 * generate correct data from the parity. 3882 */ 3883 if (conf->rmw_level == PARITY_DISABLE_RMW || 3884 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3885 s->failed == 0)) { 3886 /* Calculate the real rcw later - for now make it 3887 * look like rcw is cheaper 3888 */ 3889 rcw = 1; rmw = 2; 3890 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3891 conf->rmw_level, (unsigned long long)recovery_cp, 3892 (unsigned long long)sh->sector); 3893 } else for (i = disks; i--; ) { 3894 /* would I have to read this buffer for read_modify_write */ 3895 struct r5dev *dev = &sh->dev[i]; 3896 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3897 i == sh->pd_idx || i == sh->qd_idx || 3898 test_bit(R5_InJournal, &dev->flags)) && 3899 !test_bit(R5_LOCKED, &dev->flags) && 3900 !(uptodate_for_rmw(dev) || 3901 test_bit(R5_Wantcompute, &dev->flags))) { 3902 if (test_bit(R5_Insync, &dev->flags)) 3903 rmw++; 3904 else 3905 rmw += 2*disks; /* cannot read it */ 3906 } 3907 /* Would I have to read this buffer for reconstruct_write */ 3908 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3909 i != sh->pd_idx && i != sh->qd_idx && 3910 !test_bit(R5_LOCKED, &dev->flags) && 3911 !(test_bit(R5_UPTODATE, &dev->flags) || 3912 test_bit(R5_Wantcompute, &dev->flags))) { 3913 if (test_bit(R5_Insync, &dev->flags)) 3914 rcw++; 3915 else 3916 rcw += 2*disks; 3917 } 3918 } 3919 3920 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3921 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3922 set_bit(STRIPE_HANDLE, &sh->state); 3923 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3924 /* prefer read-modify-write, but need to get some data */ 3925 if (conf->mddev->queue) 3926 blk_add_trace_msg(conf->mddev->queue, 3927 "raid5 rmw %llu %d", 3928 (unsigned long long)sh->sector, rmw); 3929 for (i = disks; i--; ) { 3930 struct r5dev *dev = &sh->dev[i]; 3931 if (test_bit(R5_InJournal, &dev->flags) && 3932 dev->page == dev->orig_page && 3933 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3934 /* alloc page for prexor */ 3935 struct page *p = alloc_page(GFP_NOIO); 3936 3937 if (p) { 3938 dev->orig_page = p; 3939 continue; 3940 } 3941 3942 /* 3943 * alloc_page() failed, try use 3944 * disk_info->extra_page 3945 */ 3946 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3947 &conf->cache_state)) { 3948 r5c_use_extra_page(sh); 3949 break; 3950 } 3951 3952 /* extra_page in use, add to delayed_list */ 3953 set_bit(STRIPE_DELAYED, &sh->state); 3954 s->waiting_extra_page = 1; 3955 return -EAGAIN; 3956 } 3957 } 3958 3959 for (i = disks; i--; ) { 3960 struct r5dev *dev = &sh->dev[i]; 3961 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3962 i == sh->pd_idx || i == sh->qd_idx || 3963 test_bit(R5_InJournal, &dev->flags)) && 3964 !test_bit(R5_LOCKED, &dev->flags) && 3965 !(uptodate_for_rmw(dev) || 3966 test_bit(R5_Wantcompute, &dev->flags)) && 3967 test_bit(R5_Insync, &dev->flags)) { 3968 if (test_bit(STRIPE_PREREAD_ACTIVE, 3969 &sh->state)) { 3970 pr_debug("Read_old block %d for r-m-w\n", 3971 i); 3972 set_bit(R5_LOCKED, &dev->flags); 3973 set_bit(R5_Wantread, &dev->flags); 3974 s->locked++; 3975 } else { 3976 set_bit(STRIPE_DELAYED, &sh->state); 3977 set_bit(STRIPE_HANDLE, &sh->state); 3978 } 3979 } 3980 } 3981 } 3982 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3983 /* want reconstruct write, but need to get some data */ 3984 int qread =0; 3985 rcw = 0; 3986 for (i = disks; i--; ) { 3987 struct r5dev *dev = &sh->dev[i]; 3988 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3989 i != sh->pd_idx && i != sh->qd_idx && 3990 !test_bit(R5_LOCKED, &dev->flags) && 3991 !(test_bit(R5_UPTODATE, &dev->flags) || 3992 test_bit(R5_Wantcompute, &dev->flags))) { 3993 rcw++; 3994 if (test_bit(R5_Insync, &dev->flags) && 3995 test_bit(STRIPE_PREREAD_ACTIVE, 3996 &sh->state)) { 3997 pr_debug("Read_old block " 3998 "%d for Reconstruct\n", i); 3999 set_bit(R5_LOCKED, &dev->flags); 4000 set_bit(R5_Wantread, &dev->flags); 4001 s->locked++; 4002 qread++; 4003 } else { 4004 set_bit(STRIPE_DELAYED, &sh->state); 4005 set_bit(STRIPE_HANDLE, &sh->state); 4006 } 4007 } 4008 } 4009 if (rcw && conf->mddev->queue) 4010 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 4011 (unsigned long long)sh->sector, 4012 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 4013 } 4014 4015 if (rcw > disks && rmw > disks && 4016 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4017 set_bit(STRIPE_DELAYED, &sh->state); 4018 4019 /* now if nothing is locked, and if we have enough data, 4020 * we can start a write request 4021 */ 4022 /* since handle_stripe can be called at any time we need to handle the 4023 * case where a compute block operation has been submitted and then a 4024 * subsequent call wants to start a write request. raid_run_ops only 4025 * handles the case where compute block and reconstruct are requested 4026 * simultaneously. If this is not the case then new writes need to be 4027 * held off until the compute completes. 4028 */ 4029 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4030 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4031 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4032 schedule_reconstruction(sh, s, rcw == 0, 0); 4033 return 0; 4034 } 4035 4036 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4037 struct stripe_head_state *s, int disks) 4038 { 4039 struct r5dev *dev = NULL; 4040 4041 BUG_ON(sh->batch_head); 4042 set_bit(STRIPE_HANDLE, &sh->state); 4043 4044 switch (sh->check_state) { 4045 case check_state_idle: 4046 /* start a new check operation if there are no failures */ 4047 if (s->failed == 0) { 4048 BUG_ON(s->uptodate != disks); 4049 sh->check_state = check_state_run; 4050 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4051 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4052 s->uptodate--; 4053 break; 4054 } 4055 dev = &sh->dev[s->failed_num[0]]; 4056 /* fall through */ 4057 case check_state_compute_result: 4058 sh->check_state = check_state_idle; 4059 if (!dev) 4060 dev = &sh->dev[sh->pd_idx]; 4061 4062 /* check that a write has not made the stripe insync */ 4063 if (test_bit(STRIPE_INSYNC, &sh->state)) 4064 break; 4065 4066 /* either failed parity check, or recovery is happening */ 4067 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4068 BUG_ON(s->uptodate != disks); 4069 4070 set_bit(R5_LOCKED, &dev->flags); 4071 s->locked++; 4072 set_bit(R5_Wantwrite, &dev->flags); 4073 4074 clear_bit(STRIPE_DEGRADED, &sh->state); 4075 set_bit(STRIPE_INSYNC, &sh->state); 4076 break; 4077 case check_state_run: 4078 break; /* we will be called again upon completion */ 4079 case check_state_check_result: 4080 sh->check_state = check_state_idle; 4081 4082 /* if a failure occurred during the check operation, leave 4083 * STRIPE_INSYNC not set and let the stripe be handled again 4084 */ 4085 if (s->failed) 4086 break; 4087 4088 /* handle a successful check operation, if parity is correct 4089 * we are done. Otherwise update the mismatch count and repair 4090 * parity if !MD_RECOVERY_CHECK 4091 */ 4092 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4093 /* parity is correct (on disc, 4094 * not in buffer any more) 4095 */ 4096 set_bit(STRIPE_INSYNC, &sh->state); 4097 else { 4098 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4099 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4100 /* don't try to repair!! */ 4101 set_bit(STRIPE_INSYNC, &sh->state); 4102 else { 4103 sh->check_state = check_state_compute_run; 4104 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4105 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4106 set_bit(R5_Wantcompute, 4107 &sh->dev[sh->pd_idx].flags); 4108 sh->ops.target = sh->pd_idx; 4109 sh->ops.target2 = -1; 4110 s->uptodate++; 4111 } 4112 } 4113 break; 4114 case check_state_compute_run: 4115 break; 4116 default: 4117 pr_err("%s: unknown check_state: %d sector: %llu\n", 4118 __func__, sh->check_state, 4119 (unsigned long long) sh->sector); 4120 BUG(); 4121 } 4122 } 4123 4124 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4125 struct stripe_head_state *s, 4126 int disks) 4127 { 4128 int pd_idx = sh->pd_idx; 4129 int qd_idx = sh->qd_idx; 4130 struct r5dev *dev; 4131 4132 BUG_ON(sh->batch_head); 4133 set_bit(STRIPE_HANDLE, &sh->state); 4134 4135 BUG_ON(s->failed > 2); 4136 4137 /* Want to check and possibly repair P and Q. 4138 * However there could be one 'failed' device, in which 4139 * case we can only check one of them, possibly using the 4140 * other to generate missing data 4141 */ 4142 4143 switch (sh->check_state) { 4144 case check_state_idle: 4145 /* start a new check operation if there are < 2 failures */ 4146 if (s->failed == s->q_failed) { 4147 /* The only possible failed device holds Q, so it 4148 * makes sense to check P (If anything else were failed, 4149 * we would have used P to recreate it). 4150 */ 4151 sh->check_state = check_state_run; 4152 } 4153 if (!s->q_failed && s->failed < 2) { 4154 /* Q is not failed, and we didn't use it to generate 4155 * anything, so it makes sense to check it 4156 */ 4157 if (sh->check_state == check_state_run) 4158 sh->check_state = check_state_run_pq; 4159 else 4160 sh->check_state = check_state_run_q; 4161 } 4162 4163 /* discard potentially stale zero_sum_result */ 4164 sh->ops.zero_sum_result = 0; 4165 4166 if (sh->check_state == check_state_run) { 4167 /* async_xor_zero_sum destroys the contents of P */ 4168 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4169 s->uptodate--; 4170 } 4171 if (sh->check_state >= check_state_run && 4172 sh->check_state <= check_state_run_pq) { 4173 /* async_syndrome_zero_sum preserves P and Q, so 4174 * no need to mark them !uptodate here 4175 */ 4176 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4177 break; 4178 } 4179 4180 /* we have 2-disk failure */ 4181 BUG_ON(s->failed != 2); 4182 /* fall through */ 4183 case check_state_compute_result: 4184 sh->check_state = check_state_idle; 4185 4186 /* check that a write has not made the stripe insync */ 4187 if (test_bit(STRIPE_INSYNC, &sh->state)) 4188 break; 4189 4190 /* now write out any block on a failed drive, 4191 * or P or Q if they were recomputed 4192 */ 4193 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4194 if (s->failed == 2) { 4195 dev = &sh->dev[s->failed_num[1]]; 4196 s->locked++; 4197 set_bit(R5_LOCKED, &dev->flags); 4198 set_bit(R5_Wantwrite, &dev->flags); 4199 } 4200 if (s->failed >= 1) { 4201 dev = &sh->dev[s->failed_num[0]]; 4202 s->locked++; 4203 set_bit(R5_LOCKED, &dev->flags); 4204 set_bit(R5_Wantwrite, &dev->flags); 4205 } 4206 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4207 dev = &sh->dev[pd_idx]; 4208 s->locked++; 4209 set_bit(R5_LOCKED, &dev->flags); 4210 set_bit(R5_Wantwrite, &dev->flags); 4211 } 4212 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4213 dev = &sh->dev[qd_idx]; 4214 s->locked++; 4215 set_bit(R5_LOCKED, &dev->flags); 4216 set_bit(R5_Wantwrite, &dev->flags); 4217 } 4218 clear_bit(STRIPE_DEGRADED, &sh->state); 4219 4220 set_bit(STRIPE_INSYNC, &sh->state); 4221 break; 4222 case check_state_run: 4223 case check_state_run_q: 4224 case check_state_run_pq: 4225 break; /* we will be called again upon completion */ 4226 case check_state_check_result: 4227 sh->check_state = check_state_idle; 4228 4229 /* handle a successful check operation, if parity is correct 4230 * we are done. Otherwise update the mismatch count and repair 4231 * parity if !MD_RECOVERY_CHECK 4232 */ 4233 if (sh->ops.zero_sum_result == 0) { 4234 /* both parities are correct */ 4235 if (!s->failed) 4236 set_bit(STRIPE_INSYNC, &sh->state); 4237 else { 4238 /* in contrast to the raid5 case we can validate 4239 * parity, but still have a failure to write 4240 * back 4241 */ 4242 sh->check_state = check_state_compute_result; 4243 /* Returning at this point means that we may go 4244 * off and bring p and/or q uptodate again so 4245 * we make sure to check zero_sum_result again 4246 * to verify if p or q need writeback 4247 */ 4248 } 4249 } else { 4250 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4251 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4252 /* don't try to repair!! */ 4253 set_bit(STRIPE_INSYNC, &sh->state); 4254 else { 4255 int *target = &sh->ops.target; 4256 4257 sh->ops.target = -1; 4258 sh->ops.target2 = -1; 4259 sh->check_state = check_state_compute_run; 4260 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4261 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4262 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4263 set_bit(R5_Wantcompute, 4264 &sh->dev[pd_idx].flags); 4265 *target = pd_idx; 4266 target = &sh->ops.target2; 4267 s->uptodate++; 4268 } 4269 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4270 set_bit(R5_Wantcompute, 4271 &sh->dev[qd_idx].flags); 4272 *target = qd_idx; 4273 s->uptodate++; 4274 } 4275 } 4276 } 4277 break; 4278 case check_state_compute_run: 4279 break; 4280 default: 4281 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4282 __func__, sh->check_state, 4283 (unsigned long long) sh->sector); 4284 BUG(); 4285 } 4286 } 4287 4288 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4289 { 4290 int i; 4291 4292 /* We have read all the blocks in this stripe and now we need to 4293 * copy some of them into a target stripe for expand. 4294 */ 4295 struct dma_async_tx_descriptor *tx = NULL; 4296 BUG_ON(sh->batch_head); 4297 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4298 for (i = 0; i < sh->disks; i++) 4299 if (i != sh->pd_idx && i != sh->qd_idx) { 4300 int dd_idx, j; 4301 struct stripe_head *sh2; 4302 struct async_submit_ctl submit; 4303 4304 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4305 sector_t s = raid5_compute_sector(conf, bn, 0, 4306 &dd_idx, NULL); 4307 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4308 if (sh2 == NULL) 4309 /* so far only the early blocks of this stripe 4310 * have been requested. When later blocks 4311 * get requested, we will try again 4312 */ 4313 continue; 4314 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4315 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4316 /* must have already done this block */ 4317 raid5_release_stripe(sh2); 4318 continue; 4319 } 4320 4321 /* place all the copies on one channel */ 4322 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4323 tx = async_memcpy(sh2->dev[dd_idx].page, 4324 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4325 &submit); 4326 4327 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4328 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4329 for (j = 0; j < conf->raid_disks; j++) 4330 if (j != sh2->pd_idx && 4331 j != sh2->qd_idx && 4332 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4333 break; 4334 if (j == conf->raid_disks) { 4335 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4336 set_bit(STRIPE_HANDLE, &sh2->state); 4337 } 4338 raid5_release_stripe(sh2); 4339 4340 } 4341 /* done submitting copies, wait for them to complete */ 4342 async_tx_quiesce(&tx); 4343 } 4344 4345 /* 4346 * handle_stripe - do things to a stripe. 4347 * 4348 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4349 * state of various bits to see what needs to be done. 4350 * Possible results: 4351 * return some read requests which now have data 4352 * return some write requests which are safely on storage 4353 * schedule a read on some buffers 4354 * schedule a write of some buffers 4355 * return confirmation of parity correctness 4356 * 4357 */ 4358 4359 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4360 { 4361 struct r5conf *conf = sh->raid_conf; 4362 int disks = sh->disks; 4363 struct r5dev *dev; 4364 int i; 4365 int do_recovery = 0; 4366 4367 memset(s, 0, sizeof(*s)); 4368 4369 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4370 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4371 s->failed_num[0] = -1; 4372 s->failed_num[1] = -1; 4373 s->log_failed = r5l_log_disk_error(conf); 4374 4375 /* Now to look around and see what can be done */ 4376 rcu_read_lock(); 4377 for (i=disks; i--; ) { 4378 struct md_rdev *rdev; 4379 sector_t first_bad; 4380 int bad_sectors; 4381 int is_bad = 0; 4382 4383 dev = &sh->dev[i]; 4384 4385 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4386 i, dev->flags, 4387 dev->toread, dev->towrite, dev->written); 4388 /* maybe we can reply to a read 4389 * 4390 * new wantfill requests are only permitted while 4391 * ops_complete_biofill is guaranteed to be inactive 4392 */ 4393 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4394 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4395 set_bit(R5_Wantfill, &dev->flags); 4396 4397 /* now count some things */ 4398 if (test_bit(R5_LOCKED, &dev->flags)) 4399 s->locked++; 4400 if (test_bit(R5_UPTODATE, &dev->flags)) 4401 s->uptodate++; 4402 if (test_bit(R5_Wantcompute, &dev->flags)) { 4403 s->compute++; 4404 BUG_ON(s->compute > 2); 4405 } 4406 4407 if (test_bit(R5_Wantfill, &dev->flags)) 4408 s->to_fill++; 4409 else if (dev->toread) 4410 s->to_read++; 4411 if (dev->towrite) { 4412 s->to_write++; 4413 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4414 s->non_overwrite++; 4415 } 4416 if (dev->written) 4417 s->written++; 4418 /* Prefer to use the replacement for reads, but only 4419 * if it is recovered enough and has no bad blocks. 4420 */ 4421 rdev = rcu_dereference(conf->disks[i].replacement); 4422 if (rdev && !test_bit(Faulty, &rdev->flags) && 4423 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4424 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4425 &first_bad, &bad_sectors)) 4426 set_bit(R5_ReadRepl, &dev->flags); 4427 else { 4428 if (rdev && !test_bit(Faulty, &rdev->flags)) 4429 set_bit(R5_NeedReplace, &dev->flags); 4430 else 4431 clear_bit(R5_NeedReplace, &dev->flags); 4432 rdev = rcu_dereference(conf->disks[i].rdev); 4433 clear_bit(R5_ReadRepl, &dev->flags); 4434 } 4435 if (rdev && test_bit(Faulty, &rdev->flags)) 4436 rdev = NULL; 4437 if (rdev) { 4438 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4439 &first_bad, &bad_sectors); 4440 if (s->blocked_rdev == NULL 4441 && (test_bit(Blocked, &rdev->flags) 4442 || is_bad < 0)) { 4443 if (is_bad < 0) 4444 set_bit(BlockedBadBlocks, 4445 &rdev->flags); 4446 s->blocked_rdev = rdev; 4447 atomic_inc(&rdev->nr_pending); 4448 } 4449 } 4450 clear_bit(R5_Insync, &dev->flags); 4451 if (!rdev) 4452 /* Not in-sync */; 4453 else if (is_bad) { 4454 /* also not in-sync */ 4455 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4456 test_bit(R5_UPTODATE, &dev->flags)) { 4457 /* treat as in-sync, but with a read error 4458 * which we can now try to correct 4459 */ 4460 set_bit(R5_Insync, &dev->flags); 4461 set_bit(R5_ReadError, &dev->flags); 4462 } 4463 } else if (test_bit(In_sync, &rdev->flags)) 4464 set_bit(R5_Insync, &dev->flags); 4465 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4466 /* in sync if before recovery_offset */ 4467 set_bit(R5_Insync, &dev->flags); 4468 else if (test_bit(R5_UPTODATE, &dev->flags) && 4469 test_bit(R5_Expanded, &dev->flags)) 4470 /* If we've reshaped into here, we assume it is Insync. 4471 * We will shortly update recovery_offset to make 4472 * it official. 4473 */ 4474 set_bit(R5_Insync, &dev->flags); 4475 4476 if (test_bit(R5_WriteError, &dev->flags)) { 4477 /* This flag does not apply to '.replacement' 4478 * only to .rdev, so make sure to check that*/ 4479 struct md_rdev *rdev2 = rcu_dereference( 4480 conf->disks[i].rdev); 4481 if (rdev2 == rdev) 4482 clear_bit(R5_Insync, &dev->flags); 4483 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4484 s->handle_bad_blocks = 1; 4485 atomic_inc(&rdev2->nr_pending); 4486 } else 4487 clear_bit(R5_WriteError, &dev->flags); 4488 } 4489 if (test_bit(R5_MadeGood, &dev->flags)) { 4490 /* This flag does not apply to '.replacement' 4491 * only to .rdev, so make sure to check that*/ 4492 struct md_rdev *rdev2 = rcu_dereference( 4493 conf->disks[i].rdev); 4494 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4495 s->handle_bad_blocks = 1; 4496 atomic_inc(&rdev2->nr_pending); 4497 } else 4498 clear_bit(R5_MadeGood, &dev->flags); 4499 } 4500 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4501 struct md_rdev *rdev2 = rcu_dereference( 4502 conf->disks[i].replacement); 4503 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4504 s->handle_bad_blocks = 1; 4505 atomic_inc(&rdev2->nr_pending); 4506 } else 4507 clear_bit(R5_MadeGoodRepl, &dev->flags); 4508 } 4509 if (!test_bit(R5_Insync, &dev->flags)) { 4510 /* The ReadError flag will just be confusing now */ 4511 clear_bit(R5_ReadError, &dev->flags); 4512 clear_bit(R5_ReWrite, &dev->flags); 4513 } 4514 if (test_bit(R5_ReadError, &dev->flags)) 4515 clear_bit(R5_Insync, &dev->flags); 4516 if (!test_bit(R5_Insync, &dev->flags)) { 4517 if (s->failed < 2) 4518 s->failed_num[s->failed] = i; 4519 s->failed++; 4520 if (rdev && !test_bit(Faulty, &rdev->flags)) 4521 do_recovery = 1; 4522 } 4523 4524 if (test_bit(R5_InJournal, &dev->flags)) 4525 s->injournal++; 4526 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4527 s->just_cached++; 4528 } 4529 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4530 /* If there is a failed device being replaced, 4531 * we must be recovering. 4532 * else if we are after recovery_cp, we must be syncing 4533 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4534 * else we can only be replacing 4535 * sync and recovery both need to read all devices, and so 4536 * use the same flag. 4537 */ 4538 if (do_recovery || 4539 sh->sector >= conf->mddev->recovery_cp || 4540 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4541 s->syncing = 1; 4542 else 4543 s->replacing = 1; 4544 } 4545 rcu_read_unlock(); 4546 } 4547 4548 static int clear_batch_ready(struct stripe_head *sh) 4549 { 4550 /* Return '1' if this is a member of batch, or 4551 * '0' if it is a lone stripe or a head which can now be 4552 * handled. 4553 */ 4554 struct stripe_head *tmp; 4555 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4556 return (sh->batch_head && sh->batch_head != sh); 4557 spin_lock(&sh->stripe_lock); 4558 if (!sh->batch_head) { 4559 spin_unlock(&sh->stripe_lock); 4560 return 0; 4561 } 4562 4563 /* 4564 * this stripe could be added to a batch list before we check 4565 * BATCH_READY, skips it 4566 */ 4567 if (sh->batch_head != sh) { 4568 spin_unlock(&sh->stripe_lock); 4569 return 1; 4570 } 4571 spin_lock(&sh->batch_lock); 4572 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4573 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4574 spin_unlock(&sh->batch_lock); 4575 spin_unlock(&sh->stripe_lock); 4576 4577 /* 4578 * BATCH_READY is cleared, no new stripes can be added. 4579 * batch_list can be accessed without lock 4580 */ 4581 return 0; 4582 } 4583 4584 static void break_stripe_batch_list(struct stripe_head *head_sh, 4585 unsigned long handle_flags) 4586 { 4587 struct stripe_head *sh, *next; 4588 int i; 4589 int do_wakeup = 0; 4590 4591 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4592 4593 list_del_init(&sh->batch_list); 4594 4595 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4596 (1 << STRIPE_SYNCING) | 4597 (1 << STRIPE_REPLACED) | 4598 (1 << STRIPE_DELAYED) | 4599 (1 << STRIPE_BIT_DELAY) | 4600 (1 << STRIPE_FULL_WRITE) | 4601 (1 << STRIPE_BIOFILL_RUN) | 4602 (1 << STRIPE_COMPUTE_RUN) | 4603 (1 << STRIPE_OPS_REQ_PENDING) | 4604 (1 << STRIPE_DISCARD) | 4605 (1 << STRIPE_BATCH_READY) | 4606 (1 << STRIPE_BATCH_ERR) | 4607 (1 << STRIPE_BITMAP_PENDING)), 4608 "stripe state: %lx\n", sh->state); 4609 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4610 (1 << STRIPE_REPLACED)), 4611 "head stripe state: %lx\n", head_sh->state); 4612 4613 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4614 (1 << STRIPE_PREREAD_ACTIVE) | 4615 (1 << STRIPE_DEGRADED)), 4616 head_sh->state & (1 << STRIPE_INSYNC)); 4617 4618 sh->check_state = head_sh->check_state; 4619 sh->reconstruct_state = head_sh->reconstruct_state; 4620 for (i = 0; i < sh->disks; i++) { 4621 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4622 do_wakeup = 1; 4623 sh->dev[i].flags = head_sh->dev[i].flags & 4624 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4625 } 4626 spin_lock_irq(&sh->stripe_lock); 4627 sh->batch_head = NULL; 4628 spin_unlock_irq(&sh->stripe_lock); 4629 if (handle_flags == 0 || 4630 sh->state & handle_flags) 4631 set_bit(STRIPE_HANDLE, &sh->state); 4632 raid5_release_stripe(sh); 4633 } 4634 spin_lock_irq(&head_sh->stripe_lock); 4635 head_sh->batch_head = NULL; 4636 spin_unlock_irq(&head_sh->stripe_lock); 4637 for (i = 0; i < head_sh->disks; i++) 4638 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4639 do_wakeup = 1; 4640 if (head_sh->state & handle_flags) 4641 set_bit(STRIPE_HANDLE, &head_sh->state); 4642 4643 if (do_wakeup) 4644 wake_up(&head_sh->raid_conf->wait_for_overlap); 4645 } 4646 4647 static void handle_stripe(struct stripe_head *sh) 4648 { 4649 struct stripe_head_state s; 4650 struct r5conf *conf = sh->raid_conf; 4651 int i; 4652 int prexor; 4653 int disks = sh->disks; 4654 struct r5dev *pdev, *qdev; 4655 4656 clear_bit(STRIPE_HANDLE, &sh->state); 4657 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4658 /* already being handled, ensure it gets handled 4659 * again when current action finishes */ 4660 set_bit(STRIPE_HANDLE, &sh->state); 4661 return; 4662 } 4663 4664 if (clear_batch_ready(sh) ) { 4665 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4666 return; 4667 } 4668 4669 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4670 break_stripe_batch_list(sh, 0); 4671 4672 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4673 spin_lock(&sh->stripe_lock); 4674 /* Cannot process 'sync' concurrently with 'discard' */ 4675 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4676 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4677 set_bit(STRIPE_SYNCING, &sh->state); 4678 clear_bit(STRIPE_INSYNC, &sh->state); 4679 clear_bit(STRIPE_REPLACED, &sh->state); 4680 } 4681 spin_unlock(&sh->stripe_lock); 4682 } 4683 clear_bit(STRIPE_DELAYED, &sh->state); 4684 4685 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4686 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 4687 (unsigned long long)sh->sector, sh->state, 4688 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4689 sh->check_state, sh->reconstruct_state); 4690 4691 analyse_stripe(sh, &s); 4692 4693 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4694 goto finish; 4695 4696 if (s.handle_bad_blocks) { 4697 set_bit(STRIPE_HANDLE, &sh->state); 4698 goto finish; 4699 } 4700 4701 if (unlikely(s.blocked_rdev)) { 4702 if (s.syncing || s.expanding || s.expanded || 4703 s.replacing || s.to_write || s.written) { 4704 set_bit(STRIPE_HANDLE, &sh->state); 4705 goto finish; 4706 } 4707 /* There is nothing for the blocked_rdev to block */ 4708 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4709 s.blocked_rdev = NULL; 4710 } 4711 4712 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4713 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4714 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4715 } 4716 4717 pr_debug("locked=%d uptodate=%d to_read=%d" 4718 " to_write=%d failed=%d failed_num=%d,%d\n", 4719 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4720 s.failed_num[0], s.failed_num[1]); 4721 /* check if the array has lost more than max_degraded devices and, 4722 * if so, some requests might need to be failed. 4723 */ 4724 if (s.failed > conf->max_degraded || s.log_failed) { 4725 sh->check_state = 0; 4726 sh->reconstruct_state = 0; 4727 break_stripe_batch_list(sh, 0); 4728 if (s.to_read+s.to_write+s.written) 4729 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 4730 if (s.syncing + s.replacing) 4731 handle_failed_sync(conf, sh, &s); 4732 } 4733 4734 /* Now we check to see if any write operations have recently 4735 * completed 4736 */ 4737 prexor = 0; 4738 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4739 prexor = 1; 4740 if (sh->reconstruct_state == reconstruct_state_drain_result || 4741 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4742 sh->reconstruct_state = reconstruct_state_idle; 4743 4744 /* All the 'written' buffers and the parity block are ready to 4745 * be written back to disk 4746 */ 4747 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4748 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4749 BUG_ON(sh->qd_idx >= 0 && 4750 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4751 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4752 for (i = disks; i--; ) { 4753 struct r5dev *dev = &sh->dev[i]; 4754 if (test_bit(R5_LOCKED, &dev->flags) && 4755 (i == sh->pd_idx || i == sh->qd_idx || 4756 dev->written || test_bit(R5_InJournal, 4757 &dev->flags))) { 4758 pr_debug("Writing block %d\n", i); 4759 set_bit(R5_Wantwrite, &dev->flags); 4760 if (prexor) 4761 continue; 4762 if (s.failed > 1) 4763 continue; 4764 if (!test_bit(R5_Insync, &dev->flags) || 4765 ((i == sh->pd_idx || i == sh->qd_idx) && 4766 s.failed == 0)) 4767 set_bit(STRIPE_INSYNC, &sh->state); 4768 } 4769 } 4770 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4771 s.dec_preread_active = 1; 4772 } 4773 4774 /* 4775 * might be able to return some write requests if the parity blocks 4776 * are safe, or on a failed drive 4777 */ 4778 pdev = &sh->dev[sh->pd_idx]; 4779 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4780 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4781 qdev = &sh->dev[sh->qd_idx]; 4782 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4783 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4784 || conf->level < 6; 4785 4786 if (s.written && 4787 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4788 && !test_bit(R5_LOCKED, &pdev->flags) 4789 && (test_bit(R5_UPTODATE, &pdev->flags) || 4790 test_bit(R5_Discard, &pdev->flags))))) && 4791 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4792 && !test_bit(R5_LOCKED, &qdev->flags) 4793 && (test_bit(R5_UPTODATE, &qdev->flags) || 4794 test_bit(R5_Discard, &qdev->flags)))))) 4795 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 4796 4797 if (s.just_cached) 4798 r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); 4799 log_stripe_write_finished(sh); 4800 4801 /* Now we might consider reading some blocks, either to check/generate 4802 * parity, or to satisfy requests 4803 * or to load a block that is being partially written. 4804 */ 4805 if (s.to_read || s.non_overwrite 4806 || (conf->level == 6 && s.to_write && s.failed) 4807 || (s.syncing && (s.uptodate + s.compute < disks)) 4808 || s.replacing 4809 || s.expanding) 4810 handle_stripe_fill(sh, &s, disks); 4811 4812 /* 4813 * When the stripe finishes full journal write cycle (write to journal 4814 * and raid disk), this is the clean up procedure so it is ready for 4815 * next operation. 4816 */ 4817 r5c_finish_stripe_write_out(conf, sh, &s); 4818 4819 /* 4820 * Now to consider new write requests, cache write back and what else, 4821 * if anything should be read. We do not handle new writes when: 4822 * 1/ A 'write' operation (copy+xor) is already in flight. 4823 * 2/ A 'check' operation is in flight, as it may clobber the parity 4824 * block. 4825 * 3/ A r5c cache log write is in flight. 4826 */ 4827 4828 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4829 if (!r5c_is_writeback(conf->log)) { 4830 if (s.to_write) 4831 handle_stripe_dirtying(conf, sh, &s, disks); 4832 } else { /* write back cache */ 4833 int ret = 0; 4834 4835 /* First, try handle writes in caching phase */ 4836 if (s.to_write) 4837 ret = r5c_try_caching_write(conf, sh, &s, 4838 disks); 4839 /* 4840 * If caching phase failed: ret == -EAGAIN 4841 * OR 4842 * stripe under reclaim: !caching && injournal 4843 * 4844 * fall back to handle_stripe_dirtying() 4845 */ 4846 if (ret == -EAGAIN || 4847 /* stripe under reclaim: !caching && injournal */ 4848 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4849 s.injournal > 0)) { 4850 ret = handle_stripe_dirtying(conf, sh, &s, 4851 disks); 4852 if (ret == -EAGAIN) 4853 goto finish; 4854 } 4855 } 4856 } 4857 4858 /* maybe we need to check and possibly fix the parity for this stripe 4859 * Any reads will already have been scheduled, so we just see if enough 4860 * data is available. The parity check is held off while parity 4861 * dependent operations are in flight. 4862 */ 4863 if (sh->check_state || 4864 (s.syncing && s.locked == 0 && 4865 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4866 !test_bit(STRIPE_INSYNC, &sh->state))) { 4867 if (conf->level == 6) 4868 handle_parity_checks6(conf, sh, &s, disks); 4869 else 4870 handle_parity_checks5(conf, sh, &s, disks); 4871 } 4872 4873 if ((s.replacing || s.syncing) && s.locked == 0 4874 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4875 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4876 /* Write out to replacement devices where possible */ 4877 for (i = 0; i < conf->raid_disks; i++) 4878 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4879 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4880 set_bit(R5_WantReplace, &sh->dev[i].flags); 4881 set_bit(R5_LOCKED, &sh->dev[i].flags); 4882 s.locked++; 4883 } 4884 if (s.replacing) 4885 set_bit(STRIPE_INSYNC, &sh->state); 4886 set_bit(STRIPE_REPLACED, &sh->state); 4887 } 4888 if ((s.syncing || s.replacing) && s.locked == 0 && 4889 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4890 test_bit(STRIPE_INSYNC, &sh->state)) { 4891 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4892 clear_bit(STRIPE_SYNCING, &sh->state); 4893 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4894 wake_up(&conf->wait_for_overlap); 4895 } 4896 4897 /* If the failed drives are just a ReadError, then we might need 4898 * to progress the repair/check process 4899 */ 4900 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4901 for (i = 0; i < s.failed; i++) { 4902 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4903 if (test_bit(R5_ReadError, &dev->flags) 4904 && !test_bit(R5_LOCKED, &dev->flags) 4905 && test_bit(R5_UPTODATE, &dev->flags) 4906 ) { 4907 if (!test_bit(R5_ReWrite, &dev->flags)) { 4908 set_bit(R5_Wantwrite, &dev->flags); 4909 set_bit(R5_ReWrite, &dev->flags); 4910 set_bit(R5_LOCKED, &dev->flags); 4911 s.locked++; 4912 } else { 4913 /* let's read it back */ 4914 set_bit(R5_Wantread, &dev->flags); 4915 set_bit(R5_LOCKED, &dev->flags); 4916 s.locked++; 4917 } 4918 } 4919 } 4920 4921 /* Finish reconstruct operations initiated by the expansion process */ 4922 if (sh->reconstruct_state == reconstruct_state_result) { 4923 struct stripe_head *sh_src 4924 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4925 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4926 /* sh cannot be written until sh_src has been read. 4927 * so arrange for sh to be delayed a little 4928 */ 4929 set_bit(STRIPE_DELAYED, &sh->state); 4930 set_bit(STRIPE_HANDLE, &sh->state); 4931 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4932 &sh_src->state)) 4933 atomic_inc(&conf->preread_active_stripes); 4934 raid5_release_stripe(sh_src); 4935 goto finish; 4936 } 4937 if (sh_src) 4938 raid5_release_stripe(sh_src); 4939 4940 sh->reconstruct_state = reconstruct_state_idle; 4941 clear_bit(STRIPE_EXPANDING, &sh->state); 4942 for (i = conf->raid_disks; i--; ) { 4943 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4944 set_bit(R5_LOCKED, &sh->dev[i].flags); 4945 s.locked++; 4946 } 4947 } 4948 4949 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4950 !sh->reconstruct_state) { 4951 /* Need to write out all blocks after computing parity */ 4952 sh->disks = conf->raid_disks; 4953 stripe_set_idx(sh->sector, conf, 0, sh); 4954 schedule_reconstruction(sh, &s, 1, 1); 4955 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4956 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4957 atomic_dec(&conf->reshape_stripes); 4958 wake_up(&conf->wait_for_overlap); 4959 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4960 } 4961 4962 if (s.expanding && s.locked == 0 && 4963 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4964 handle_stripe_expansion(conf, sh); 4965 4966 finish: 4967 /* wait for this device to become unblocked */ 4968 if (unlikely(s.blocked_rdev)) { 4969 if (conf->mddev->external) 4970 md_wait_for_blocked_rdev(s.blocked_rdev, 4971 conf->mddev); 4972 else 4973 /* Internal metadata will immediately 4974 * be written by raid5d, so we don't 4975 * need to wait here. 4976 */ 4977 rdev_dec_pending(s.blocked_rdev, 4978 conf->mddev); 4979 } 4980 4981 if (s.handle_bad_blocks) 4982 for (i = disks; i--; ) { 4983 struct md_rdev *rdev; 4984 struct r5dev *dev = &sh->dev[i]; 4985 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4986 /* We own a safe reference to the rdev */ 4987 rdev = conf->disks[i].rdev; 4988 if (!rdev_set_badblocks(rdev, sh->sector, 4989 STRIPE_SECTORS, 0)) 4990 md_error(conf->mddev, rdev); 4991 rdev_dec_pending(rdev, conf->mddev); 4992 } 4993 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4994 rdev = conf->disks[i].rdev; 4995 rdev_clear_badblocks(rdev, sh->sector, 4996 STRIPE_SECTORS, 0); 4997 rdev_dec_pending(rdev, conf->mddev); 4998 } 4999 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 5000 rdev = conf->disks[i].replacement; 5001 if (!rdev) 5002 /* rdev have been moved down */ 5003 rdev = conf->disks[i].rdev; 5004 rdev_clear_badblocks(rdev, sh->sector, 5005 STRIPE_SECTORS, 0); 5006 rdev_dec_pending(rdev, conf->mddev); 5007 } 5008 } 5009 5010 if (s.ops_request) 5011 raid_run_ops(sh, s.ops_request); 5012 5013 ops_run_io(sh, &s); 5014 5015 if (s.dec_preread_active) { 5016 /* We delay this until after ops_run_io so that if make_request 5017 * is waiting on a flush, it won't continue until the writes 5018 * have actually been submitted. 5019 */ 5020 atomic_dec(&conf->preread_active_stripes); 5021 if (atomic_read(&conf->preread_active_stripes) < 5022 IO_THRESHOLD) 5023 md_wakeup_thread(conf->mddev->thread); 5024 } 5025 5026 if (!bio_list_empty(&s.return_bi)) { 5027 if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 5028 spin_lock_irq(&conf->device_lock); 5029 bio_list_merge(&conf->return_bi, &s.return_bi); 5030 spin_unlock_irq(&conf->device_lock); 5031 md_wakeup_thread(conf->mddev->thread); 5032 } else 5033 return_io(&s.return_bi); 5034 } 5035 5036 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5037 } 5038 5039 static void raid5_activate_delayed(struct r5conf *conf) 5040 { 5041 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5042 while (!list_empty(&conf->delayed_list)) { 5043 struct list_head *l = conf->delayed_list.next; 5044 struct stripe_head *sh; 5045 sh = list_entry(l, struct stripe_head, lru); 5046 list_del_init(l); 5047 clear_bit(STRIPE_DELAYED, &sh->state); 5048 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5049 atomic_inc(&conf->preread_active_stripes); 5050 list_add_tail(&sh->lru, &conf->hold_list); 5051 raid5_wakeup_stripe_thread(sh); 5052 } 5053 } 5054 } 5055 5056 static void activate_bit_delay(struct r5conf *conf, 5057 struct list_head *temp_inactive_list) 5058 { 5059 /* device_lock is held */ 5060 struct list_head head; 5061 list_add(&head, &conf->bitmap_list); 5062 list_del_init(&conf->bitmap_list); 5063 while (!list_empty(&head)) { 5064 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5065 int hash; 5066 list_del_init(&sh->lru); 5067 atomic_inc(&sh->count); 5068 hash = sh->hash_lock_index; 5069 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5070 } 5071 } 5072 5073 static int raid5_congested(struct mddev *mddev, int bits) 5074 { 5075 struct r5conf *conf = mddev->private; 5076 5077 /* No difference between reads and writes. Just check 5078 * how busy the stripe_cache is 5079 */ 5080 5081 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 5082 return 1; 5083 5084 /* Also checks whether there is pressure on r5cache log space */ 5085 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 5086 return 1; 5087 if (conf->quiesce) 5088 return 1; 5089 if (atomic_read(&conf->empty_inactive_list_nr)) 5090 return 1; 5091 5092 return 0; 5093 } 5094 5095 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5096 { 5097 struct r5conf *conf = mddev->private; 5098 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 5099 unsigned int chunk_sectors; 5100 unsigned int bio_sectors = bio_sectors(bio); 5101 5102 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5103 return chunk_sectors >= 5104 ((sector & (chunk_sectors - 1)) + bio_sectors); 5105 } 5106 5107 /* 5108 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5109 * later sampled by raid5d. 5110 */ 5111 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5112 { 5113 unsigned long flags; 5114 5115 spin_lock_irqsave(&conf->device_lock, flags); 5116 5117 bi->bi_next = conf->retry_read_aligned_list; 5118 conf->retry_read_aligned_list = bi; 5119 5120 spin_unlock_irqrestore(&conf->device_lock, flags); 5121 md_wakeup_thread(conf->mddev->thread); 5122 } 5123 5124 static struct bio *remove_bio_from_retry(struct r5conf *conf) 5125 { 5126 struct bio *bi; 5127 5128 bi = conf->retry_read_aligned; 5129 if (bi) { 5130 conf->retry_read_aligned = NULL; 5131 return bi; 5132 } 5133 bi = conf->retry_read_aligned_list; 5134 if(bi) { 5135 conf->retry_read_aligned_list = bi->bi_next; 5136 bi->bi_next = NULL; 5137 /* 5138 * this sets the active strip count to 1 and the processed 5139 * strip count to zero (upper 8 bits) 5140 */ 5141 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 5142 } 5143 5144 return bi; 5145 } 5146 5147 /* 5148 * The "raid5_align_endio" should check if the read succeeded and if it 5149 * did, call bio_endio on the original bio (having bio_put the new bio 5150 * first). 5151 * If the read failed.. 5152 */ 5153 static void raid5_align_endio(struct bio *bi) 5154 { 5155 struct bio* raid_bi = bi->bi_private; 5156 struct mddev *mddev; 5157 struct r5conf *conf; 5158 struct md_rdev *rdev; 5159 int error = bi->bi_error; 5160 5161 bio_put(bi); 5162 5163 rdev = (void*)raid_bi->bi_next; 5164 raid_bi->bi_next = NULL; 5165 mddev = rdev->mddev; 5166 conf = mddev->private; 5167 5168 rdev_dec_pending(rdev, conf->mddev); 5169 5170 if (!error) { 5171 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 5172 raid_bi, 0); 5173 bio_endio(raid_bi); 5174 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5175 wake_up(&conf->wait_for_quiescent); 5176 return; 5177 } 5178 5179 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5180 5181 add_bio_to_retry(raid_bi, conf); 5182 } 5183 5184 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5185 { 5186 struct r5conf *conf = mddev->private; 5187 int dd_idx; 5188 struct bio* align_bi; 5189 struct md_rdev *rdev; 5190 sector_t end_sector; 5191 5192 if (!in_chunk_boundary(mddev, raid_bio)) { 5193 pr_debug("%s: non aligned\n", __func__); 5194 return 0; 5195 } 5196 /* 5197 * use bio_clone_fast to make a copy of the bio 5198 */ 5199 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); 5200 if (!align_bi) 5201 return 0; 5202 /* 5203 * set bi_end_io to a new function, and set bi_private to the 5204 * original bio. 5205 */ 5206 align_bi->bi_end_io = raid5_align_endio; 5207 align_bi->bi_private = raid_bio; 5208 /* 5209 * compute position 5210 */ 5211 align_bi->bi_iter.bi_sector = 5212 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5213 0, &dd_idx, NULL); 5214 5215 end_sector = bio_end_sector(align_bi); 5216 rcu_read_lock(); 5217 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5218 if (!rdev || test_bit(Faulty, &rdev->flags) || 5219 rdev->recovery_offset < end_sector) { 5220 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5221 if (rdev && 5222 (test_bit(Faulty, &rdev->flags) || 5223 !(test_bit(In_sync, &rdev->flags) || 5224 rdev->recovery_offset >= end_sector))) 5225 rdev = NULL; 5226 } 5227 5228 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5229 rcu_read_unlock(); 5230 bio_put(align_bi); 5231 return 0; 5232 } 5233 5234 if (rdev) { 5235 sector_t first_bad; 5236 int bad_sectors; 5237 5238 atomic_inc(&rdev->nr_pending); 5239 rcu_read_unlock(); 5240 raid_bio->bi_next = (void*)rdev; 5241 align_bi->bi_bdev = rdev->bdev; 5242 bio_clear_flag(align_bi, BIO_SEG_VALID); 5243 5244 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5245 bio_sectors(align_bi), 5246 &first_bad, &bad_sectors)) { 5247 bio_put(align_bi); 5248 rdev_dec_pending(rdev, mddev); 5249 return 0; 5250 } 5251 5252 /* No reshape active, so we can trust rdev->data_offset */ 5253 align_bi->bi_iter.bi_sector += rdev->data_offset; 5254 5255 spin_lock_irq(&conf->device_lock); 5256 wait_event_lock_irq(conf->wait_for_quiescent, 5257 conf->quiesce == 0, 5258 conf->device_lock); 5259 atomic_inc(&conf->active_aligned_reads); 5260 spin_unlock_irq(&conf->device_lock); 5261 5262 if (mddev->gendisk) 5263 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 5264 align_bi, disk_devt(mddev->gendisk), 5265 raid_bio->bi_iter.bi_sector); 5266 generic_make_request(align_bi); 5267 return 1; 5268 } else { 5269 rcu_read_unlock(); 5270 bio_put(align_bi); 5271 return 0; 5272 } 5273 } 5274 5275 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5276 { 5277 struct bio *split; 5278 5279 do { 5280 sector_t sector = raid_bio->bi_iter.bi_sector; 5281 unsigned chunk_sects = mddev->chunk_sectors; 5282 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5283 5284 if (sectors < bio_sectors(raid_bio)) { 5285 split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); 5286 bio_chain(split, raid_bio); 5287 } else 5288 split = raid_bio; 5289 5290 if (!raid5_read_one_chunk(mddev, split)) { 5291 if (split != raid_bio) 5292 generic_make_request(raid_bio); 5293 return split; 5294 } 5295 } while (split != raid_bio); 5296 5297 return NULL; 5298 } 5299 5300 /* __get_priority_stripe - get the next stripe to process 5301 * 5302 * Full stripe writes are allowed to pass preread active stripes up until 5303 * the bypass_threshold is exceeded. In general the bypass_count 5304 * increments when the handle_list is handled before the hold_list; however, it 5305 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5306 * stripe with in flight i/o. The bypass_count will be reset when the 5307 * head of the hold_list has changed, i.e. the head was promoted to the 5308 * handle_list. 5309 */ 5310 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5311 { 5312 struct stripe_head *sh, *tmp; 5313 struct list_head *handle_list = NULL; 5314 struct r5worker_group *wg; 5315 bool second_try = !r5c_is_writeback(conf->log); 5316 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); 5317 5318 again: 5319 wg = NULL; 5320 sh = NULL; 5321 if (conf->worker_cnt_per_group == 0) { 5322 handle_list = try_loprio ? &conf->loprio_list : 5323 &conf->handle_list; 5324 } else if (group != ANY_GROUP) { 5325 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5326 &conf->worker_groups[group].handle_list; 5327 wg = &conf->worker_groups[group]; 5328 } else { 5329 int i; 5330 for (i = 0; i < conf->group_cnt; i++) { 5331 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5332 &conf->worker_groups[i].handle_list; 5333 wg = &conf->worker_groups[i]; 5334 if (!list_empty(handle_list)) 5335 break; 5336 } 5337 } 5338 5339 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5340 __func__, 5341 list_empty(handle_list) ? "empty" : "busy", 5342 list_empty(&conf->hold_list) ? "empty" : "busy", 5343 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5344 5345 if (!list_empty(handle_list)) { 5346 sh = list_entry(handle_list->next, typeof(*sh), lru); 5347 5348 if (list_empty(&conf->hold_list)) 5349 conf->bypass_count = 0; 5350 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5351 if (conf->hold_list.next == conf->last_hold) 5352 conf->bypass_count++; 5353 else { 5354 conf->last_hold = conf->hold_list.next; 5355 conf->bypass_count -= conf->bypass_threshold; 5356 if (conf->bypass_count < 0) 5357 conf->bypass_count = 0; 5358 } 5359 } 5360 } else if (!list_empty(&conf->hold_list) && 5361 ((conf->bypass_threshold && 5362 conf->bypass_count > conf->bypass_threshold) || 5363 atomic_read(&conf->pending_full_writes) == 0)) { 5364 5365 list_for_each_entry(tmp, &conf->hold_list, lru) { 5366 if (conf->worker_cnt_per_group == 0 || 5367 group == ANY_GROUP || 5368 !cpu_online(tmp->cpu) || 5369 cpu_to_group(tmp->cpu) == group) { 5370 sh = tmp; 5371 break; 5372 } 5373 } 5374 5375 if (sh) { 5376 conf->bypass_count -= conf->bypass_threshold; 5377 if (conf->bypass_count < 0) 5378 conf->bypass_count = 0; 5379 } 5380 wg = NULL; 5381 } 5382 5383 if (!sh) { 5384 if (second_try) 5385 return NULL; 5386 second_try = true; 5387 try_loprio = !try_loprio; 5388 goto again; 5389 } 5390 5391 if (wg) { 5392 wg->stripes_cnt--; 5393 sh->group = NULL; 5394 } 5395 list_del_init(&sh->lru); 5396 BUG_ON(atomic_inc_return(&sh->count) != 1); 5397 return sh; 5398 } 5399 5400 struct raid5_plug_cb { 5401 struct blk_plug_cb cb; 5402 struct list_head list; 5403 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5404 }; 5405 5406 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5407 { 5408 struct raid5_plug_cb *cb = container_of( 5409 blk_cb, struct raid5_plug_cb, cb); 5410 struct stripe_head *sh; 5411 struct mddev *mddev = cb->cb.data; 5412 struct r5conf *conf = mddev->private; 5413 int cnt = 0; 5414 int hash; 5415 5416 if (cb->list.next && !list_empty(&cb->list)) { 5417 spin_lock_irq(&conf->device_lock); 5418 while (!list_empty(&cb->list)) { 5419 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5420 list_del_init(&sh->lru); 5421 /* 5422 * avoid race release_stripe_plug() sees 5423 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5424 * is still in our list 5425 */ 5426 smp_mb__before_atomic(); 5427 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5428 /* 5429 * STRIPE_ON_RELEASE_LIST could be set here. In that 5430 * case, the count is always > 1 here 5431 */ 5432 hash = sh->hash_lock_index; 5433 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5434 cnt++; 5435 } 5436 spin_unlock_irq(&conf->device_lock); 5437 } 5438 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5439 NR_STRIPE_HASH_LOCKS); 5440 if (mddev->queue) 5441 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5442 kfree(cb); 5443 } 5444 5445 static void release_stripe_plug(struct mddev *mddev, 5446 struct stripe_head *sh) 5447 { 5448 struct blk_plug_cb *blk_cb = blk_check_plugged( 5449 raid5_unplug, mddev, 5450 sizeof(struct raid5_plug_cb)); 5451 struct raid5_plug_cb *cb; 5452 5453 if (!blk_cb) { 5454 raid5_release_stripe(sh); 5455 return; 5456 } 5457 5458 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5459 5460 if (cb->list.next == NULL) { 5461 int i; 5462 INIT_LIST_HEAD(&cb->list); 5463 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5464 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5465 } 5466 5467 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5468 list_add_tail(&sh->lru, &cb->list); 5469 else 5470 raid5_release_stripe(sh); 5471 } 5472 5473 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5474 { 5475 struct r5conf *conf = mddev->private; 5476 sector_t logical_sector, last_sector; 5477 struct stripe_head *sh; 5478 int remaining; 5479 int stripe_sectors; 5480 5481 if (mddev->reshape_position != MaxSector) 5482 /* Skip discard while reshape is happening */ 5483 return; 5484 5485 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5486 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5487 5488 bi->bi_next = NULL; 5489 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5490 5491 stripe_sectors = conf->chunk_sectors * 5492 (conf->raid_disks - conf->max_degraded); 5493 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5494 stripe_sectors); 5495 sector_div(last_sector, stripe_sectors); 5496 5497 logical_sector *= conf->chunk_sectors; 5498 last_sector *= conf->chunk_sectors; 5499 5500 for (; logical_sector < last_sector; 5501 logical_sector += STRIPE_SECTORS) { 5502 DEFINE_WAIT(w); 5503 int d; 5504 again: 5505 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5506 prepare_to_wait(&conf->wait_for_overlap, &w, 5507 TASK_UNINTERRUPTIBLE); 5508 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5509 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5510 raid5_release_stripe(sh); 5511 schedule(); 5512 goto again; 5513 } 5514 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5515 spin_lock_irq(&sh->stripe_lock); 5516 for (d = 0; d < conf->raid_disks; d++) { 5517 if (d == sh->pd_idx || d == sh->qd_idx) 5518 continue; 5519 if (sh->dev[d].towrite || sh->dev[d].toread) { 5520 set_bit(R5_Overlap, &sh->dev[d].flags); 5521 spin_unlock_irq(&sh->stripe_lock); 5522 raid5_release_stripe(sh); 5523 schedule(); 5524 goto again; 5525 } 5526 } 5527 set_bit(STRIPE_DISCARD, &sh->state); 5528 finish_wait(&conf->wait_for_overlap, &w); 5529 sh->overwrite_disks = 0; 5530 for (d = 0; d < conf->raid_disks; d++) { 5531 if (d == sh->pd_idx || d == sh->qd_idx) 5532 continue; 5533 sh->dev[d].towrite = bi; 5534 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5535 raid5_inc_bi_active_stripes(bi); 5536 sh->overwrite_disks++; 5537 } 5538 spin_unlock_irq(&sh->stripe_lock); 5539 if (conf->mddev->bitmap) { 5540 for (d = 0; 5541 d < conf->raid_disks - conf->max_degraded; 5542 d++) 5543 bitmap_startwrite(mddev->bitmap, 5544 sh->sector, 5545 STRIPE_SECTORS, 5546 0); 5547 sh->bm_seq = conf->seq_flush + 1; 5548 set_bit(STRIPE_BIT_DELAY, &sh->state); 5549 } 5550 5551 set_bit(STRIPE_HANDLE, &sh->state); 5552 clear_bit(STRIPE_DELAYED, &sh->state); 5553 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5554 atomic_inc(&conf->preread_active_stripes); 5555 release_stripe_plug(mddev, sh); 5556 } 5557 5558 remaining = raid5_dec_bi_active_stripes(bi); 5559 if (remaining == 0) { 5560 md_write_end(mddev); 5561 bio_endio(bi); 5562 } 5563 } 5564 5565 static void raid5_make_request(struct mddev *mddev, struct bio * bi) 5566 { 5567 struct r5conf *conf = mddev->private; 5568 int dd_idx; 5569 sector_t new_sector; 5570 sector_t logical_sector, last_sector; 5571 struct stripe_head *sh; 5572 const int rw = bio_data_dir(bi); 5573 int remaining; 5574 DEFINE_WAIT(w); 5575 bool do_prepare; 5576 bool do_flush = false; 5577 5578 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5579 int ret = r5l_handle_flush_request(conf->log, bi); 5580 5581 if (ret == 0) 5582 return; 5583 if (ret == -ENODEV) { 5584 md_flush_request(mddev, bi); 5585 return; 5586 } 5587 /* ret == -EAGAIN, fallback */ 5588 /* 5589 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5590 * we need to flush journal device 5591 */ 5592 do_flush = bi->bi_opf & REQ_PREFLUSH; 5593 } 5594 5595 md_write_start(mddev, bi); 5596 5597 /* 5598 * If array is degraded, better not do chunk aligned read because 5599 * later we might have to read it again in order to reconstruct 5600 * data on failed drives. 5601 */ 5602 if (rw == READ && mddev->degraded == 0 && 5603 mddev->reshape_position == MaxSector) { 5604 bi = chunk_aligned_read(mddev, bi); 5605 if (!bi) 5606 return; 5607 } 5608 5609 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5610 make_discard_request(mddev, bi); 5611 return; 5612 } 5613 5614 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5615 last_sector = bio_end_sector(bi); 5616 bi->bi_next = NULL; 5617 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5618 5619 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5620 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5621 int previous; 5622 int seq; 5623 5624 do_prepare = false; 5625 retry: 5626 seq = read_seqcount_begin(&conf->gen_lock); 5627 previous = 0; 5628 if (do_prepare) 5629 prepare_to_wait(&conf->wait_for_overlap, &w, 5630 TASK_UNINTERRUPTIBLE); 5631 if (unlikely(conf->reshape_progress != MaxSector)) { 5632 /* spinlock is needed as reshape_progress may be 5633 * 64bit on a 32bit platform, and so it might be 5634 * possible to see a half-updated value 5635 * Of course reshape_progress could change after 5636 * the lock is dropped, so once we get a reference 5637 * to the stripe that we think it is, we will have 5638 * to check again. 5639 */ 5640 spin_lock_irq(&conf->device_lock); 5641 if (mddev->reshape_backwards 5642 ? logical_sector < conf->reshape_progress 5643 : logical_sector >= conf->reshape_progress) { 5644 previous = 1; 5645 } else { 5646 if (mddev->reshape_backwards 5647 ? logical_sector < conf->reshape_safe 5648 : logical_sector >= conf->reshape_safe) { 5649 spin_unlock_irq(&conf->device_lock); 5650 schedule(); 5651 do_prepare = true; 5652 goto retry; 5653 } 5654 } 5655 spin_unlock_irq(&conf->device_lock); 5656 } 5657 5658 new_sector = raid5_compute_sector(conf, logical_sector, 5659 previous, 5660 &dd_idx, NULL); 5661 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5662 (unsigned long long)new_sector, 5663 (unsigned long long)logical_sector); 5664 5665 sh = raid5_get_active_stripe(conf, new_sector, previous, 5666 (bi->bi_opf & REQ_RAHEAD), 0); 5667 if (sh) { 5668 if (unlikely(previous)) { 5669 /* expansion might have moved on while waiting for a 5670 * stripe, so we must do the range check again. 5671 * Expansion could still move past after this 5672 * test, but as we are holding a reference to 5673 * 'sh', we know that if that happens, 5674 * STRIPE_EXPANDING will get set and the expansion 5675 * won't proceed until we finish with the stripe. 5676 */ 5677 int must_retry = 0; 5678 spin_lock_irq(&conf->device_lock); 5679 if (mddev->reshape_backwards 5680 ? logical_sector >= conf->reshape_progress 5681 : logical_sector < conf->reshape_progress) 5682 /* mismatch, need to try again */ 5683 must_retry = 1; 5684 spin_unlock_irq(&conf->device_lock); 5685 if (must_retry) { 5686 raid5_release_stripe(sh); 5687 schedule(); 5688 do_prepare = true; 5689 goto retry; 5690 } 5691 } 5692 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5693 /* Might have got the wrong stripe_head 5694 * by accident 5695 */ 5696 raid5_release_stripe(sh); 5697 goto retry; 5698 } 5699 5700 if (rw == WRITE && 5701 logical_sector >= mddev->suspend_lo && 5702 logical_sector < mddev->suspend_hi) { 5703 raid5_release_stripe(sh); 5704 /* As the suspend_* range is controlled by 5705 * userspace, we want an interruptible 5706 * wait. 5707 */ 5708 flush_signals(current); 5709 prepare_to_wait(&conf->wait_for_overlap, 5710 &w, TASK_INTERRUPTIBLE); 5711 if (logical_sector >= mddev->suspend_lo && 5712 logical_sector < mddev->suspend_hi) { 5713 schedule(); 5714 do_prepare = true; 5715 } 5716 goto retry; 5717 } 5718 5719 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5720 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5721 /* Stripe is busy expanding or 5722 * add failed due to overlap. Flush everything 5723 * and wait a while 5724 */ 5725 md_wakeup_thread(mddev->thread); 5726 raid5_release_stripe(sh); 5727 schedule(); 5728 do_prepare = true; 5729 goto retry; 5730 } 5731 if (do_flush) { 5732 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5733 /* we only need flush for one stripe */ 5734 do_flush = false; 5735 } 5736 5737 set_bit(STRIPE_HANDLE, &sh->state); 5738 clear_bit(STRIPE_DELAYED, &sh->state); 5739 if ((!sh->batch_head || sh == sh->batch_head) && 5740 (bi->bi_opf & REQ_SYNC) && 5741 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5742 atomic_inc(&conf->preread_active_stripes); 5743 release_stripe_plug(mddev, sh); 5744 } else { 5745 /* cannot get stripe for read-ahead, just give-up */ 5746 bi->bi_error = -EIO; 5747 break; 5748 } 5749 } 5750 finish_wait(&conf->wait_for_overlap, &w); 5751 5752 remaining = raid5_dec_bi_active_stripes(bi); 5753 if (remaining == 0) { 5754 5755 if ( rw == WRITE ) 5756 md_write_end(mddev); 5757 5758 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 5759 bi, 0); 5760 bio_endio(bi); 5761 } 5762 } 5763 5764 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5765 5766 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5767 { 5768 /* reshaping is quite different to recovery/resync so it is 5769 * handled quite separately ... here. 5770 * 5771 * On each call to sync_request, we gather one chunk worth of 5772 * destination stripes and flag them as expanding. 5773 * Then we find all the source stripes and request reads. 5774 * As the reads complete, handle_stripe will copy the data 5775 * into the destination stripe and release that stripe. 5776 */ 5777 struct r5conf *conf = mddev->private; 5778 struct stripe_head *sh; 5779 sector_t first_sector, last_sector; 5780 int raid_disks = conf->previous_raid_disks; 5781 int data_disks = raid_disks - conf->max_degraded; 5782 int new_data_disks = conf->raid_disks - conf->max_degraded; 5783 int i; 5784 int dd_idx; 5785 sector_t writepos, readpos, safepos; 5786 sector_t stripe_addr; 5787 int reshape_sectors; 5788 struct list_head stripes; 5789 sector_t retn; 5790 5791 if (sector_nr == 0) { 5792 /* If restarting in the middle, skip the initial sectors */ 5793 if (mddev->reshape_backwards && 5794 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5795 sector_nr = raid5_size(mddev, 0, 0) 5796 - conf->reshape_progress; 5797 } else if (mddev->reshape_backwards && 5798 conf->reshape_progress == MaxSector) { 5799 /* shouldn't happen, but just in case, finish up.*/ 5800 sector_nr = MaxSector; 5801 } else if (!mddev->reshape_backwards && 5802 conf->reshape_progress > 0) 5803 sector_nr = conf->reshape_progress; 5804 sector_div(sector_nr, new_data_disks); 5805 if (sector_nr) { 5806 mddev->curr_resync_completed = sector_nr; 5807 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5808 *skipped = 1; 5809 retn = sector_nr; 5810 goto finish; 5811 } 5812 } 5813 5814 /* We need to process a full chunk at a time. 5815 * If old and new chunk sizes differ, we need to process the 5816 * largest of these 5817 */ 5818 5819 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5820 5821 /* We update the metadata at least every 10 seconds, or when 5822 * the data about to be copied would over-write the source of 5823 * the data at the front of the range. i.e. one new_stripe 5824 * along from reshape_progress new_maps to after where 5825 * reshape_safe old_maps to 5826 */ 5827 writepos = conf->reshape_progress; 5828 sector_div(writepos, new_data_disks); 5829 readpos = conf->reshape_progress; 5830 sector_div(readpos, data_disks); 5831 safepos = conf->reshape_safe; 5832 sector_div(safepos, data_disks); 5833 if (mddev->reshape_backwards) { 5834 BUG_ON(writepos < reshape_sectors); 5835 writepos -= reshape_sectors; 5836 readpos += reshape_sectors; 5837 safepos += reshape_sectors; 5838 } else { 5839 writepos += reshape_sectors; 5840 /* readpos and safepos are worst-case calculations. 5841 * A negative number is overly pessimistic, and causes 5842 * obvious problems for unsigned storage. So clip to 0. 5843 */ 5844 readpos -= min_t(sector_t, reshape_sectors, readpos); 5845 safepos -= min_t(sector_t, reshape_sectors, safepos); 5846 } 5847 5848 /* Having calculated the 'writepos' possibly use it 5849 * to set 'stripe_addr' which is where we will write to. 5850 */ 5851 if (mddev->reshape_backwards) { 5852 BUG_ON(conf->reshape_progress == 0); 5853 stripe_addr = writepos; 5854 BUG_ON((mddev->dev_sectors & 5855 ~((sector_t)reshape_sectors - 1)) 5856 - reshape_sectors - stripe_addr 5857 != sector_nr); 5858 } else { 5859 BUG_ON(writepos != sector_nr + reshape_sectors); 5860 stripe_addr = sector_nr; 5861 } 5862 5863 /* 'writepos' is the most advanced device address we might write. 5864 * 'readpos' is the least advanced device address we might read. 5865 * 'safepos' is the least address recorded in the metadata as having 5866 * been reshaped. 5867 * If there is a min_offset_diff, these are adjusted either by 5868 * increasing the safepos/readpos if diff is negative, or 5869 * increasing writepos if diff is positive. 5870 * If 'readpos' is then behind 'writepos', there is no way that we can 5871 * ensure safety in the face of a crash - that must be done by userspace 5872 * making a backup of the data. So in that case there is no particular 5873 * rush to update metadata. 5874 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5875 * update the metadata to advance 'safepos' to match 'readpos' so that 5876 * we can be safe in the event of a crash. 5877 * So we insist on updating metadata if safepos is behind writepos and 5878 * readpos is beyond writepos. 5879 * In any case, update the metadata every 10 seconds. 5880 * Maybe that number should be configurable, but I'm not sure it is 5881 * worth it.... maybe it could be a multiple of safemode_delay??? 5882 */ 5883 if (conf->min_offset_diff < 0) { 5884 safepos += -conf->min_offset_diff; 5885 readpos += -conf->min_offset_diff; 5886 } else 5887 writepos += conf->min_offset_diff; 5888 5889 if ((mddev->reshape_backwards 5890 ? (safepos > writepos && readpos < writepos) 5891 : (safepos < writepos && readpos > writepos)) || 5892 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5893 /* Cannot proceed until we've updated the superblock... */ 5894 wait_event(conf->wait_for_overlap, 5895 atomic_read(&conf->reshape_stripes)==0 5896 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5897 if (atomic_read(&conf->reshape_stripes) != 0) 5898 return 0; 5899 mddev->reshape_position = conf->reshape_progress; 5900 mddev->curr_resync_completed = sector_nr; 5901 conf->reshape_checkpoint = jiffies; 5902 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5903 md_wakeup_thread(mddev->thread); 5904 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5905 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5906 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5907 return 0; 5908 spin_lock_irq(&conf->device_lock); 5909 conf->reshape_safe = mddev->reshape_position; 5910 spin_unlock_irq(&conf->device_lock); 5911 wake_up(&conf->wait_for_overlap); 5912 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5913 } 5914 5915 INIT_LIST_HEAD(&stripes); 5916 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5917 int j; 5918 int skipped_disk = 0; 5919 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5920 set_bit(STRIPE_EXPANDING, &sh->state); 5921 atomic_inc(&conf->reshape_stripes); 5922 /* If any of this stripe is beyond the end of the old 5923 * array, then we need to zero those blocks 5924 */ 5925 for (j=sh->disks; j--;) { 5926 sector_t s; 5927 if (j == sh->pd_idx) 5928 continue; 5929 if (conf->level == 6 && 5930 j == sh->qd_idx) 5931 continue; 5932 s = raid5_compute_blocknr(sh, j, 0); 5933 if (s < raid5_size(mddev, 0, 0)) { 5934 skipped_disk = 1; 5935 continue; 5936 } 5937 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5938 set_bit(R5_Expanded, &sh->dev[j].flags); 5939 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5940 } 5941 if (!skipped_disk) { 5942 set_bit(STRIPE_EXPAND_READY, &sh->state); 5943 set_bit(STRIPE_HANDLE, &sh->state); 5944 } 5945 list_add(&sh->lru, &stripes); 5946 } 5947 spin_lock_irq(&conf->device_lock); 5948 if (mddev->reshape_backwards) 5949 conf->reshape_progress -= reshape_sectors * new_data_disks; 5950 else 5951 conf->reshape_progress += reshape_sectors * new_data_disks; 5952 spin_unlock_irq(&conf->device_lock); 5953 /* Ok, those stripe are ready. We can start scheduling 5954 * reads on the source stripes. 5955 * The source stripes are determined by mapping the first and last 5956 * block on the destination stripes. 5957 */ 5958 first_sector = 5959 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5960 1, &dd_idx, NULL); 5961 last_sector = 5962 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5963 * new_data_disks - 1), 5964 1, &dd_idx, NULL); 5965 if (last_sector >= mddev->dev_sectors) 5966 last_sector = mddev->dev_sectors - 1; 5967 while (first_sector <= last_sector) { 5968 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5969 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5970 set_bit(STRIPE_HANDLE, &sh->state); 5971 raid5_release_stripe(sh); 5972 first_sector += STRIPE_SECTORS; 5973 } 5974 /* Now that the sources are clearly marked, we can release 5975 * the destination stripes 5976 */ 5977 while (!list_empty(&stripes)) { 5978 sh = list_entry(stripes.next, struct stripe_head, lru); 5979 list_del_init(&sh->lru); 5980 raid5_release_stripe(sh); 5981 } 5982 /* If this takes us to the resync_max point where we have to pause, 5983 * then we need to write out the superblock. 5984 */ 5985 sector_nr += reshape_sectors; 5986 retn = reshape_sectors; 5987 finish: 5988 if (mddev->curr_resync_completed > mddev->resync_max || 5989 (sector_nr - mddev->curr_resync_completed) * 2 5990 >= mddev->resync_max - mddev->curr_resync_completed) { 5991 /* Cannot proceed until we've updated the superblock... */ 5992 wait_event(conf->wait_for_overlap, 5993 atomic_read(&conf->reshape_stripes) == 0 5994 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5995 if (atomic_read(&conf->reshape_stripes) != 0) 5996 goto ret; 5997 mddev->reshape_position = conf->reshape_progress; 5998 mddev->curr_resync_completed = sector_nr; 5999 conf->reshape_checkpoint = jiffies; 6000 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6001 md_wakeup_thread(mddev->thread); 6002 wait_event(mddev->sb_wait, 6003 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 6004 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6005 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6006 goto ret; 6007 spin_lock_irq(&conf->device_lock); 6008 conf->reshape_safe = mddev->reshape_position; 6009 spin_unlock_irq(&conf->device_lock); 6010 wake_up(&conf->wait_for_overlap); 6011 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6012 } 6013 ret: 6014 return retn; 6015 } 6016 6017 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 6018 int *skipped) 6019 { 6020 struct r5conf *conf = mddev->private; 6021 struct stripe_head *sh; 6022 sector_t max_sector = mddev->dev_sectors; 6023 sector_t sync_blocks; 6024 int still_degraded = 0; 6025 int i; 6026 6027 if (sector_nr >= max_sector) { 6028 /* just being told to finish up .. nothing much to do */ 6029 6030 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 6031 end_reshape(conf); 6032 return 0; 6033 } 6034 6035 if (mddev->curr_resync < max_sector) /* aborted */ 6036 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 6037 &sync_blocks, 1); 6038 else /* completed sync */ 6039 conf->fullsync = 0; 6040 bitmap_close_sync(mddev->bitmap); 6041 6042 return 0; 6043 } 6044 6045 /* Allow raid5_quiesce to complete */ 6046 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 6047 6048 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6049 return reshape_request(mddev, sector_nr, skipped); 6050 6051 /* No need to check resync_max as we never do more than one 6052 * stripe, and as resync_max will always be on a chunk boundary, 6053 * if the check in md_do_sync didn't fire, there is no chance 6054 * of overstepping resync_max here 6055 */ 6056 6057 /* if there is too many failed drives and we are trying 6058 * to resync, then assert that we are finished, because there is 6059 * nothing we can do. 6060 */ 6061 if (mddev->degraded >= conf->max_degraded && 6062 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6063 sector_t rv = mddev->dev_sectors - sector_nr; 6064 *skipped = 1; 6065 return rv; 6066 } 6067 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6068 !conf->fullsync && 6069 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6070 sync_blocks >= STRIPE_SECTORS) { 6071 /* we can skip this block, and probably more */ 6072 sync_blocks /= STRIPE_SECTORS; 6073 *skipped = 1; 6074 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 6075 } 6076 6077 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6078 6079 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 6080 if (sh == NULL) { 6081 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 6082 /* make sure we don't swamp the stripe cache if someone else 6083 * is trying to get access 6084 */ 6085 schedule_timeout_uninterruptible(1); 6086 } 6087 /* Need to check if array will still be degraded after recovery/resync 6088 * Note in case of > 1 drive failures it's possible we're rebuilding 6089 * one drive while leaving another faulty drive in array. 6090 */ 6091 rcu_read_lock(); 6092 for (i = 0; i < conf->raid_disks; i++) { 6093 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); 6094 6095 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6096 still_degraded = 1; 6097 } 6098 rcu_read_unlock(); 6099 6100 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6101 6102 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6103 set_bit(STRIPE_HANDLE, &sh->state); 6104 6105 raid5_release_stripe(sh); 6106 6107 return STRIPE_SECTORS; 6108 } 6109 6110 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 6111 { 6112 /* We may not be able to submit a whole bio at once as there 6113 * may not be enough stripe_heads available. 6114 * We cannot pre-allocate enough stripe_heads as we may need 6115 * more than exist in the cache (if we allow ever large chunks). 6116 * So we do one stripe head at a time and record in 6117 * ->bi_hw_segments how many have been done. 6118 * 6119 * We *know* that this entire raid_bio is in one chunk, so 6120 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 6121 */ 6122 struct stripe_head *sh; 6123 int dd_idx; 6124 sector_t sector, logical_sector, last_sector; 6125 int scnt = 0; 6126 int remaining; 6127 int handled = 0; 6128 6129 logical_sector = raid_bio->bi_iter.bi_sector & 6130 ~((sector_t)STRIPE_SECTORS-1); 6131 sector = raid5_compute_sector(conf, logical_sector, 6132 0, &dd_idx, NULL); 6133 last_sector = bio_end_sector(raid_bio); 6134 6135 for (; logical_sector < last_sector; 6136 logical_sector += STRIPE_SECTORS, 6137 sector += STRIPE_SECTORS, 6138 scnt++) { 6139 6140 if (scnt < raid5_bi_processed_stripes(raid_bio)) 6141 /* already done this stripe */ 6142 continue; 6143 6144 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6145 6146 if (!sh) { 6147 /* failed to get a stripe - must wait */ 6148 raid5_set_bi_processed_stripes(raid_bio, scnt); 6149 conf->retry_read_aligned = raid_bio; 6150 return handled; 6151 } 6152 6153 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6154 raid5_release_stripe(sh); 6155 raid5_set_bi_processed_stripes(raid_bio, scnt); 6156 conf->retry_read_aligned = raid_bio; 6157 return handled; 6158 } 6159 6160 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6161 handle_stripe(sh); 6162 raid5_release_stripe(sh); 6163 handled++; 6164 } 6165 remaining = raid5_dec_bi_active_stripes(raid_bio); 6166 if (remaining == 0) { 6167 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 6168 raid_bio, 0); 6169 bio_endio(raid_bio); 6170 } 6171 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6172 wake_up(&conf->wait_for_quiescent); 6173 return handled; 6174 } 6175 6176 static int handle_active_stripes(struct r5conf *conf, int group, 6177 struct r5worker *worker, 6178 struct list_head *temp_inactive_list) 6179 { 6180 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6181 int i, batch_size = 0, hash; 6182 bool release_inactive = false; 6183 6184 while (batch_size < MAX_STRIPE_BATCH && 6185 (sh = __get_priority_stripe(conf, group)) != NULL) 6186 batch[batch_size++] = sh; 6187 6188 if (batch_size == 0) { 6189 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6190 if (!list_empty(temp_inactive_list + i)) 6191 break; 6192 if (i == NR_STRIPE_HASH_LOCKS) { 6193 spin_unlock_irq(&conf->device_lock); 6194 r5l_flush_stripe_to_raid(conf->log); 6195 spin_lock_irq(&conf->device_lock); 6196 return batch_size; 6197 } 6198 release_inactive = true; 6199 } 6200 spin_unlock_irq(&conf->device_lock); 6201 6202 release_inactive_stripe_list(conf, temp_inactive_list, 6203 NR_STRIPE_HASH_LOCKS); 6204 6205 r5l_flush_stripe_to_raid(conf->log); 6206 if (release_inactive) { 6207 spin_lock_irq(&conf->device_lock); 6208 return 0; 6209 } 6210 6211 for (i = 0; i < batch_size; i++) 6212 handle_stripe(batch[i]); 6213 log_write_stripe_run(conf); 6214 6215 cond_resched(); 6216 6217 spin_lock_irq(&conf->device_lock); 6218 for (i = 0; i < batch_size; i++) { 6219 hash = batch[i]->hash_lock_index; 6220 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6221 } 6222 return batch_size; 6223 } 6224 6225 static void raid5_do_work(struct work_struct *work) 6226 { 6227 struct r5worker *worker = container_of(work, struct r5worker, work); 6228 struct r5worker_group *group = worker->group; 6229 struct r5conf *conf = group->conf; 6230 int group_id = group - conf->worker_groups; 6231 int handled; 6232 struct blk_plug plug; 6233 6234 pr_debug("+++ raid5worker active\n"); 6235 6236 blk_start_plug(&plug); 6237 handled = 0; 6238 spin_lock_irq(&conf->device_lock); 6239 while (1) { 6240 int batch_size, released; 6241 6242 released = release_stripe_list(conf, worker->temp_inactive_list); 6243 6244 batch_size = handle_active_stripes(conf, group_id, worker, 6245 worker->temp_inactive_list); 6246 worker->working = false; 6247 if (!batch_size && !released) 6248 break; 6249 handled += batch_size; 6250 } 6251 pr_debug("%d stripes handled\n", handled); 6252 6253 spin_unlock_irq(&conf->device_lock); 6254 blk_finish_plug(&plug); 6255 6256 pr_debug("--- raid5worker inactive\n"); 6257 } 6258 6259 /* 6260 * This is our raid5 kernel thread. 6261 * 6262 * We scan the hash table for stripes which can be handled now. 6263 * During the scan, completed stripes are saved for us by the interrupt 6264 * handler, so that they will not have to wait for our next wakeup. 6265 */ 6266 static void raid5d(struct md_thread *thread) 6267 { 6268 struct mddev *mddev = thread->mddev; 6269 struct r5conf *conf = mddev->private; 6270 int handled; 6271 struct blk_plug plug; 6272 6273 pr_debug("+++ raid5d active\n"); 6274 6275 md_check_recovery(mddev); 6276 6277 if (!bio_list_empty(&conf->return_bi) && 6278 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 6279 struct bio_list tmp = BIO_EMPTY_LIST; 6280 spin_lock_irq(&conf->device_lock); 6281 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 6282 bio_list_merge(&tmp, &conf->return_bi); 6283 bio_list_init(&conf->return_bi); 6284 } 6285 spin_unlock_irq(&conf->device_lock); 6286 return_io(&tmp); 6287 } 6288 6289 blk_start_plug(&plug); 6290 handled = 0; 6291 spin_lock_irq(&conf->device_lock); 6292 while (1) { 6293 struct bio *bio; 6294 int batch_size, released; 6295 6296 released = release_stripe_list(conf, conf->temp_inactive_list); 6297 if (released) 6298 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6299 6300 if ( 6301 !list_empty(&conf->bitmap_list)) { 6302 /* Now is a good time to flush some bitmap updates */ 6303 conf->seq_flush++; 6304 spin_unlock_irq(&conf->device_lock); 6305 bitmap_unplug(mddev->bitmap); 6306 spin_lock_irq(&conf->device_lock); 6307 conf->seq_write = conf->seq_flush; 6308 activate_bit_delay(conf, conf->temp_inactive_list); 6309 } 6310 raid5_activate_delayed(conf); 6311 6312 while ((bio = remove_bio_from_retry(conf))) { 6313 int ok; 6314 spin_unlock_irq(&conf->device_lock); 6315 ok = retry_aligned_read(conf, bio); 6316 spin_lock_irq(&conf->device_lock); 6317 if (!ok) 6318 break; 6319 handled++; 6320 } 6321 6322 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6323 conf->temp_inactive_list); 6324 if (!batch_size && !released) 6325 break; 6326 handled += batch_size; 6327 6328 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6329 spin_unlock_irq(&conf->device_lock); 6330 md_check_recovery(mddev); 6331 spin_lock_irq(&conf->device_lock); 6332 } 6333 } 6334 pr_debug("%d stripes handled\n", handled); 6335 6336 spin_unlock_irq(&conf->device_lock); 6337 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6338 mutex_trylock(&conf->cache_size_mutex)) { 6339 grow_one_stripe(conf, __GFP_NOWARN); 6340 /* Set flag even if allocation failed. This helps 6341 * slow down allocation requests when mem is short 6342 */ 6343 set_bit(R5_DID_ALLOC, &conf->cache_state); 6344 mutex_unlock(&conf->cache_size_mutex); 6345 } 6346 6347 flush_deferred_bios(conf); 6348 6349 r5l_flush_stripe_to_raid(conf->log); 6350 6351 async_tx_issue_pending_all(); 6352 blk_finish_plug(&plug); 6353 6354 pr_debug("--- raid5d inactive\n"); 6355 } 6356 6357 static ssize_t 6358 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6359 { 6360 struct r5conf *conf; 6361 int ret = 0; 6362 spin_lock(&mddev->lock); 6363 conf = mddev->private; 6364 if (conf) 6365 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6366 spin_unlock(&mddev->lock); 6367 return ret; 6368 } 6369 6370 int 6371 raid5_set_cache_size(struct mddev *mddev, int size) 6372 { 6373 struct r5conf *conf = mddev->private; 6374 int err; 6375 6376 if (size <= 16 || size > 32768) 6377 return -EINVAL; 6378 6379 conf->min_nr_stripes = size; 6380 mutex_lock(&conf->cache_size_mutex); 6381 while (size < conf->max_nr_stripes && 6382 drop_one_stripe(conf)) 6383 ; 6384 mutex_unlock(&conf->cache_size_mutex); 6385 6386 6387 err = md_allow_write(mddev); 6388 if (err) 6389 return err; 6390 6391 mutex_lock(&conf->cache_size_mutex); 6392 while (size > conf->max_nr_stripes) 6393 if (!grow_one_stripe(conf, GFP_KERNEL)) 6394 break; 6395 mutex_unlock(&conf->cache_size_mutex); 6396 6397 return 0; 6398 } 6399 EXPORT_SYMBOL(raid5_set_cache_size); 6400 6401 static ssize_t 6402 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6403 { 6404 struct r5conf *conf; 6405 unsigned long new; 6406 int err; 6407 6408 if (len >= PAGE_SIZE) 6409 return -EINVAL; 6410 if (kstrtoul(page, 10, &new)) 6411 return -EINVAL; 6412 err = mddev_lock(mddev); 6413 if (err) 6414 return err; 6415 conf = mddev->private; 6416 if (!conf) 6417 err = -ENODEV; 6418 else 6419 err = raid5_set_cache_size(mddev, new); 6420 mddev_unlock(mddev); 6421 6422 return err ?: len; 6423 } 6424 6425 static struct md_sysfs_entry 6426 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6427 raid5_show_stripe_cache_size, 6428 raid5_store_stripe_cache_size); 6429 6430 static ssize_t 6431 raid5_show_rmw_level(struct mddev *mddev, char *page) 6432 { 6433 struct r5conf *conf = mddev->private; 6434 if (conf) 6435 return sprintf(page, "%d\n", conf->rmw_level); 6436 else 6437 return 0; 6438 } 6439 6440 static ssize_t 6441 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6442 { 6443 struct r5conf *conf = mddev->private; 6444 unsigned long new; 6445 6446 if (!conf) 6447 return -ENODEV; 6448 6449 if (len >= PAGE_SIZE) 6450 return -EINVAL; 6451 6452 if (kstrtoul(page, 10, &new)) 6453 return -EINVAL; 6454 6455 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6456 return -EINVAL; 6457 6458 if (new != PARITY_DISABLE_RMW && 6459 new != PARITY_ENABLE_RMW && 6460 new != PARITY_PREFER_RMW) 6461 return -EINVAL; 6462 6463 conf->rmw_level = new; 6464 return len; 6465 } 6466 6467 static struct md_sysfs_entry 6468 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6469 raid5_show_rmw_level, 6470 raid5_store_rmw_level); 6471 6472 6473 static ssize_t 6474 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6475 { 6476 struct r5conf *conf; 6477 int ret = 0; 6478 spin_lock(&mddev->lock); 6479 conf = mddev->private; 6480 if (conf) 6481 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6482 spin_unlock(&mddev->lock); 6483 return ret; 6484 } 6485 6486 static ssize_t 6487 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6488 { 6489 struct r5conf *conf; 6490 unsigned long new; 6491 int err; 6492 6493 if (len >= PAGE_SIZE) 6494 return -EINVAL; 6495 if (kstrtoul(page, 10, &new)) 6496 return -EINVAL; 6497 6498 err = mddev_lock(mddev); 6499 if (err) 6500 return err; 6501 conf = mddev->private; 6502 if (!conf) 6503 err = -ENODEV; 6504 else if (new > conf->min_nr_stripes) 6505 err = -EINVAL; 6506 else 6507 conf->bypass_threshold = new; 6508 mddev_unlock(mddev); 6509 return err ?: len; 6510 } 6511 6512 static struct md_sysfs_entry 6513 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6514 S_IRUGO | S_IWUSR, 6515 raid5_show_preread_threshold, 6516 raid5_store_preread_threshold); 6517 6518 static ssize_t 6519 raid5_show_skip_copy(struct mddev *mddev, char *page) 6520 { 6521 struct r5conf *conf; 6522 int ret = 0; 6523 spin_lock(&mddev->lock); 6524 conf = mddev->private; 6525 if (conf) 6526 ret = sprintf(page, "%d\n", conf->skip_copy); 6527 spin_unlock(&mddev->lock); 6528 return ret; 6529 } 6530 6531 static ssize_t 6532 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6533 { 6534 struct r5conf *conf; 6535 unsigned long new; 6536 int err; 6537 6538 if (len >= PAGE_SIZE) 6539 return -EINVAL; 6540 if (kstrtoul(page, 10, &new)) 6541 return -EINVAL; 6542 new = !!new; 6543 6544 err = mddev_lock(mddev); 6545 if (err) 6546 return err; 6547 conf = mddev->private; 6548 if (!conf) 6549 err = -ENODEV; 6550 else if (new != conf->skip_copy) { 6551 mddev_suspend(mddev); 6552 conf->skip_copy = new; 6553 if (new) 6554 mddev->queue->backing_dev_info->capabilities |= 6555 BDI_CAP_STABLE_WRITES; 6556 else 6557 mddev->queue->backing_dev_info->capabilities &= 6558 ~BDI_CAP_STABLE_WRITES; 6559 mddev_resume(mddev); 6560 } 6561 mddev_unlock(mddev); 6562 return err ?: len; 6563 } 6564 6565 static struct md_sysfs_entry 6566 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6567 raid5_show_skip_copy, 6568 raid5_store_skip_copy); 6569 6570 static ssize_t 6571 stripe_cache_active_show(struct mddev *mddev, char *page) 6572 { 6573 struct r5conf *conf = mddev->private; 6574 if (conf) 6575 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6576 else 6577 return 0; 6578 } 6579 6580 static struct md_sysfs_entry 6581 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6582 6583 static ssize_t 6584 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6585 { 6586 struct r5conf *conf; 6587 int ret = 0; 6588 spin_lock(&mddev->lock); 6589 conf = mddev->private; 6590 if (conf) 6591 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6592 spin_unlock(&mddev->lock); 6593 return ret; 6594 } 6595 6596 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6597 int *group_cnt, 6598 int *worker_cnt_per_group, 6599 struct r5worker_group **worker_groups); 6600 static ssize_t 6601 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6602 { 6603 struct r5conf *conf; 6604 unsigned long new; 6605 int err; 6606 struct r5worker_group *new_groups, *old_groups; 6607 int group_cnt, worker_cnt_per_group; 6608 6609 if (len >= PAGE_SIZE) 6610 return -EINVAL; 6611 if (kstrtoul(page, 10, &new)) 6612 return -EINVAL; 6613 6614 err = mddev_lock(mddev); 6615 if (err) 6616 return err; 6617 conf = mddev->private; 6618 if (!conf) 6619 err = -ENODEV; 6620 else if (new != conf->worker_cnt_per_group) { 6621 mddev_suspend(mddev); 6622 6623 old_groups = conf->worker_groups; 6624 if (old_groups) 6625 flush_workqueue(raid5_wq); 6626 6627 err = alloc_thread_groups(conf, new, 6628 &group_cnt, &worker_cnt_per_group, 6629 &new_groups); 6630 if (!err) { 6631 spin_lock_irq(&conf->device_lock); 6632 conf->group_cnt = group_cnt; 6633 conf->worker_cnt_per_group = worker_cnt_per_group; 6634 conf->worker_groups = new_groups; 6635 spin_unlock_irq(&conf->device_lock); 6636 6637 if (old_groups) 6638 kfree(old_groups[0].workers); 6639 kfree(old_groups); 6640 } 6641 mddev_resume(mddev); 6642 } 6643 mddev_unlock(mddev); 6644 6645 return err ?: len; 6646 } 6647 6648 static struct md_sysfs_entry 6649 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6650 raid5_show_group_thread_cnt, 6651 raid5_store_group_thread_cnt); 6652 6653 static struct attribute *raid5_attrs[] = { 6654 &raid5_stripecache_size.attr, 6655 &raid5_stripecache_active.attr, 6656 &raid5_preread_bypass_threshold.attr, 6657 &raid5_group_thread_cnt.attr, 6658 &raid5_skip_copy.attr, 6659 &raid5_rmw_level.attr, 6660 &r5c_journal_mode.attr, 6661 NULL, 6662 }; 6663 static struct attribute_group raid5_attrs_group = { 6664 .name = NULL, 6665 .attrs = raid5_attrs, 6666 }; 6667 6668 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6669 int *group_cnt, 6670 int *worker_cnt_per_group, 6671 struct r5worker_group **worker_groups) 6672 { 6673 int i, j, k; 6674 ssize_t size; 6675 struct r5worker *workers; 6676 6677 *worker_cnt_per_group = cnt; 6678 if (cnt == 0) { 6679 *group_cnt = 0; 6680 *worker_groups = NULL; 6681 return 0; 6682 } 6683 *group_cnt = num_possible_nodes(); 6684 size = sizeof(struct r5worker) * cnt; 6685 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6686 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6687 *group_cnt, GFP_NOIO); 6688 if (!*worker_groups || !workers) { 6689 kfree(workers); 6690 kfree(*worker_groups); 6691 return -ENOMEM; 6692 } 6693 6694 for (i = 0; i < *group_cnt; i++) { 6695 struct r5worker_group *group; 6696 6697 group = &(*worker_groups)[i]; 6698 INIT_LIST_HEAD(&group->handle_list); 6699 INIT_LIST_HEAD(&group->loprio_list); 6700 group->conf = conf; 6701 group->workers = workers + i * cnt; 6702 6703 for (j = 0; j < cnt; j++) { 6704 struct r5worker *worker = group->workers + j; 6705 worker->group = group; 6706 INIT_WORK(&worker->work, raid5_do_work); 6707 6708 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6709 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6710 } 6711 } 6712 6713 return 0; 6714 } 6715 6716 static void free_thread_groups(struct r5conf *conf) 6717 { 6718 if (conf->worker_groups) 6719 kfree(conf->worker_groups[0].workers); 6720 kfree(conf->worker_groups); 6721 conf->worker_groups = NULL; 6722 } 6723 6724 static sector_t 6725 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6726 { 6727 struct r5conf *conf = mddev->private; 6728 6729 if (!sectors) 6730 sectors = mddev->dev_sectors; 6731 if (!raid_disks) 6732 /* size is defined by the smallest of previous and new size */ 6733 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6734 6735 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6736 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6737 return sectors * (raid_disks - conf->max_degraded); 6738 } 6739 6740 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6741 { 6742 safe_put_page(percpu->spare_page); 6743 if (percpu->scribble) 6744 flex_array_free(percpu->scribble); 6745 percpu->spare_page = NULL; 6746 percpu->scribble = NULL; 6747 } 6748 6749 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6750 { 6751 if (conf->level == 6 && !percpu->spare_page) 6752 percpu->spare_page = alloc_page(GFP_KERNEL); 6753 if (!percpu->scribble) 6754 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6755 conf->previous_raid_disks), 6756 max(conf->chunk_sectors, 6757 conf->prev_chunk_sectors) 6758 / STRIPE_SECTORS, 6759 GFP_KERNEL); 6760 6761 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6762 free_scratch_buffer(conf, percpu); 6763 return -ENOMEM; 6764 } 6765 6766 return 0; 6767 } 6768 6769 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6770 { 6771 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6772 6773 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6774 return 0; 6775 } 6776 6777 static void raid5_free_percpu(struct r5conf *conf) 6778 { 6779 if (!conf->percpu) 6780 return; 6781 6782 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6783 free_percpu(conf->percpu); 6784 } 6785 6786 static void free_conf(struct r5conf *conf) 6787 { 6788 int i; 6789 6790 log_exit(conf); 6791 6792 if (conf->shrinker.nr_deferred) 6793 unregister_shrinker(&conf->shrinker); 6794 6795 free_thread_groups(conf); 6796 shrink_stripes(conf); 6797 raid5_free_percpu(conf); 6798 for (i = 0; i < conf->pool_size; i++) 6799 if (conf->disks[i].extra_page) 6800 put_page(conf->disks[i].extra_page); 6801 kfree(conf->disks); 6802 kfree(conf->stripe_hashtbl); 6803 kfree(conf->pending_data); 6804 kfree(conf); 6805 } 6806 6807 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6808 { 6809 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6810 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6811 6812 if (alloc_scratch_buffer(conf, percpu)) { 6813 pr_warn("%s: failed memory allocation for cpu%u\n", 6814 __func__, cpu); 6815 return -ENOMEM; 6816 } 6817 return 0; 6818 } 6819 6820 static int raid5_alloc_percpu(struct r5conf *conf) 6821 { 6822 int err = 0; 6823 6824 conf->percpu = alloc_percpu(struct raid5_percpu); 6825 if (!conf->percpu) 6826 return -ENOMEM; 6827 6828 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6829 if (!err) { 6830 conf->scribble_disks = max(conf->raid_disks, 6831 conf->previous_raid_disks); 6832 conf->scribble_sectors = max(conf->chunk_sectors, 6833 conf->prev_chunk_sectors); 6834 } 6835 return err; 6836 } 6837 6838 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6839 struct shrink_control *sc) 6840 { 6841 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6842 unsigned long ret = SHRINK_STOP; 6843 6844 if (mutex_trylock(&conf->cache_size_mutex)) { 6845 ret= 0; 6846 while (ret < sc->nr_to_scan && 6847 conf->max_nr_stripes > conf->min_nr_stripes) { 6848 if (drop_one_stripe(conf) == 0) { 6849 ret = SHRINK_STOP; 6850 break; 6851 } 6852 ret++; 6853 } 6854 mutex_unlock(&conf->cache_size_mutex); 6855 } 6856 return ret; 6857 } 6858 6859 static unsigned long raid5_cache_count(struct shrinker *shrink, 6860 struct shrink_control *sc) 6861 { 6862 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6863 6864 if (conf->max_nr_stripes < conf->min_nr_stripes) 6865 /* unlikely, but not impossible */ 6866 return 0; 6867 return conf->max_nr_stripes - conf->min_nr_stripes; 6868 } 6869 6870 static struct r5conf *setup_conf(struct mddev *mddev) 6871 { 6872 struct r5conf *conf; 6873 int raid_disk, memory, max_disks; 6874 struct md_rdev *rdev; 6875 struct disk_info *disk; 6876 char pers_name[6]; 6877 int i; 6878 int group_cnt, worker_cnt_per_group; 6879 struct r5worker_group *new_group; 6880 6881 if (mddev->new_level != 5 6882 && mddev->new_level != 4 6883 && mddev->new_level != 6) { 6884 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6885 mdname(mddev), mddev->new_level); 6886 return ERR_PTR(-EIO); 6887 } 6888 if ((mddev->new_level == 5 6889 && !algorithm_valid_raid5(mddev->new_layout)) || 6890 (mddev->new_level == 6 6891 && !algorithm_valid_raid6(mddev->new_layout))) { 6892 pr_warn("md/raid:%s: layout %d not supported\n", 6893 mdname(mddev), mddev->new_layout); 6894 return ERR_PTR(-EIO); 6895 } 6896 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6897 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6898 mdname(mddev), mddev->raid_disks); 6899 return ERR_PTR(-EINVAL); 6900 } 6901 6902 if (!mddev->new_chunk_sectors || 6903 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6904 !is_power_of_2(mddev->new_chunk_sectors)) { 6905 pr_warn("md/raid:%s: invalid chunk size %d\n", 6906 mdname(mddev), mddev->new_chunk_sectors << 9); 6907 return ERR_PTR(-EINVAL); 6908 } 6909 6910 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6911 if (conf == NULL) 6912 goto abort; 6913 INIT_LIST_HEAD(&conf->free_list); 6914 INIT_LIST_HEAD(&conf->pending_list); 6915 conf->pending_data = kzalloc(sizeof(struct r5pending_data) * 6916 PENDING_IO_MAX, GFP_KERNEL); 6917 if (!conf->pending_data) 6918 goto abort; 6919 for (i = 0; i < PENDING_IO_MAX; i++) 6920 list_add(&conf->pending_data[i].sibling, &conf->free_list); 6921 /* Don't enable multi-threading by default*/ 6922 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6923 &new_group)) { 6924 conf->group_cnt = group_cnt; 6925 conf->worker_cnt_per_group = worker_cnt_per_group; 6926 conf->worker_groups = new_group; 6927 } else 6928 goto abort; 6929 spin_lock_init(&conf->device_lock); 6930 seqcount_init(&conf->gen_lock); 6931 mutex_init(&conf->cache_size_mutex); 6932 init_waitqueue_head(&conf->wait_for_quiescent); 6933 init_waitqueue_head(&conf->wait_for_stripe); 6934 init_waitqueue_head(&conf->wait_for_overlap); 6935 INIT_LIST_HEAD(&conf->handle_list); 6936 INIT_LIST_HEAD(&conf->loprio_list); 6937 INIT_LIST_HEAD(&conf->hold_list); 6938 INIT_LIST_HEAD(&conf->delayed_list); 6939 INIT_LIST_HEAD(&conf->bitmap_list); 6940 bio_list_init(&conf->return_bi); 6941 init_llist_head(&conf->released_stripes); 6942 atomic_set(&conf->active_stripes, 0); 6943 atomic_set(&conf->preread_active_stripes, 0); 6944 atomic_set(&conf->active_aligned_reads, 0); 6945 spin_lock_init(&conf->pending_bios_lock); 6946 conf->batch_bio_dispatch = true; 6947 rdev_for_each(rdev, mddev) { 6948 if (test_bit(Journal, &rdev->flags)) 6949 continue; 6950 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6951 conf->batch_bio_dispatch = false; 6952 break; 6953 } 6954 } 6955 6956 conf->bypass_threshold = BYPASS_THRESHOLD; 6957 conf->recovery_disabled = mddev->recovery_disabled - 1; 6958 6959 conf->raid_disks = mddev->raid_disks; 6960 if (mddev->reshape_position == MaxSector) 6961 conf->previous_raid_disks = mddev->raid_disks; 6962 else 6963 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6964 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6965 6966 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6967 GFP_KERNEL); 6968 6969 if (!conf->disks) 6970 goto abort; 6971 6972 for (i = 0; i < max_disks; i++) { 6973 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6974 if (!conf->disks[i].extra_page) 6975 goto abort; 6976 } 6977 6978 conf->mddev = mddev; 6979 6980 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6981 goto abort; 6982 6983 /* We init hash_locks[0] separately to that it can be used 6984 * as the reference lock in the spin_lock_nest_lock() call 6985 * in lock_all_device_hash_locks_irq in order to convince 6986 * lockdep that we know what we are doing. 6987 */ 6988 spin_lock_init(conf->hash_locks); 6989 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6990 spin_lock_init(conf->hash_locks + i); 6991 6992 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6993 INIT_LIST_HEAD(conf->inactive_list + i); 6994 6995 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6996 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6997 6998 atomic_set(&conf->r5c_cached_full_stripes, 0); 6999 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 7000 atomic_set(&conf->r5c_cached_partial_stripes, 0); 7001 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 7002 atomic_set(&conf->r5c_flushing_full_stripes, 0); 7003 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 7004 7005 conf->level = mddev->new_level; 7006 conf->chunk_sectors = mddev->new_chunk_sectors; 7007 if (raid5_alloc_percpu(conf) != 0) 7008 goto abort; 7009 7010 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 7011 7012 rdev_for_each(rdev, mddev) { 7013 raid_disk = rdev->raid_disk; 7014 if (raid_disk >= max_disks 7015 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 7016 continue; 7017 disk = conf->disks + raid_disk; 7018 7019 if (test_bit(Replacement, &rdev->flags)) { 7020 if (disk->replacement) 7021 goto abort; 7022 disk->replacement = rdev; 7023 } else { 7024 if (disk->rdev) 7025 goto abort; 7026 disk->rdev = rdev; 7027 } 7028 7029 if (test_bit(In_sync, &rdev->flags)) { 7030 char b[BDEVNAME_SIZE]; 7031 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 7032 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 7033 } else if (rdev->saved_raid_disk != raid_disk) 7034 /* Cannot rely on bitmap to complete recovery */ 7035 conf->fullsync = 1; 7036 } 7037 7038 conf->level = mddev->new_level; 7039 if (conf->level == 6) { 7040 conf->max_degraded = 2; 7041 if (raid6_call.xor_syndrome) 7042 conf->rmw_level = PARITY_ENABLE_RMW; 7043 else 7044 conf->rmw_level = PARITY_DISABLE_RMW; 7045 } else { 7046 conf->max_degraded = 1; 7047 conf->rmw_level = PARITY_ENABLE_RMW; 7048 } 7049 conf->algorithm = mddev->new_layout; 7050 conf->reshape_progress = mddev->reshape_position; 7051 if (conf->reshape_progress != MaxSector) { 7052 conf->prev_chunk_sectors = mddev->chunk_sectors; 7053 conf->prev_algo = mddev->layout; 7054 } else { 7055 conf->prev_chunk_sectors = conf->chunk_sectors; 7056 conf->prev_algo = conf->algorithm; 7057 } 7058 7059 conf->min_nr_stripes = NR_STRIPES; 7060 if (mddev->reshape_position != MaxSector) { 7061 int stripes = max_t(int, 7062 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 7063 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 7064 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7065 if (conf->min_nr_stripes != NR_STRIPES) 7066 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7067 mdname(mddev), conf->min_nr_stripes); 7068 } 7069 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7070 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7071 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7072 if (grow_stripes(conf, conf->min_nr_stripes)) { 7073 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7074 mdname(mddev), memory); 7075 goto abort; 7076 } else 7077 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7078 /* 7079 * Losing a stripe head costs more than the time to refill it, 7080 * it reduces the queue depth and so can hurt throughput. 7081 * So set it rather large, scaled by number of devices. 7082 */ 7083 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7084 conf->shrinker.scan_objects = raid5_cache_scan; 7085 conf->shrinker.count_objects = raid5_cache_count; 7086 conf->shrinker.batch = 128; 7087 conf->shrinker.flags = 0; 7088 if (register_shrinker(&conf->shrinker)) { 7089 pr_warn("md/raid:%s: couldn't register shrinker.\n", 7090 mdname(mddev)); 7091 goto abort; 7092 } 7093 7094 sprintf(pers_name, "raid%d", mddev->new_level); 7095 conf->thread = md_register_thread(raid5d, mddev, pers_name); 7096 if (!conf->thread) { 7097 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7098 mdname(mddev)); 7099 goto abort; 7100 } 7101 7102 return conf; 7103 7104 abort: 7105 if (conf) { 7106 free_conf(conf); 7107 return ERR_PTR(-EIO); 7108 } else 7109 return ERR_PTR(-ENOMEM); 7110 } 7111 7112 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7113 { 7114 switch (algo) { 7115 case ALGORITHM_PARITY_0: 7116 if (raid_disk < max_degraded) 7117 return 1; 7118 break; 7119 case ALGORITHM_PARITY_N: 7120 if (raid_disk >= raid_disks - max_degraded) 7121 return 1; 7122 break; 7123 case ALGORITHM_PARITY_0_6: 7124 if (raid_disk == 0 || 7125 raid_disk == raid_disks - 1) 7126 return 1; 7127 break; 7128 case ALGORITHM_LEFT_ASYMMETRIC_6: 7129 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7130 case ALGORITHM_LEFT_SYMMETRIC_6: 7131 case ALGORITHM_RIGHT_SYMMETRIC_6: 7132 if (raid_disk == raid_disks - 1) 7133 return 1; 7134 } 7135 return 0; 7136 } 7137 7138 static int raid5_run(struct mddev *mddev) 7139 { 7140 struct r5conf *conf; 7141 int working_disks = 0; 7142 int dirty_parity_disks = 0; 7143 struct md_rdev *rdev; 7144 struct md_rdev *journal_dev = NULL; 7145 sector_t reshape_offset = 0; 7146 int i; 7147 long long min_offset_diff = 0; 7148 int first = 1; 7149 7150 if (mddev->recovery_cp != MaxSector) 7151 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7152 mdname(mddev)); 7153 7154 rdev_for_each(rdev, mddev) { 7155 long long diff; 7156 7157 if (test_bit(Journal, &rdev->flags)) { 7158 journal_dev = rdev; 7159 continue; 7160 } 7161 if (rdev->raid_disk < 0) 7162 continue; 7163 diff = (rdev->new_data_offset - rdev->data_offset); 7164 if (first) { 7165 min_offset_diff = diff; 7166 first = 0; 7167 } else if (mddev->reshape_backwards && 7168 diff < min_offset_diff) 7169 min_offset_diff = diff; 7170 else if (!mddev->reshape_backwards && 7171 diff > min_offset_diff) 7172 min_offset_diff = diff; 7173 } 7174 7175 if (mddev->reshape_position != MaxSector) { 7176 /* Check that we can continue the reshape. 7177 * Difficulties arise if the stripe we would write to 7178 * next is at or after the stripe we would read from next. 7179 * For a reshape that changes the number of devices, this 7180 * is only possible for a very short time, and mdadm makes 7181 * sure that time appears to have past before assembling 7182 * the array. So we fail if that time hasn't passed. 7183 * For a reshape that keeps the number of devices the same 7184 * mdadm must be monitoring the reshape can keeping the 7185 * critical areas read-only and backed up. It will start 7186 * the array in read-only mode, so we check for that. 7187 */ 7188 sector_t here_new, here_old; 7189 int old_disks; 7190 int max_degraded = (mddev->level == 6 ? 2 : 1); 7191 int chunk_sectors; 7192 int new_data_disks; 7193 7194 if (journal_dev) { 7195 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7196 mdname(mddev)); 7197 return -EINVAL; 7198 } 7199 7200 if (mddev->new_level != mddev->level) { 7201 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7202 mdname(mddev)); 7203 return -EINVAL; 7204 } 7205 old_disks = mddev->raid_disks - mddev->delta_disks; 7206 /* reshape_position must be on a new-stripe boundary, and one 7207 * further up in new geometry must map after here in old 7208 * geometry. 7209 * If the chunk sizes are different, then as we perform reshape 7210 * in units of the largest of the two, reshape_position needs 7211 * be a multiple of the largest chunk size times new data disks. 7212 */ 7213 here_new = mddev->reshape_position; 7214 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7215 new_data_disks = mddev->raid_disks - max_degraded; 7216 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7217 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7218 mdname(mddev)); 7219 return -EINVAL; 7220 } 7221 reshape_offset = here_new * chunk_sectors; 7222 /* here_new is the stripe we will write to */ 7223 here_old = mddev->reshape_position; 7224 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7225 /* here_old is the first stripe that we might need to read 7226 * from */ 7227 if (mddev->delta_disks == 0) { 7228 /* We cannot be sure it is safe to start an in-place 7229 * reshape. It is only safe if user-space is monitoring 7230 * and taking constant backups. 7231 * mdadm always starts a situation like this in 7232 * readonly mode so it can take control before 7233 * allowing any writes. So just check for that. 7234 */ 7235 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7236 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7237 /* not really in-place - so OK */; 7238 else if (mddev->ro == 0) { 7239 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7240 mdname(mddev)); 7241 return -EINVAL; 7242 } 7243 } else if (mddev->reshape_backwards 7244 ? (here_new * chunk_sectors + min_offset_diff <= 7245 here_old * chunk_sectors) 7246 : (here_new * chunk_sectors >= 7247 here_old * chunk_sectors + (-min_offset_diff))) { 7248 /* Reading from the same stripe as writing to - bad */ 7249 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7250 mdname(mddev)); 7251 return -EINVAL; 7252 } 7253 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7254 /* OK, we should be able to continue; */ 7255 } else { 7256 BUG_ON(mddev->level != mddev->new_level); 7257 BUG_ON(mddev->layout != mddev->new_layout); 7258 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7259 BUG_ON(mddev->delta_disks != 0); 7260 } 7261 7262 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7263 test_bit(MD_HAS_PPL, &mddev->flags)) { 7264 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7265 mdname(mddev)); 7266 clear_bit(MD_HAS_PPL, &mddev->flags); 7267 } 7268 7269 if (mddev->private == NULL) 7270 conf = setup_conf(mddev); 7271 else 7272 conf = mddev->private; 7273 7274 if (IS_ERR(conf)) 7275 return PTR_ERR(conf); 7276 7277 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7278 if (!journal_dev) { 7279 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7280 mdname(mddev)); 7281 mddev->ro = 1; 7282 set_disk_ro(mddev->gendisk, 1); 7283 } else if (mddev->recovery_cp == MaxSector) 7284 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7285 } 7286 7287 conf->min_offset_diff = min_offset_diff; 7288 mddev->thread = conf->thread; 7289 conf->thread = NULL; 7290 mddev->private = conf; 7291 7292 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7293 i++) { 7294 rdev = conf->disks[i].rdev; 7295 if (!rdev && conf->disks[i].replacement) { 7296 /* The replacement is all we have yet */ 7297 rdev = conf->disks[i].replacement; 7298 conf->disks[i].replacement = NULL; 7299 clear_bit(Replacement, &rdev->flags); 7300 conf->disks[i].rdev = rdev; 7301 } 7302 if (!rdev) 7303 continue; 7304 if (conf->disks[i].replacement && 7305 conf->reshape_progress != MaxSector) { 7306 /* replacements and reshape simply do not mix. */ 7307 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7308 goto abort; 7309 } 7310 if (test_bit(In_sync, &rdev->flags)) { 7311 working_disks++; 7312 continue; 7313 } 7314 /* This disc is not fully in-sync. However if it 7315 * just stored parity (beyond the recovery_offset), 7316 * when we don't need to be concerned about the 7317 * array being dirty. 7318 * When reshape goes 'backwards', we never have 7319 * partially completed devices, so we only need 7320 * to worry about reshape going forwards. 7321 */ 7322 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7323 if (mddev->major_version == 0 && 7324 mddev->minor_version > 90) 7325 rdev->recovery_offset = reshape_offset; 7326 7327 if (rdev->recovery_offset < reshape_offset) { 7328 /* We need to check old and new layout */ 7329 if (!only_parity(rdev->raid_disk, 7330 conf->algorithm, 7331 conf->raid_disks, 7332 conf->max_degraded)) 7333 continue; 7334 } 7335 if (!only_parity(rdev->raid_disk, 7336 conf->prev_algo, 7337 conf->previous_raid_disks, 7338 conf->max_degraded)) 7339 continue; 7340 dirty_parity_disks++; 7341 } 7342 7343 /* 7344 * 0 for a fully functional array, 1 or 2 for a degraded array. 7345 */ 7346 mddev->degraded = raid5_calc_degraded(conf); 7347 7348 if (has_failed(conf)) { 7349 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7350 mdname(mddev), mddev->degraded, conf->raid_disks); 7351 goto abort; 7352 } 7353 7354 /* device size must be a multiple of chunk size */ 7355 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7356 mddev->resync_max_sectors = mddev->dev_sectors; 7357 7358 if (mddev->degraded > dirty_parity_disks && 7359 mddev->recovery_cp != MaxSector) { 7360 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7361 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7362 mdname(mddev)); 7363 else if (mddev->ok_start_degraded) 7364 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7365 mdname(mddev)); 7366 else { 7367 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7368 mdname(mddev)); 7369 goto abort; 7370 } 7371 } 7372 7373 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7374 mdname(mddev), conf->level, 7375 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7376 mddev->new_layout); 7377 7378 print_raid5_conf(conf); 7379 7380 if (conf->reshape_progress != MaxSector) { 7381 conf->reshape_safe = conf->reshape_progress; 7382 atomic_set(&conf->reshape_stripes, 0); 7383 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7384 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7385 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7386 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7387 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7388 "reshape"); 7389 } 7390 7391 /* Ok, everything is just fine now */ 7392 if (mddev->to_remove == &raid5_attrs_group) 7393 mddev->to_remove = NULL; 7394 else if (mddev->kobj.sd && 7395 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7396 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7397 mdname(mddev)); 7398 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7399 7400 if (mddev->queue) { 7401 int chunk_size; 7402 bool discard_supported = true; 7403 /* read-ahead size must cover two whole stripes, which 7404 * is 2 * (datadisks) * chunksize where 'n' is the 7405 * number of raid devices 7406 */ 7407 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7408 int stripe = data_disks * 7409 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7410 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7411 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7412 7413 chunk_size = mddev->chunk_sectors << 9; 7414 blk_queue_io_min(mddev->queue, chunk_size); 7415 blk_queue_io_opt(mddev->queue, chunk_size * 7416 (conf->raid_disks - conf->max_degraded)); 7417 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7418 /* 7419 * We can only discard a whole stripe. It doesn't make sense to 7420 * discard data disk but write parity disk 7421 */ 7422 stripe = stripe * PAGE_SIZE; 7423 /* Round up to power of 2, as discard handling 7424 * currently assumes that */ 7425 while ((stripe-1) & stripe) 7426 stripe = (stripe | (stripe-1)) + 1; 7427 mddev->queue->limits.discard_alignment = stripe; 7428 mddev->queue->limits.discard_granularity = stripe; 7429 7430 /* 7431 * We use 16-bit counter of active stripes in bi_phys_segments 7432 * (minus one for over-loaded initialization) 7433 */ 7434 blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); 7435 blk_queue_max_discard_sectors(mddev->queue, 7436 0xfffe * STRIPE_SECTORS); 7437 7438 /* 7439 * unaligned part of discard request will be ignored, so can't 7440 * guarantee discard_zeroes_data 7441 */ 7442 mddev->queue->limits.discard_zeroes_data = 0; 7443 7444 blk_queue_max_write_same_sectors(mddev->queue, 0); 7445 7446 rdev_for_each(rdev, mddev) { 7447 disk_stack_limits(mddev->gendisk, rdev->bdev, 7448 rdev->data_offset << 9); 7449 disk_stack_limits(mddev->gendisk, rdev->bdev, 7450 rdev->new_data_offset << 9); 7451 /* 7452 * discard_zeroes_data is required, otherwise data 7453 * could be lost. Consider a scenario: discard a stripe 7454 * (the stripe could be inconsistent if 7455 * discard_zeroes_data is 0); write one disk of the 7456 * stripe (the stripe could be inconsistent again 7457 * depending on which disks are used to calculate 7458 * parity); the disk is broken; The stripe data of this 7459 * disk is lost. 7460 */ 7461 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 7462 !bdev_get_queue(rdev->bdev)-> 7463 limits.discard_zeroes_data) 7464 discard_supported = false; 7465 /* Unfortunately, discard_zeroes_data is not currently 7466 * a guarantee - just a hint. So we only allow DISCARD 7467 * if the sysadmin has confirmed that only safe devices 7468 * are in use by setting a module parameter. 7469 */ 7470 if (!devices_handle_discard_safely) { 7471 if (discard_supported) { 7472 pr_info("md/raid456: discard support disabled due to uncertainty.\n"); 7473 pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); 7474 } 7475 discard_supported = false; 7476 } 7477 } 7478 7479 if (discard_supported && 7480 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7481 mddev->queue->limits.discard_granularity >= stripe) 7482 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7483 mddev->queue); 7484 else 7485 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7486 mddev->queue); 7487 7488 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7489 } 7490 7491 if (log_init(conf, journal_dev)) 7492 goto abort; 7493 7494 return 0; 7495 abort: 7496 md_unregister_thread(&mddev->thread); 7497 print_raid5_conf(conf); 7498 free_conf(conf); 7499 mddev->private = NULL; 7500 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7501 return -EIO; 7502 } 7503 7504 static void raid5_free(struct mddev *mddev, void *priv) 7505 { 7506 struct r5conf *conf = priv; 7507 7508 free_conf(conf); 7509 mddev->to_remove = &raid5_attrs_group; 7510 } 7511 7512 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7513 { 7514 struct r5conf *conf = mddev->private; 7515 int i; 7516 7517 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7518 conf->chunk_sectors / 2, mddev->layout); 7519 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7520 rcu_read_lock(); 7521 for (i = 0; i < conf->raid_disks; i++) { 7522 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7523 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 7524 } 7525 rcu_read_unlock(); 7526 seq_printf (seq, "]"); 7527 } 7528 7529 static void print_raid5_conf (struct r5conf *conf) 7530 { 7531 int i; 7532 struct disk_info *tmp; 7533 7534 pr_debug("RAID conf printout:\n"); 7535 if (!conf) { 7536 pr_debug("(conf==NULL)\n"); 7537 return; 7538 } 7539 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7540 conf->raid_disks, 7541 conf->raid_disks - conf->mddev->degraded); 7542 7543 for (i = 0; i < conf->raid_disks; i++) { 7544 char b[BDEVNAME_SIZE]; 7545 tmp = conf->disks + i; 7546 if (tmp->rdev) 7547 pr_debug(" disk %d, o:%d, dev:%s\n", 7548 i, !test_bit(Faulty, &tmp->rdev->flags), 7549 bdevname(tmp->rdev->bdev, b)); 7550 } 7551 } 7552 7553 static int raid5_spare_active(struct mddev *mddev) 7554 { 7555 int i; 7556 struct r5conf *conf = mddev->private; 7557 struct disk_info *tmp; 7558 int count = 0; 7559 unsigned long flags; 7560 7561 for (i = 0; i < conf->raid_disks; i++) { 7562 tmp = conf->disks + i; 7563 if (tmp->replacement 7564 && tmp->replacement->recovery_offset == MaxSector 7565 && !test_bit(Faulty, &tmp->replacement->flags) 7566 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7567 /* Replacement has just become active. */ 7568 if (!tmp->rdev 7569 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7570 count++; 7571 if (tmp->rdev) { 7572 /* Replaced device not technically faulty, 7573 * but we need to be sure it gets removed 7574 * and never re-added. 7575 */ 7576 set_bit(Faulty, &tmp->rdev->flags); 7577 sysfs_notify_dirent_safe( 7578 tmp->rdev->sysfs_state); 7579 } 7580 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7581 } else if (tmp->rdev 7582 && tmp->rdev->recovery_offset == MaxSector 7583 && !test_bit(Faulty, &tmp->rdev->flags) 7584 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7585 count++; 7586 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7587 } 7588 } 7589 spin_lock_irqsave(&conf->device_lock, flags); 7590 mddev->degraded = raid5_calc_degraded(conf); 7591 spin_unlock_irqrestore(&conf->device_lock, flags); 7592 print_raid5_conf(conf); 7593 return count; 7594 } 7595 7596 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7597 { 7598 struct r5conf *conf = mddev->private; 7599 int err = 0; 7600 int number = rdev->raid_disk; 7601 struct md_rdev **rdevp; 7602 struct disk_info *p = conf->disks + number; 7603 7604 print_raid5_conf(conf); 7605 if (test_bit(Journal, &rdev->flags) && conf->log) { 7606 /* 7607 * we can't wait pending write here, as this is called in 7608 * raid5d, wait will deadlock. 7609 */ 7610 if (atomic_read(&mddev->writes_pending)) 7611 return -EBUSY; 7612 log_exit(conf); 7613 return 0; 7614 } 7615 if (rdev == p->rdev) 7616 rdevp = &p->rdev; 7617 else if (rdev == p->replacement) 7618 rdevp = &p->replacement; 7619 else 7620 return 0; 7621 7622 if (number >= conf->raid_disks && 7623 conf->reshape_progress == MaxSector) 7624 clear_bit(In_sync, &rdev->flags); 7625 7626 if (test_bit(In_sync, &rdev->flags) || 7627 atomic_read(&rdev->nr_pending)) { 7628 err = -EBUSY; 7629 goto abort; 7630 } 7631 /* Only remove non-faulty devices if recovery 7632 * isn't possible. 7633 */ 7634 if (!test_bit(Faulty, &rdev->flags) && 7635 mddev->recovery_disabled != conf->recovery_disabled && 7636 !has_failed(conf) && 7637 (!p->replacement || p->replacement == rdev) && 7638 number < conf->raid_disks) { 7639 err = -EBUSY; 7640 goto abort; 7641 } 7642 *rdevp = NULL; 7643 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7644 synchronize_rcu(); 7645 if (atomic_read(&rdev->nr_pending)) { 7646 /* lost the race, try later */ 7647 err = -EBUSY; 7648 *rdevp = rdev; 7649 } 7650 } 7651 if (!err) { 7652 err = log_modify(conf, rdev, false); 7653 if (err) 7654 goto abort; 7655 } 7656 if (p->replacement) { 7657 /* We must have just cleared 'rdev' */ 7658 p->rdev = p->replacement; 7659 clear_bit(Replacement, &p->replacement->flags); 7660 smp_mb(); /* Make sure other CPUs may see both as identical 7661 * but will never see neither - if they are careful 7662 */ 7663 p->replacement = NULL; 7664 clear_bit(WantReplacement, &rdev->flags); 7665 7666 if (!err) 7667 err = log_modify(conf, p->rdev, true); 7668 } else 7669 /* We might have just removed the Replacement as faulty- 7670 * clear the bit just in case 7671 */ 7672 clear_bit(WantReplacement, &rdev->flags); 7673 abort: 7674 7675 print_raid5_conf(conf); 7676 return err; 7677 } 7678 7679 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7680 { 7681 struct r5conf *conf = mddev->private; 7682 int err = -EEXIST; 7683 int disk; 7684 struct disk_info *p; 7685 int first = 0; 7686 int last = conf->raid_disks - 1; 7687 7688 if (test_bit(Journal, &rdev->flags)) { 7689 if (conf->log) 7690 return -EBUSY; 7691 7692 rdev->raid_disk = 0; 7693 /* 7694 * The array is in readonly mode if journal is missing, so no 7695 * write requests running. We should be safe 7696 */ 7697 log_init(conf, rdev); 7698 return 0; 7699 } 7700 if (mddev->recovery_disabled == conf->recovery_disabled) 7701 return -EBUSY; 7702 7703 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7704 /* no point adding a device */ 7705 return -EINVAL; 7706 7707 if (rdev->raid_disk >= 0) 7708 first = last = rdev->raid_disk; 7709 7710 /* 7711 * find the disk ... but prefer rdev->saved_raid_disk 7712 * if possible. 7713 */ 7714 if (rdev->saved_raid_disk >= 0 && 7715 rdev->saved_raid_disk >= first && 7716 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7717 first = rdev->saved_raid_disk; 7718 7719 for (disk = first; disk <= last; disk++) { 7720 p = conf->disks + disk; 7721 if (p->rdev == NULL) { 7722 clear_bit(In_sync, &rdev->flags); 7723 rdev->raid_disk = disk; 7724 if (rdev->saved_raid_disk != disk) 7725 conf->fullsync = 1; 7726 rcu_assign_pointer(p->rdev, rdev); 7727 7728 err = log_modify(conf, rdev, true); 7729 7730 goto out; 7731 } 7732 } 7733 for (disk = first; disk <= last; disk++) { 7734 p = conf->disks + disk; 7735 if (test_bit(WantReplacement, &p->rdev->flags) && 7736 p->replacement == NULL) { 7737 clear_bit(In_sync, &rdev->flags); 7738 set_bit(Replacement, &rdev->flags); 7739 rdev->raid_disk = disk; 7740 err = 0; 7741 conf->fullsync = 1; 7742 rcu_assign_pointer(p->replacement, rdev); 7743 break; 7744 } 7745 } 7746 out: 7747 print_raid5_conf(conf); 7748 return err; 7749 } 7750 7751 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7752 { 7753 /* no resync is happening, and there is enough space 7754 * on all devices, so we can resize. 7755 * We need to make sure resync covers any new space. 7756 * If the array is shrinking we should possibly wait until 7757 * any io in the removed space completes, but it hardly seems 7758 * worth it. 7759 */ 7760 sector_t newsize; 7761 struct r5conf *conf = mddev->private; 7762 7763 if (conf->log || raid5_has_ppl(conf)) 7764 return -EINVAL; 7765 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7766 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7767 if (mddev->external_size && 7768 mddev->array_sectors > newsize) 7769 return -EINVAL; 7770 if (mddev->bitmap) { 7771 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 7772 if (ret) 7773 return ret; 7774 } 7775 md_set_array_sectors(mddev, newsize); 7776 if (sectors > mddev->dev_sectors && 7777 mddev->recovery_cp > mddev->dev_sectors) { 7778 mddev->recovery_cp = mddev->dev_sectors; 7779 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7780 } 7781 mddev->dev_sectors = sectors; 7782 mddev->resync_max_sectors = sectors; 7783 return 0; 7784 } 7785 7786 static int check_stripe_cache(struct mddev *mddev) 7787 { 7788 /* Can only proceed if there are plenty of stripe_heads. 7789 * We need a minimum of one full stripe,, and for sensible progress 7790 * it is best to have about 4 times that. 7791 * If we require 4 times, then the default 256 4K stripe_heads will 7792 * allow for chunk sizes up to 256K, which is probably OK. 7793 * If the chunk size is greater, user-space should request more 7794 * stripe_heads first. 7795 */ 7796 struct r5conf *conf = mddev->private; 7797 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7798 > conf->min_nr_stripes || 7799 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7800 > conf->min_nr_stripes) { 7801 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7802 mdname(mddev), 7803 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7804 / STRIPE_SIZE)*4); 7805 return 0; 7806 } 7807 return 1; 7808 } 7809 7810 static int check_reshape(struct mddev *mddev) 7811 { 7812 struct r5conf *conf = mddev->private; 7813 7814 if (conf->log || raid5_has_ppl(conf)) 7815 return -EINVAL; 7816 if (mddev->delta_disks == 0 && 7817 mddev->new_layout == mddev->layout && 7818 mddev->new_chunk_sectors == mddev->chunk_sectors) 7819 return 0; /* nothing to do */ 7820 if (has_failed(conf)) 7821 return -EINVAL; 7822 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7823 /* We might be able to shrink, but the devices must 7824 * be made bigger first. 7825 * For raid6, 4 is the minimum size. 7826 * Otherwise 2 is the minimum 7827 */ 7828 int min = 2; 7829 if (mddev->level == 6) 7830 min = 4; 7831 if (mddev->raid_disks + mddev->delta_disks < min) 7832 return -EINVAL; 7833 } 7834 7835 if (!check_stripe_cache(mddev)) 7836 return -ENOSPC; 7837 7838 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7839 mddev->delta_disks > 0) 7840 if (resize_chunks(conf, 7841 conf->previous_raid_disks 7842 + max(0, mddev->delta_disks), 7843 max(mddev->new_chunk_sectors, 7844 mddev->chunk_sectors) 7845 ) < 0) 7846 return -ENOMEM; 7847 return resize_stripes(conf, (conf->previous_raid_disks 7848 + mddev->delta_disks)); 7849 } 7850 7851 static int raid5_start_reshape(struct mddev *mddev) 7852 { 7853 struct r5conf *conf = mddev->private; 7854 struct md_rdev *rdev; 7855 int spares = 0; 7856 unsigned long flags; 7857 7858 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7859 return -EBUSY; 7860 7861 if (!check_stripe_cache(mddev)) 7862 return -ENOSPC; 7863 7864 if (has_failed(conf)) 7865 return -EINVAL; 7866 7867 rdev_for_each(rdev, mddev) { 7868 if (!test_bit(In_sync, &rdev->flags) 7869 && !test_bit(Faulty, &rdev->flags)) 7870 spares++; 7871 } 7872 7873 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7874 /* Not enough devices even to make a degraded array 7875 * of that size 7876 */ 7877 return -EINVAL; 7878 7879 /* Refuse to reduce size of the array. Any reductions in 7880 * array size must be through explicit setting of array_size 7881 * attribute. 7882 */ 7883 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7884 < mddev->array_sectors) { 7885 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 7886 mdname(mddev)); 7887 return -EINVAL; 7888 } 7889 7890 atomic_set(&conf->reshape_stripes, 0); 7891 spin_lock_irq(&conf->device_lock); 7892 write_seqcount_begin(&conf->gen_lock); 7893 conf->previous_raid_disks = conf->raid_disks; 7894 conf->raid_disks += mddev->delta_disks; 7895 conf->prev_chunk_sectors = conf->chunk_sectors; 7896 conf->chunk_sectors = mddev->new_chunk_sectors; 7897 conf->prev_algo = conf->algorithm; 7898 conf->algorithm = mddev->new_layout; 7899 conf->generation++; 7900 /* Code that selects data_offset needs to see the generation update 7901 * if reshape_progress has been set - so a memory barrier needed. 7902 */ 7903 smp_mb(); 7904 if (mddev->reshape_backwards) 7905 conf->reshape_progress = raid5_size(mddev, 0, 0); 7906 else 7907 conf->reshape_progress = 0; 7908 conf->reshape_safe = conf->reshape_progress; 7909 write_seqcount_end(&conf->gen_lock); 7910 spin_unlock_irq(&conf->device_lock); 7911 7912 /* Now make sure any requests that proceeded on the assumption 7913 * the reshape wasn't running - like Discard or Read - have 7914 * completed. 7915 */ 7916 mddev_suspend(mddev); 7917 mddev_resume(mddev); 7918 7919 /* Add some new drives, as many as will fit. 7920 * We know there are enough to make the newly sized array work. 7921 * Don't add devices if we are reducing the number of 7922 * devices in the array. This is because it is not possible 7923 * to correctly record the "partially reconstructed" state of 7924 * such devices during the reshape and confusion could result. 7925 */ 7926 if (mddev->delta_disks >= 0) { 7927 rdev_for_each(rdev, mddev) 7928 if (rdev->raid_disk < 0 && 7929 !test_bit(Faulty, &rdev->flags)) { 7930 if (raid5_add_disk(mddev, rdev) == 0) { 7931 if (rdev->raid_disk 7932 >= conf->previous_raid_disks) 7933 set_bit(In_sync, &rdev->flags); 7934 else 7935 rdev->recovery_offset = 0; 7936 7937 if (sysfs_link_rdev(mddev, rdev)) 7938 /* Failure here is OK */; 7939 } 7940 } else if (rdev->raid_disk >= conf->previous_raid_disks 7941 && !test_bit(Faulty, &rdev->flags)) { 7942 /* This is a spare that was manually added */ 7943 set_bit(In_sync, &rdev->flags); 7944 } 7945 7946 /* When a reshape changes the number of devices, 7947 * ->degraded is measured against the larger of the 7948 * pre and post number of devices. 7949 */ 7950 spin_lock_irqsave(&conf->device_lock, flags); 7951 mddev->degraded = raid5_calc_degraded(conf); 7952 spin_unlock_irqrestore(&conf->device_lock, flags); 7953 } 7954 mddev->raid_disks = conf->raid_disks; 7955 mddev->reshape_position = conf->reshape_progress; 7956 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7957 7958 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7959 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7960 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7961 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7962 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7963 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7964 "reshape"); 7965 if (!mddev->sync_thread) { 7966 mddev->recovery = 0; 7967 spin_lock_irq(&conf->device_lock); 7968 write_seqcount_begin(&conf->gen_lock); 7969 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 7970 mddev->new_chunk_sectors = 7971 conf->chunk_sectors = conf->prev_chunk_sectors; 7972 mddev->new_layout = conf->algorithm = conf->prev_algo; 7973 rdev_for_each(rdev, mddev) 7974 rdev->new_data_offset = rdev->data_offset; 7975 smp_wmb(); 7976 conf->generation --; 7977 conf->reshape_progress = MaxSector; 7978 mddev->reshape_position = MaxSector; 7979 write_seqcount_end(&conf->gen_lock); 7980 spin_unlock_irq(&conf->device_lock); 7981 return -EAGAIN; 7982 } 7983 conf->reshape_checkpoint = jiffies; 7984 md_wakeup_thread(mddev->sync_thread); 7985 md_new_event(mddev); 7986 return 0; 7987 } 7988 7989 /* This is called from the reshape thread and should make any 7990 * changes needed in 'conf' 7991 */ 7992 static void end_reshape(struct r5conf *conf) 7993 { 7994 7995 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 7996 struct md_rdev *rdev; 7997 7998 spin_lock_irq(&conf->device_lock); 7999 conf->previous_raid_disks = conf->raid_disks; 8000 rdev_for_each(rdev, conf->mddev) 8001 rdev->data_offset = rdev->new_data_offset; 8002 smp_wmb(); 8003 conf->reshape_progress = MaxSector; 8004 conf->mddev->reshape_position = MaxSector; 8005 spin_unlock_irq(&conf->device_lock); 8006 wake_up(&conf->wait_for_overlap); 8007 8008 /* read-ahead size must cover two whole stripes, which is 8009 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 8010 */ 8011 if (conf->mddev->queue) { 8012 int data_disks = conf->raid_disks - conf->max_degraded; 8013 int stripe = data_disks * ((conf->chunk_sectors << 9) 8014 / PAGE_SIZE); 8015 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 8016 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 8017 } 8018 } 8019 } 8020 8021 /* This is called from the raid5d thread with mddev_lock held. 8022 * It makes config changes to the device. 8023 */ 8024 static void raid5_finish_reshape(struct mddev *mddev) 8025 { 8026 struct r5conf *conf = mddev->private; 8027 8028 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8029 8030 if (mddev->delta_disks > 0) { 8031 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 8032 if (mddev->queue) { 8033 set_capacity(mddev->gendisk, mddev->array_sectors); 8034 revalidate_disk(mddev->gendisk); 8035 } 8036 } else { 8037 int d; 8038 spin_lock_irq(&conf->device_lock); 8039 mddev->degraded = raid5_calc_degraded(conf); 8040 spin_unlock_irq(&conf->device_lock); 8041 for (d = conf->raid_disks ; 8042 d < conf->raid_disks - mddev->delta_disks; 8043 d++) { 8044 struct md_rdev *rdev = conf->disks[d].rdev; 8045 if (rdev) 8046 clear_bit(In_sync, &rdev->flags); 8047 rdev = conf->disks[d].replacement; 8048 if (rdev) 8049 clear_bit(In_sync, &rdev->flags); 8050 } 8051 } 8052 mddev->layout = conf->algorithm; 8053 mddev->chunk_sectors = conf->chunk_sectors; 8054 mddev->reshape_position = MaxSector; 8055 mddev->delta_disks = 0; 8056 mddev->reshape_backwards = 0; 8057 } 8058 } 8059 8060 static void raid5_quiesce(struct mddev *mddev, int state) 8061 { 8062 struct r5conf *conf = mddev->private; 8063 8064 switch(state) { 8065 case 2: /* resume for a suspend */ 8066 wake_up(&conf->wait_for_overlap); 8067 break; 8068 8069 case 1: /* stop all writes */ 8070 lock_all_device_hash_locks_irq(conf); 8071 /* '2' tells resync/reshape to pause so that all 8072 * active stripes can drain 8073 */ 8074 r5c_flush_cache(conf, INT_MAX); 8075 conf->quiesce = 2; 8076 wait_event_cmd(conf->wait_for_quiescent, 8077 atomic_read(&conf->active_stripes) == 0 && 8078 atomic_read(&conf->active_aligned_reads) == 0, 8079 unlock_all_device_hash_locks_irq(conf), 8080 lock_all_device_hash_locks_irq(conf)); 8081 conf->quiesce = 1; 8082 unlock_all_device_hash_locks_irq(conf); 8083 /* allow reshape to continue */ 8084 wake_up(&conf->wait_for_overlap); 8085 break; 8086 8087 case 0: /* re-enable writes */ 8088 lock_all_device_hash_locks_irq(conf); 8089 conf->quiesce = 0; 8090 wake_up(&conf->wait_for_quiescent); 8091 wake_up(&conf->wait_for_overlap); 8092 unlock_all_device_hash_locks_irq(conf); 8093 break; 8094 } 8095 r5l_quiesce(conf->log, state); 8096 } 8097 8098 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8099 { 8100 struct r0conf *raid0_conf = mddev->private; 8101 sector_t sectors; 8102 8103 /* for raid0 takeover only one zone is supported */ 8104 if (raid0_conf->nr_strip_zones > 1) { 8105 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8106 mdname(mddev)); 8107 return ERR_PTR(-EINVAL); 8108 } 8109 8110 sectors = raid0_conf->strip_zone[0].zone_end; 8111 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8112 mddev->dev_sectors = sectors; 8113 mddev->new_level = level; 8114 mddev->new_layout = ALGORITHM_PARITY_N; 8115 mddev->new_chunk_sectors = mddev->chunk_sectors; 8116 mddev->raid_disks += 1; 8117 mddev->delta_disks = 1; 8118 /* make sure it will be not marked as dirty */ 8119 mddev->recovery_cp = MaxSector; 8120 8121 return setup_conf(mddev); 8122 } 8123 8124 static void *raid5_takeover_raid1(struct mddev *mddev) 8125 { 8126 int chunksect; 8127 void *ret; 8128 8129 if (mddev->raid_disks != 2 || 8130 mddev->degraded > 1) 8131 return ERR_PTR(-EINVAL); 8132 8133 /* Should check if there are write-behind devices? */ 8134 8135 chunksect = 64*2; /* 64K by default */ 8136 8137 /* The array must be an exact multiple of chunksize */ 8138 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8139 chunksect >>= 1; 8140 8141 if ((chunksect<<9) < STRIPE_SIZE) 8142 /* array size does not allow a suitable chunk size */ 8143 return ERR_PTR(-EINVAL); 8144 8145 mddev->new_level = 5; 8146 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8147 mddev->new_chunk_sectors = chunksect; 8148 8149 ret = setup_conf(mddev); 8150 if (!IS_ERR(ret)) 8151 mddev_clear_unsupported_flags(mddev, 8152 UNSUPPORTED_MDDEV_FLAGS); 8153 return ret; 8154 } 8155 8156 static void *raid5_takeover_raid6(struct mddev *mddev) 8157 { 8158 int new_layout; 8159 8160 switch (mddev->layout) { 8161 case ALGORITHM_LEFT_ASYMMETRIC_6: 8162 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8163 break; 8164 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8165 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8166 break; 8167 case ALGORITHM_LEFT_SYMMETRIC_6: 8168 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8169 break; 8170 case ALGORITHM_RIGHT_SYMMETRIC_6: 8171 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8172 break; 8173 case ALGORITHM_PARITY_0_6: 8174 new_layout = ALGORITHM_PARITY_0; 8175 break; 8176 case ALGORITHM_PARITY_N: 8177 new_layout = ALGORITHM_PARITY_N; 8178 break; 8179 default: 8180 return ERR_PTR(-EINVAL); 8181 } 8182 mddev->new_level = 5; 8183 mddev->new_layout = new_layout; 8184 mddev->delta_disks = -1; 8185 mddev->raid_disks -= 1; 8186 return setup_conf(mddev); 8187 } 8188 8189 static int raid5_check_reshape(struct mddev *mddev) 8190 { 8191 /* For a 2-drive array, the layout and chunk size can be changed 8192 * immediately as not restriping is needed. 8193 * For larger arrays we record the new value - after validation 8194 * to be used by a reshape pass. 8195 */ 8196 struct r5conf *conf = mddev->private; 8197 int new_chunk = mddev->new_chunk_sectors; 8198 8199 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8200 return -EINVAL; 8201 if (new_chunk > 0) { 8202 if (!is_power_of_2(new_chunk)) 8203 return -EINVAL; 8204 if (new_chunk < (PAGE_SIZE>>9)) 8205 return -EINVAL; 8206 if (mddev->array_sectors & (new_chunk-1)) 8207 /* not factor of array size */ 8208 return -EINVAL; 8209 } 8210 8211 /* They look valid */ 8212 8213 if (mddev->raid_disks == 2) { 8214 /* can make the change immediately */ 8215 if (mddev->new_layout >= 0) { 8216 conf->algorithm = mddev->new_layout; 8217 mddev->layout = mddev->new_layout; 8218 } 8219 if (new_chunk > 0) { 8220 conf->chunk_sectors = new_chunk ; 8221 mddev->chunk_sectors = new_chunk; 8222 } 8223 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8224 md_wakeup_thread(mddev->thread); 8225 } 8226 return check_reshape(mddev); 8227 } 8228 8229 static int raid6_check_reshape(struct mddev *mddev) 8230 { 8231 int new_chunk = mddev->new_chunk_sectors; 8232 8233 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8234 return -EINVAL; 8235 if (new_chunk > 0) { 8236 if (!is_power_of_2(new_chunk)) 8237 return -EINVAL; 8238 if (new_chunk < (PAGE_SIZE >> 9)) 8239 return -EINVAL; 8240 if (mddev->array_sectors & (new_chunk-1)) 8241 /* not factor of array size */ 8242 return -EINVAL; 8243 } 8244 8245 /* They look valid */ 8246 return check_reshape(mddev); 8247 } 8248 8249 static void *raid5_takeover(struct mddev *mddev) 8250 { 8251 /* raid5 can take over: 8252 * raid0 - if there is only one strip zone - make it a raid4 layout 8253 * raid1 - if there are two drives. We need to know the chunk size 8254 * raid4 - trivial - just use a raid4 layout. 8255 * raid6 - Providing it is a *_6 layout 8256 */ 8257 if (mddev->level == 0) 8258 return raid45_takeover_raid0(mddev, 5); 8259 if (mddev->level == 1) 8260 return raid5_takeover_raid1(mddev); 8261 if (mddev->level == 4) { 8262 mddev->new_layout = ALGORITHM_PARITY_N; 8263 mddev->new_level = 5; 8264 return setup_conf(mddev); 8265 } 8266 if (mddev->level == 6) 8267 return raid5_takeover_raid6(mddev); 8268 8269 return ERR_PTR(-EINVAL); 8270 } 8271 8272 static void *raid4_takeover(struct mddev *mddev) 8273 { 8274 /* raid4 can take over: 8275 * raid0 - if there is only one strip zone 8276 * raid5 - if layout is right 8277 */ 8278 if (mddev->level == 0) 8279 return raid45_takeover_raid0(mddev, 4); 8280 if (mddev->level == 5 && 8281 mddev->layout == ALGORITHM_PARITY_N) { 8282 mddev->new_layout = 0; 8283 mddev->new_level = 4; 8284 return setup_conf(mddev); 8285 } 8286 return ERR_PTR(-EINVAL); 8287 } 8288 8289 static struct md_personality raid5_personality; 8290 8291 static void *raid6_takeover(struct mddev *mddev) 8292 { 8293 /* Currently can only take over a raid5. We map the 8294 * personality to an equivalent raid6 personality 8295 * with the Q block at the end. 8296 */ 8297 int new_layout; 8298 8299 if (mddev->pers != &raid5_personality) 8300 return ERR_PTR(-EINVAL); 8301 if (mddev->degraded > 1) 8302 return ERR_PTR(-EINVAL); 8303 if (mddev->raid_disks > 253) 8304 return ERR_PTR(-EINVAL); 8305 if (mddev->raid_disks < 3) 8306 return ERR_PTR(-EINVAL); 8307 8308 switch (mddev->layout) { 8309 case ALGORITHM_LEFT_ASYMMETRIC: 8310 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8311 break; 8312 case ALGORITHM_RIGHT_ASYMMETRIC: 8313 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8314 break; 8315 case ALGORITHM_LEFT_SYMMETRIC: 8316 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8317 break; 8318 case ALGORITHM_RIGHT_SYMMETRIC: 8319 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8320 break; 8321 case ALGORITHM_PARITY_0: 8322 new_layout = ALGORITHM_PARITY_0_6; 8323 break; 8324 case ALGORITHM_PARITY_N: 8325 new_layout = ALGORITHM_PARITY_N; 8326 break; 8327 default: 8328 return ERR_PTR(-EINVAL); 8329 } 8330 mddev->new_level = 6; 8331 mddev->new_layout = new_layout; 8332 mddev->delta_disks = 1; 8333 mddev->raid_disks += 1; 8334 return setup_conf(mddev); 8335 } 8336 8337 static struct md_personality raid6_personality = 8338 { 8339 .name = "raid6", 8340 .level = 6, 8341 .owner = THIS_MODULE, 8342 .make_request = raid5_make_request, 8343 .run = raid5_run, 8344 .free = raid5_free, 8345 .status = raid5_status, 8346 .error_handler = raid5_error, 8347 .hot_add_disk = raid5_add_disk, 8348 .hot_remove_disk= raid5_remove_disk, 8349 .spare_active = raid5_spare_active, 8350 .sync_request = raid5_sync_request, 8351 .resize = raid5_resize, 8352 .size = raid5_size, 8353 .check_reshape = raid6_check_reshape, 8354 .start_reshape = raid5_start_reshape, 8355 .finish_reshape = raid5_finish_reshape, 8356 .quiesce = raid5_quiesce, 8357 .takeover = raid6_takeover, 8358 .congested = raid5_congested, 8359 }; 8360 static struct md_personality raid5_personality = 8361 { 8362 .name = "raid5", 8363 .level = 5, 8364 .owner = THIS_MODULE, 8365 .make_request = raid5_make_request, 8366 .run = raid5_run, 8367 .free = raid5_free, 8368 .status = raid5_status, 8369 .error_handler = raid5_error, 8370 .hot_add_disk = raid5_add_disk, 8371 .hot_remove_disk= raid5_remove_disk, 8372 .spare_active = raid5_spare_active, 8373 .sync_request = raid5_sync_request, 8374 .resize = raid5_resize, 8375 .size = raid5_size, 8376 .check_reshape = raid5_check_reshape, 8377 .start_reshape = raid5_start_reshape, 8378 .finish_reshape = raid5_finish_reshape, 8379 .quiesce = raid5_quiesce, 8380 .takeover = raid5_takeover, 8381 .congested = raid5_congested, 8382 }; 8383 8384 static struct md_personality raid4_personality = 8385 { 8386 .name = "raid4", 8387 .level = 4, 8388 .owner = THIS_MODULE, 8389 .make_request = raid5_make_request, 8390 .run = raid5_run, 8391 .free = raid5_free, 8392 .status = raid5_status, 8393 .error_handler = raid5_error, 8394 .hot_add_disk = raid5_add_disk, 8395 .hot_remove_disk= raid5_remove_disk, 8396 .spare_active = raid5_spare_active, 8397 .sync_request = raid5_sync_request, 8398 .resize = raid5_resize, 8399 .size = raid5_size, 8400 .check_reshape = raid5_check_reshape, 8401 .start_reshape = raid5_start_reshape, 8402 .finish_reshape = raid5_finish_reshape, 8403 .quiesce = raid5_quiesce, 8404 .takeover = raid4_takeover, 8405 .congested = raid5_congested, 8406 }; 8407 8408 static int __init raid5_init(void) 8409 { 8410 int ret; 8411 8412 raid5_wq = alloc_workqueue("raid5wq", 8413 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 8414 if (!raid5_wq) 8415 return -ENOMEM; 8416 8417 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 8418 "md/raid5:prepare", 8419 raid456_cpu_up_prepare, 8420 raid456_cpu_dead); 8421 if (ret) { 8422 destroy_workqueue(raid5_wq); 8423 return ret; 8424 } 8425 register_md_personality(&raid6_personality); 8426 register_md_personality(&raid5_personality); 8427 register_md_personality(&raid4_personality); 8428 return 0; 8429 } 8430 8431 static void raid5_exit(void) 8432 { 8433 unregister_md_personality(&raid6_personality); 8434 unregister_md_personality(&raid5_personality); 8435 unregister_md_personality(&raid4_personality); 8436 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 8437 destroy_workqueue(raid5_wq); 8438 } 8439 8440 module_init(raid5_init); 8441 module_exit(raid5_exit); 8442 MODULE_LICENSE("GPL"); 8443 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 8444 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 8445 MODULE_ALIAS("md-raid5"); 8446 MODULE_ALIAS("md-raid4"); 8447 MODULE_ALIAS("md-level-5"); 8448 MODULE_ALIAS("md-level-4"); 8449 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 8450 MODULE_ALIAS("md-raid6"); 8451 MODULE_ALIAS("md-level-6"); 8452 8453 /* This used to be two separate modules, they were: */ 8454 MODULE_ALIAS("raid5"); 8455 MODULE_ALIAS("raid6"); 8456