1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 
44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <linux/nodemask.h> 57 #include <linux/flex_array.h> 58 #include <linux/sched/signal.h> 59 60 #include <trace/events/block.h> 61 #include <linux/list_sort.h> 62 63 #include "md.h" 64 #include "raid5.h" 65 #include "raid0.h" 66 #include "bitmap.h" 67 #include "raid5-log.h" 68 69 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) 70 71 #define cpu_to_group(cpu) cpu_to_node(cpu) 72 #define ANY_GROUP NUMA_NO_NODE 73 74 static bool devices_handle_discard_safely = false; 75 module_param(devices_handle_discard_safely, bool, 0644); 76 MODULE_PARM_DESC(devices_handle_discard_safely, 77 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 78 static struct workqueue_struct *raid5_wq; 79 80 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 81 { 82 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 83 return &conf->stripe_hashtbl[hash]; 84 } 85 86 static inline int stripe_hash_locks_hash(sector_t sect) 87 { 88 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; 89 } 90 91 static inline void lock_device_hash_lock(struct r5conf *conf, int hash) 92 { 93 spin_lock_irq(conf->hash_locks + hash); 94 spin_lock(&conf->device_lock); 95 } 96 97 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) 98 { 99 spin_unlock(&conf->device_lock); 100 spin_unlock_irq(conf->hash_locks + hash); 101 } 102 103 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) 104 { 105 int i; 106 local_irq_disable(); 107 spin_lock(conf->hash_locks); 108 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 109 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); 110 spin_lock(&conf->device_lock); 111 } 112 113 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) 114 { 115 int i; 116 spin_unlock(&conf->device_lock); 117 for (i = NR_STRIPE_HASH_LOCKS; i; i--) 118 spin_unlock(conf->hash_locks + i - 1); 119 local_irq_enable(); 120 } 121 122 /* Find first data disk in a raid6 stripe */ 123 static inline int raid6_d0(struct stripe_head *sh) 124 { 125 if (sh->ddf_layout) 126 /* ddf always start from first device */ 127 return 0; 128 /* md starts just after Q block */ 129 if (sh->qd_idx == sh->disks - 1) 130 return 0; 131 else 132 return sh->qd_idx + 1; 133 } 134 static inline int raid6_next_disk(int disk, int raid_disks) 135 { 136 disk++; 137 return (disk < raid_disks) ? disk : 0; 138 } 139 140 /* When walking through the disks in a raid5, starting at raid6_d0, 141 * We need to map each disk to a 'slot', where the data disks are slot 142 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 143 * is raid_disks-1. This help does that mapping. 
144 */ 145 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 146 int *count, int syndrome_disks) 147 { 148 int slot = *count; 149 150 if (sh->ddf_layout) 151 (*count)++; 152 if (idx == sh->pd_idx) 153 return syndrome_disks; 154 if (idx == sh->qd_idx) 155 return syndrome_disks + 1; 156 if (!sh->ddf_layout) 157 (*count)++; 158 return slot; 159 } 160 161 static void print_raid5_conf (struct r5conf *conf); 162 163 static int stripe_operations_active(struct stripe_head *sh) 164 { 165 return sh->check_state || sh->reconstruct_state || 166 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 167 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 168 } 169 170 static bool stripe_is_lowprio(struct stripe_head *sh) 171 { 172 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || 173 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && 174 !test_bit(STRIPE_R5C_CACHING, &sh->state); 175 } 176 177 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 178 { 179 struct r5conf *conf = sh->raid_conf; 180 struct r5worker_group *group; 181 int thread_cnt; 182 int i, cpu = sh->cpu; 183 184 if (!cpu_online(cpu)) { 185 cpu = cpumask_any(cpu_online_mask); 186 sh->cpu = cpu; 187 } 188 189 if (list_empty(&sh->lru)) { 190 struct r5worker_group *group; 191 group = conf->worker_groups + cpu_to_group(cpu); 192 if (stripe_is_lowprio(sh)) 193 list_add_tail(&sh->lru, &group->loprio_list); 194 else 195 list_add_tail(&sh->lru, &group->handle_list); 196 group->stripes_cnt++; 197 sh->group = group; 198 } 199 200 if (conf->worker_cnt_per_group == 0) { 201 md_wakeup_thread(conf->mddev->thread); 202 return; 203 } 204 205 group = conf->worker_groups + cpu_to_group(sh->cpu); 206 207 group->workers[0].working = true; 208 /* at least one worker should run to avoid race */ 209 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 210 211 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 212 /* wakeup more workers */ 213 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 214 if (group->workers[i].working == false) { 215 group->workers[i].working = true; 216 queue_work_on(sh->cpu, raid5_wq, 217 &group->workers[i].work); 218 thread_cnt--; 219 } 220 } 221 } 222 223 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 224 struct list_head *temp_inactive_list) 225 { 226 int i; 227 int injournal = 0; /* number of date pages with R5_InJournal */ 228 229 BUG_ON(!list_empty(&sh->lru)); 230 BUG_ON(atomic_read(&conf->active_stripes)==0); 231 232 if (r5c_is_writeback(conf->log)) 233 for (i = sh->disks; i--; ) 234 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 235 injournal++; 236 /* 237 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with 238 * data in journal, so they are not released to cached lists 239 */ 240 if (conf->quiesce && r5c_is_writeback(conf->log) && 241 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { 242 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 243 r5c_make_stripe_write_out(sh); 244 set_bit(STRIPE_HANDLE, &sh->state); 245 } 246 247 if (test_bit(STRIPE_HANDLE, &sh->state)) { 248 if (test_bit(STRIPE_DELAYED, &sh->state) && 249 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 250 list_add_tail(&sh->lru, &conf->delayed_list); 251 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 252 sh->bm_seq - conf->seq_write > 0) 253 list_add_tail(&sh->lru, &conf->bitmap_list); 254 else { 255 clear_bit(STRIPE_DELAYED, &sh->state); 256 clear_bit(STRIPE_BIT_DELAY, &sh->state); 257 if (conf->worker_cnt_per_group == 0) { 258 if (stripe_is_lowprio(sh)) 259 
list_add_tail(&sh->lru, 260 &conf->loprio_list); 261 else 262 list_add_tail(&sh->lru, 263 &conf->handle_list); 264 } else { 265 raid5_wakeup_stripe_thread(sh); 266 return; 267 } 268 } 269 md_wakeup_thread(conf->mddev->thread); 270 } else { 271 BUG_ON(stripe_operations_active(sh)); 272 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 273 if (atomic_dec_return(&conf->preread_active_stripes) 274 < IO_THRESHOLD) 275 md_wakeup_thread(conf->mddev->thread); 276 atomic_dec(&conf->active_stripes); 277 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 278 if (!r5c_is_writeback(conf->log)) 279 list_add_tail(&sh->lru, temp_inactive_list); 280 else { 281 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); 282 if (injournal == 0) 283 list_add_tail(&sh->lru, temp_inactive_list); 284 else if (injournal == conf->raid_disks - conf->max_degraded) { 285 /* full stripe */ 286 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) 287 atomic_inc(&conf->r5c_cached_full_stripes); 288 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 289 atomic_dec(&conf->r5c_cached_partial_stripes); 290 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); 291 r5c_check_cached_full_stripe(conf); 292 } else 293 /* 294 * STRIPE_R5C_PARTIAL_STRIPE is set in 295 * r5c_try_caching_write(). No need to 296 * set it again. 297 */ 298 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); 299 } 300 } 301 } 302 } 303 304 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, 305 struct list_head *temp_inactive_list) 306 { 307 if (atomic_dec_and_test(&sh->count)) 308 do_release_stripe(conf, sh, temp_inactive_list); 309 } 310 311 /* 312 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list 313 * 314 * Be careful: Only one task can add/delete stripes from temp_inactive_list at 315 * given time. Adding stripes only takes device lock, while deleting stripes 316 * only takes hash lock. 
317 */ 318 static void release_inactive_stripe_list(struct r5conf *conf, 319 struct list_head *temp_inactive_list, 320 int hash) 321 { 322 int size; 323 bool do_wakeup = false; 324 unsigned long flags; 325 326 if (hash == NR_STRIPE_HASH_LOCKS) { 327 size = NR_STRIPE_HASH_LOCKS; 328 hash = NR_STRIPE_HASH_LOCKS - 1; 329 } else 330 size = 1; 331 while (size) { 332 struct list_head *list = &temp_inactive_list[size - 1]; 333 334 /* 335 * We don't hold any lock here yet, raid5_get_active_stripe() might 336 * remove stripes from the list 337 */ 338 if (!list_empty_careful(list)) { 339 spin_lock_irqsave(conf->hash_locks + hash, flags); 340 if (list_empty(conf->inactive_list + hash) && 341 !list_empty(list)) 342 atomic_dec(&conf->empty_inactive_list_nr); 343 list_splice_tail_init(list, conf->inactive_list + hash); 344 do_wakeup = true; 345 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 346 } 347 size--; 348 hash--; 349 } 350 351 if (do_wakeup) { 352 wake_up(&conf->wait_for_stripe); 353 if (atomic_read(&conf->active_stripes) == 0) 354 wake_up(&conf->wait_for_quiescent); 355 if (conf->retry_read_aligned) 356 md_wakeup_thread(conf->mddev->thread); 357 } 358 } 359 360 /* should hold conf->device_lock already */ 361 static int release_stripe_list(struct r5conf *conf, 362 struct list_head *temp_inactive_list) 363 { 364 struct stripe_head *sh, *t; 365 int count = 0; 366 struct llist_node *head; 367 368 head = llist_del_all(&conf->released_stripes); 369 head = llist_reverse_order(head); 370 llist_for_each_entry_safe(sh, t, head, release_list) { 371 int hash; 372 373 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 374 smp_mb(); 375 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 376 /* 377 * Don't worry the bit is set here, because if the bit is set 378 * again, the count is always > 1. This is true for 379 * STRIPE_ON_UNPLUG_LIST bit too. 380 */ 381 hash = sh->hash_lock_index; 382 __release_stripe(conf, sh, &temp_inactive_list[hash]); 383 count++; 384 } 385 386 return count; 387 } 388 389 void raid5_release_stripe(struct stripe_head *sh) 390 { 391 struct r5conf *conf = sh->raid_conf; 392 unsigned long flags; 393 struct list_head list; 394 int hash; 395 bool wakeup; 396 397 /* Avoid release_list until the last reference. 398 */ 399 if (atomic_add_unless(&sh->count, -1, 1)) 400 return; 401 402 if (unlikely(!conf->mddev->thread) || 403 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 404 goto slow_path; 405 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 406 if (wakeup) 407 md_wakeup_thread(conf->mddev->thread); 408 return; 409 slow_path: 410 local_irq_save(flags); 411 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 412 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 413 INIT_LIST_HEAD(&list); 414 hash = sh->hash_lock_index; 415 do_release_stripe(conf, sh, &list); 416 spin_unlock(&conf->device_lock); 417 release_inactive_stripe_list(conf, &list, hash); 418 } 419 local_irq_restore(flags); 420 } 421 422 static inline void remove_hash(struct stripe_head *sh) 423 { 424 pr_debug("remove_hash(), stripe %llu\n", 425 (unsigned long long)sh->sector); 426 427 hlist_del_init(&sh->hash); 428 } 429 430 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 431 { 432 struct hlist_head *hp = stripe_hash(conf, sh->sector); 433 434 pr_debug("insert_hash(), stripe %llu\n", 435 (unsigned long long)sh->sector); 436 437 hlist_add_head(&sh->hash, hp); 438 } 439 440 /* find an idle stripe, make sure it is unhashed, and return it. 
*/ 441 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 442 { 443 struct stripe_head *sh = NULL; 444 struct list_head *first; 445 446 if (list_empty(conf->inactive_list + hash)) 447 goto out; 448 first = (conf->inactive_list + hash)->next; 449 sh = list_entry(first, struct stripe_head, lru); 450 list_del_init(first); 451 remove_hash(sh); 452 atomic_inc(&conf->active_stripes); 453 BUG_ON(hash != sh->hash_lock_index); 454 if (list_empty(conf->inactive_list + hash)) 455 atomic_inc(&conf->empty_inactive_list_nr); 456 out: 457 return sh; 458 } 459 460 static void shrink_buffers(struct stripe_head *sh) 461 { 462 struct page *p; 463 int i; 464 int num = sh->raid_conf->pool_size; 465 466 for (i = 0; i < num ; i++) { 467 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 468 p = sh->dev[i].page; 469 if (!p) 470 continue; 471 sh->dev[i].page = NULL; 472 put_page(p); 473 } 474 } 475 476 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 477 { 478 int i; 479 int num = sh->raid_conf->pool_size; 480 481 for (i = 0; i < num; i++) { 482 struct page *page; 483 484 if (!(page = alloc_page(gfp))) { 485 return 1; 486 } 487 sh->dev[i].page = page; 488 sh->dev[i].orig_page = page; 489 } 490 491 return 0; 492 } 493 494 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 495 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 496 struct stripe_head *sh); 497 498 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 499 { 500 struct r5conf *conf = sh->raid_conf; 501 int i, seq; 502 503 BUG_ON(atomic_read(&sh->count) != 0); 504 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 505 BUG_ON(stripe_operations_active(sh)); 506 BUG_ON(sh->batch_head); 507 508 pr_debug("init_stripe called, stripe %llu\n", 509 (unsigned long long)sector); 510 retry: 511 seq = read_seqcount_begin(&conf->gen_lock); 512 sh->generation = conf->generation - previous; 513 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 514 sh->sector = sector; 515 stripe_set_idx(sector, conf, previous, sh); 516 sh->state = 0; 517 518 for (i = sh->disks; i--; ) { 519 struct r5dev *dev = &sh->dev[i]; 520 521 if (dev->toread || dev->read || dev->towrite || dev->written || 522 test_bit(R5_LOCKED, &dev->flags)) { 523 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 524 (unsigned long long)sh->sector, i, dev->toread, 525 dev->read, dev->towrite, dev->written, 526 test_bit(R5_LOCKED, &dev->flags)); 527 WARN_ON(1); 528 } 529 dev->flags = 0; 530 raid5_build_block(sh, i, previous); 531 } 532 if (read_seqcount_retry(&conf->gen_lock, seq)) 533 goto retry; 534 sh->overwrite_disks = 0; 535 insert_hash(conf, sh); 536 sh->cpu = smp_processor_id(); 537 set_bit(STRIPE_BATCH_READY, &sh->state); 538 } 539 540 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 541 short generation) 542 { 543 struct stripe_head *sh; 544 545 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 546 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 547 if (sh->sector == sector && sh->generation == generation) 548 return sh; 549 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 550 return NULL; 551 } 552 553 /* 554 * Need to check if array has failed when deciding whether to: 555 * - start an array 556 * - remove non-faulty devices 557 * - add a spare 558 * - allow a reshape 559 * This determination is simple when no reshape is happening. 
560 * However if there is a reshape, we need to carefully check 561 * both the before and after sections. 562 * This is because some failed devices may only affect one 563 * of the two sections, and some non-in_sync devices may 564 * be insync in the section most affected by failed devices. 565 */ 566 int raid5_calc_degraded(struct r5conf *conf) 567 { 568 int degraded, degraded2; 569 int i; 570 571 rcu_read_lock(); 572 degraded = 0; 573 for (i = 0; i < conf->previous_raid_disks; i++) { 574 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 575 if (rdev && test_bit(Faulty, &rdev->flags)) 576 rdev = rcu_dereference(conf->disks[i].replacement); 577 if (!rdev || test_bit(Faulty, &rdev->flags)) 578 degraded++; 579 else if (test_bit(In_sync, &rdev->flags)) 580 ; 581 else 582 /* not in-sync or faulty. 583 * If the reshape increases the number of devices, 584 * this is being recovered by the reshape, so 585 * this 'previous' section is not in_sync. 586 * If the number of devices is being reduced however, 587 * the device can only be part of the array if 588 * we are reverting a reshape, so this section will 589 * be in-sync. 590 */ 591 if (conf->raid_disks >= conf->previous_raid_disks) 592 degraded++; 593 } 594 rcu_read_unlock(); 595 if (conf->raid_disks == conf->previous_raid_disks) 596 return degraded; 597 rcu_read_lock(); 598 degraded2 = 0; 599 for (i = 0; i < conf->raid_disks; i++) { 600 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 601 if (rdev && test_bit(Faulty, &rdev->flags)) 602 rdev = rcu_dereference(conf->disks[i].replacement); 603 if (!rdev || test_bit(Faulty, &rdev->flags)) 604 degraded2++; 605 else if (test_bit(In_sync, &rdev->flags)) 606 ; 607 else 608 /* not in-sync or faulty. 609 * If reshape increases the number of devices, this 610 * section has already been recovered, else it 611 * almost certainly hasn't. 
612 */ 613 if (conf->raid_disks <= conf->previous_raid_disks) 614 degraded2++; 615 } 616 rcu_read_unlock(); 617 if (degraded2 > degraded) 618 return degraded2; 619 return degraded; 620 } 621 622 static int has_failed(struct r5conf *conf) 623 { 624 int degraded; 625 626 if (conf->mddev->reshape_position == MaxSector) 627 return conf->mddev->degraded > conf->max_degraded; 628 629 degraded = raid5_calc_degraded(conf); 630 if (degraded > conf->max_degraded) 631 return 1; 632 return 0; 633 } 634 635 struct stripe_head * 636 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 637 int previous, int noblock, int noquiesce) 638 { 639 struct stripe_head *sh; 640 int hash = stripe_hash_locks_hash(sector); 641 int inc_empty_inactive_list_flag; 642 643 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 644 645 spin_lock_irq(conf->hash_locks + hash); 646 647 do { 648 wait_event_lock_irq(conf->wait_for_quiescent, 649 conf->quiesce == 0 || noquiesce, 650 *(conf->hash_locks + hash)); 651 sh = __find_stripe(conf, sector, conf->generation - previous); 652 if (!sh) { 653 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 654 sh = get_free_stripe(conf, hash); 655 if (!sh && !test_bit(R5_DID_ALLOC, 656 &conf->cache_state)) 657 set_bit(R5_ALLOC_MORE, 658 &conf->cache_state); 659 } 660 if (noblock && sh == NULL) 661 break; 662 663 r5c_check_stripe_cache_usage(conf); 664 if (!sh) { 665 set_bit(R5_INACTIVE_BLOCKED, 666 &conf->cache_state); 667 r5l_wake_reclaim(conf->log, 0); 668 wait_event_lock_irq( 669 conf->wait_for_stripe, 670 !list_empty(conf->inactive_list + hash) && 671 (atomic_read(&conf->active_stripes) 672 < (conf->max_nr_stripes * 3 / 4) 673 || !test_bit(R5_INACTIVE_BLOCKED, 674 &conf->cache_state)), 675 *(conf->hash_locks + hash)); 676 clear_bit(R5_INACTIVE_BLOCKED, 677 &conf->cache_state); 678 } else { 679 init_stripe(sh, sector, previous); 680 atomic_inc(&sh->count); 681 } 682 } else if (!atomic_inc_not_zero(&sh->count)) { 683 spin_lock(&conf->device_lock); 684 if (!atomic_read(&sh->count)) { 685 if (!test_bit(STRIPE_HANDLE, &sh->state)) 686 atomic_inc(&conf->active_stripes); 687 BUG_ON(list_empty(&sh->lru) && 688 !test_bit(STRIPE_EXPANDING, &sh->state)); 689 inc_empty_inactive_list_flag = 0; 690 if (!list_empty(conf->inactive_list + hash)) 691 inc_empty_inactive_list_flag = 1; 692 list_del_init(&sh->lru); 693 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 694 atomic_inc(&conf->empty_inactive_list_nr); 695 if (sh->group) { 696 sh->group->stripes_cnt--; 697 sh->group = NULL; 698 } 699 } 700 atomic_inc(&sh->count); 701 spin_unlock(&conf->device_lock); 702 } 703 } while (sh == NULL); 704 705 spin_unlock_irq(conf->hash_locks + hash); 706 return sh; 707 } 708 709 static bool is_full_stripe_write(struct stripe_head *sh) 710 { 711 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 712 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 713 } 714 715 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 716 { 717 local_irq_disable(); 718 if (sh1 > sh2) { 719 spin_lock(&sh2->stripe_lock); 720 spin_lock_nested(&sh1->stripe_lock, 1); 721 } else { 722 spin_lock(&sh1->stripe_lock); 723 spin_lock_nested(&sh2->stripe_lock, 1); 724 } 725 } 726 727 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 728 { 729 spin_unlock(&sh1->stripe_lock); 730 spin_unlock(&sh2->stripe_lock); 731 local_irq_enable(); 732 } 733 734 /* Only freshly new full stripe normal write 
stripe can be added to a batch list */ 735 static bool stripe_can_batch(struct stripe_head *sh) 736 { 737 struct r5conf *conf = sh->raid_conf; 738 739 if (conf->log || raid5_has_ppl(conf)) 740 return false; 741 return test_bit(STRIPE_BATCH_READY, &sh->state) && 742 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 743 is_full_stripe_write(sh); 744 } 745 746 /* we only do back search */ 747 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 748 { 749 struct stripe_head *head; 750 sector_t head_sector, tmp_sec; 751 int hash; 752 int dd_idx; 753 int inc_empty_inactive_list_flag; 754 755 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 756 tmp_sec = sh->sector; 757 if (!sector_div(tmp_sec, conf->chunk_sectors)) 758 return; 759 head_sector = sh->sector - STRIPE_SECTORS; 760 761 hash = stripe_hash_locks_hash(head_sector); 762 spin_lock_irq(conf->hash_locks + hash); 763 head = __find_stripe(conf, head_sector, conf->generation); 764 if (head && !atomic_inc_not_zero(&head->count)) { 765 spin_lock(&conf->device_lock); 766 if (!atomic_read(&head->count)) { 767 if (!test_bit(STRIPE_HANDLE, &head->state)) 768 atomic_inc(&conf->active_stripes); 769 BUG_ON(list_empty(&head->lru) && 770 !test_bit(STRIPE_EXPANDING, &head->state)); 771 inc_empty_inactive_list_flag = 0; 772 if (!list_empty(conf->inactive_list + hash)) 773 inc_empty_inactive_list_flag = 1; 774 list_del_init(&head->lru); 775 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 776 atomic_inc(&conf->empty_inactive_list_nr); 777 if (head->group) { 778 head->group->stripes_cnt--; 779 head->group = NULL; 780 } 781 } 782 atomic_inc(&head->count); 783 spin_unlock(&conf->device_lock); 784 } 785 spin_unlock_irq(conf->hash_locks + hash); 786 787 if (!head) 788 return; 789 if (!stripe_can_batch(head)) 790 goto out; 791 792 lock_two_stripes(head, sh); 793 /* clear_batch_ready clear the flag */ 794 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 795 goto unlock_out; 796 797 if (sh->batch_head) 798 goto unlock_out; 799 800 dd_idx = 0; 801 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 802 dd_idx++; 803 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 804 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 805 goto unlock_out; 806 807 if (head->batch_head) { 808 spin_lock(&head->batch_head->batch_lock); 809 /* This batch list is already running */ 810 if (!stripe_can_batch(head)) { 811 spin_unlock(&head->batch_head->batch_lock); 812 goto unlock_out; 813 } 814 815 /* 816 * at this point, head's BATCH_READY could be cleared, but we 817 * can still add the stripe to batch list 818 */ 819 list_add(&sh->batch_list, &head->batch_list); 820 spin_unlock(&head->batch_head->batch_lock); 821 822 sh->batch_head = head->batch_head; 823 } else { 824 head->batch_head = head; 825 sh->batch_head = head->batch_head; 826 spin_lock(&head->batch_lock); 827 list_add_tail(&sh->batch_list, &head->batch_list); 828 spin_unlock(&head->batch_lock); 829 } 830 831 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 832 if (atomic_dec_return(&conf->preread_active_stripes) 833 < IO_THRESHOLD) 834 md_wakeup_thread(conf->mddev->thread); 835 836 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 837 int seq = sh->bm_seq; 838 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 839 sh->batch_head->bm_seq > seq) 840 seq = sh->batch_head->bm_seq; 841 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 842 sh->batch_head->bm_seq = seq; 843 } 844 845 
atomic_inc(&sh->count); 846 unlock_out: 847 unlock_two_stripes(head, sh); 848 out: 849 raid5_release_stripe(head); 850 } 851 852 /* Determine if 'data_offset' or 'new_data_offset' should be used 853 * in this stripe_head. 854 */ 855 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 856 { 857 sector_t progress = conf->reshape_progress; 858 /* Need a memory barrier to make sure we see the value 859 * of conf->generation, or ->data_offset that was set before 860 * reshape_progress was updated. 861 */ 862 smp_rmb(); 863 if (progress == MaxSector) 864 return 0; 865 if (sh->generation == conf->generation - 1) 866 return 0; 867 /* We are in a reshape, and this is a new-generation stripe, 868 * so use new_data_offset. 869 */ 870 return 1; 871 } 872 873 static void dispatch_bio_list(struct bio_list *tmp) 874 { 875 struct bio *bio; 876 877 while ((bio = bio_list_pop(tmp))) 878 generic_make_request(bio); 879 } 880 881 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) 882 { 883 const struct r5pending_data *da = list_entry(a, 884 struct r5pending_data, sibling); 885 const struct r5pending_data *db = list_entry(b, 886 struct r5pending_data, sibling); 887 if (da->sector > db->sector) 888 return 1; 889 if (da->sector < db->sector) 890 return -1; 891 return 0; 892 } 893 894 static void dispatch_defer_bios(struct r5conf *conf, int target, 895 struct bio_list *list) 896 { 897 struct r5pending_data *data; 898 struct list_head *first, *next = NULL; 899 int cnt = 0; 900 901 if (conf->pending_data_cnt == 0) 902 return; 903 904 list_sort(NULL, &conf->pending_list, cmp_stripe); 905 906 first = conf->pending_list.next; 907 908 /* temporarily move the head */ 909 if (conf->next_pending_data) 910 list_move_tail(&conf->pending_list, 911 &conf->next_pending_data->sibling); 912 913 while (!list_empty(&conf->pending_list)) { 914 data = list_first_entry(&conf->pending_list, 915 struct r5pending_data, sibling); 916 if (&data->sibling == first) 917 first = data->sibling.next; 918 next = data->sibling.next; 919 920 bio_list_merge(list, &data->bios); 921 list_move(&data->sibling, &conf->free_list); 922 cnt++; 923 if (cnt >= target) 924 break; 925 } 926 conf->pending_data_cnt -= cnt; 927 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 928 929 if (next != &conf->pending_list) 930 conf->next_pending_data = list_entry(next, 931 struct r5pending_data, sibling); 932 else 933 conf->next_pending_data = NULL; 934 /* list isn't empty */ 935 if (first != &conf->pending_list) 936 list_move_tail(&conf->pending_list, first); 937 } 938 939 static void flush_deferred_bios(struct r5conf *conf) 940 { 941 struct bio_list tmp = BIO_EMPTY_LIST; 942 943 if (conf->pending_data_cnt == 0) 944 return; 945 946 spin_lock(&conf->pending_bios_lock); 947 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 948 BUG_ON(conf->pending_data_cnt != 0); 949 spin_unlock(&conf->pending_bios_lock); 950 951 dispatch_bio_list(&tmp); 952 } 953 954 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 955 struct bio_list *bios) 956 { 957 struct bio_list tmp = BIO_EMPTY_LIST; 958 struct r5pending_data *ent; 959 960 spin_lock(&conf->pending_bios_lock); 961 ent = list_first_entry(&conf->free_list, struct r5pending_data, 962 sibling); 963 list_move_tail(&ent->sibling, &conf->pending_list); 964 ent->sector = sector; 965 bio_list_init(&ent->bios); 966 bio_list_merge(&ent->bios, bios); 967 conf->pending_data_cnt++; 968 if (conf->pending_data_cnt >= PENDING_IO_MAX) 969 dispatch_defer_bios(conf, 
PENDING_IO_ONE_FLUSH, &tmp); 970 971 spin_unlock(&conf->pending_bios_lock); 972 973 dispatch_bio_list(&tmp); 974 } 975 976 static void 977 raid5_end_read_request(struct bio *bi); 978 static void 979 raid5_end_write_request(struct bio *bi); 980 981 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 982 { 983 struct r5conf *conf = sh->raid_conf; 984 int i, disks = sh->disks; 985 struct stripe_head *head_sh = sh; 986 struct bio_list pending_bios = BIO_EMPTY_LIST; 987 bool should_defer; 988 989 might_sleep(); 990 991 if (log_stripe(sh, s) == 0) 992 return; 993 994 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 995 996 for (i = disks; i--; ) { 997 int op, op_flags = 0; 998 int replace_only = 0; 999 struct bio *bi, *rbi; 1000 struct md_rdev *rdev, *rrdev = NULL; 1001 1002 sh = head_sh; 1003 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1004 op = REQ_OP_WRITE; 1005 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1006 op_flags = REQ_FUA; 1007 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1008 op = REQ_OP_DISCARD; 1009 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1010 op = REQ_OP_READ; 1011 else if (test_and_clear_bit(R5_WantReplace, 1012 &sh->dev[i].flags)) { 1013 op = REQ_OP_WRITE; 1014 replace_only = 1; 1015 } else 1016 continue; 1017 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1018 op_flags |= REQ_SYNC; 1019 1020 again: 1021 bi = &sh->dev[i].req; 1022 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 1023 1024 rcu_read_lock(); 1025 rrdev = rcu_dereference(conf->disks[i].replacement); 1026 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1027 rdev = rcu_dereference(conf->disks[i].rdev); 1028 if (!rdev) { 1029 rdev = rrdev; 1030 rrdev = NULL; 1031 } 1032 if (op_is_write(op)) { 1033 if (replace_only) 1034 rdev = NULL; 1035 if (rdev == rrdev) 1036 /* We raced and saw duplicates */ 1037 rrdev = NULL; 1038 } else { 1039 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1040 rdev = rrdev; 1041 rrdev = NULL; 1042 } 1043 1044 if (rdev && test_bit(Faulty, &rdev->flags)) 1045 rdev = NULL; 1046 if (rdev) 1047 atomic_inc(&rdev->nr_pending); 1048 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1049 rrdev = NULL; 1050 if (rrdev) 1051 atomic_inc(&rrdev->nr_pending); 1052 rcu_read_unlock(); 1053 1054 /* We have already checked bad blocks for reads. Now 1055 * need to check for writes. We never accept write errors 1056 * on the replacement, so we don't to check rrdev. 1057 */ 1058 while (op_is_write(op) && rdev && 1059 test_bit(WriteErrorSeen, &rdev->flags)) { 1060 sector_t first_bad; 1061 int bad_sectors; 1062 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 1063 &first_bad, &bad_sectors); 1064 if (!bad) 1065 break; 1066 1067 if (bad < 0) { 1068 set_bit(BlockedBadBlocks, &rdev->flags); 1069 if (!conf->mddev->external && 1070 conf->mddev->sb_flags) { 1071 /* It is very unlikely, but we might 1072 * still need to write out the 1073 * bad block log - better give it 1074 * a chance*/ 1075 md_check_recovery(conf->mddev); 1076 } 1077 /* 1078 * Because md_wait_for_blocked_rdev 1079 * will dec nr_pending, we must 1080 * increment it first. 
1081 */ 1082 atomic_inc(&rdev->nr_pending); 1083 md_wait_for_blocked_rdev(rdev, conf->mddev); 1084 } else { 1085 /* Acknowledged bad block - skip the write */ 1086 rdev_dec_pending(rdev, conf->mddev); 1087 rdev = NULL; 1088 } 1089 } 1090 1091 if (rdev) { 1092 if (s->syncing || s->expanding || s->expanded 1093 || s->replacing) 1094 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1095 1096 set_bit(STRIPE_IO_STARTED, &sh->state); 1097 1098 bi->bi_bdev = rdev->bdev; 1099 bio_set_op_attrs(bi, op, op_flags); 1100 bi->bi_end_io = op_is_write(op) 1101 ? raid5_end_write_request 1102 : raid5_end_read_request; 1103 bi->bi_private = sh; 1104 1105 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1106 __func__, (unsigned long long)sh->sector, 1107 bi->bi_opf, i); 1108 atomic_inc(&sh->count); 1109 if (sh != head_sh) 1110 atomic_inc(&head_sh->count); 1111 if (use_new_offset(conf, sh)) 1112 bi->bi_iter.bi_sector = (sh->sector 1113 + rdev->new_data_offset); 1114 else 1115 bi->bi_iter.bi_sector = (sh->sector 1116 + rdev->data_offset); 1117 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1118 bi->bi_opf |= REQ_NOMERGE; 1119 1120 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1121 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1122 1123 if (!op_is_write(op) && 1124 test_bit(R5_InJournal, &sh->dev[i].flags)) 1125 /* 1126 * issuing read for a page in journal, this 1127 * must be preparing for prexor in rmw; read 1128 * the data into orig_page 1129 */ 1130 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1131 else 1132 sh->dev[i].vec.bv_page = sh->dev[i].page; 1133 bi->bi_vcnt = 1; 1134 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1135 bi->bi_io_vec[0].bv_offset = 0; 1136 bi->bi_iter.bi_size = STRIPE_SIZE; 1137 /* 1138 * If this is discard request, set bi_vcnt 0. We don't 1139 * want to confuse SCSI because SCSI will replace payload 1140 */ 1141 if (op == REQ_OP_DISCARD) 1142 bi->bi_vcnt = 0; 1143 if (rrdev) 1144 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1145 1146 if (conf->mddev->gendisk) 1147 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 1148 bi, disk_devt(conf->mddev->gendisk), 1149 sh->dev[i].sector); 1150 if (should_defer && op_is_write(op)) 1151 bio_list_add(&pending_bios, bi); 1152 else 1153 generic_make_request(bi); 1154 } 1155 if (rrdev) { 1156 if (s->syncing || s->expanding || s->expanded 1157 || s->replacing) 1158 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1159 1160 set_bit(STRIPE_IO_STARTED, &sh->state); 1161 1162 rbi->bi_bdev = rrdev->bdev; 1163 bio_set_op_attrs(rbi, op, op_flags); 1164 BUG_ON(!op_is_write(op)); 1165 rbi->bi_end_io = raid5_end_write_request; 1166 rbi->bi_private = sh; 1167 1168 pr_debug("%s: for %llu schedule op %d on " 1169 "replacement disc %d\n", 1170 __func__, (unsigned long long)sh->sector, 1171 rbi->bi_opf, i); 1172 atomic_inc(&sh->count); 1173 if (sh != head_sh) 1174 atomic_inc(&head_sh->count); 1175 if (use_new_offset(conf, sh)) 1176 rbi->bi_iter.bi_sector = (sh->sector 1177 + rrdev->new_data_offset); 1178 else 1179 rbi->bi_iter.bi_sector = (sh->sector 1180 + rrdev->data_offset); 1181 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1182 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1183 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1184 rbi->bi_vcnt = 1; 1185 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1186 rbi->bi_io_vec[0].bv_offset = 0; 1187 rbi->bi_iter.bi_size = STRIPE_SIZE; 1188 /* 1189 * If this is discard request, set bi_vcnt 0. 
We don't 1190 * want to confuse SCSI because SCSI will replace payload 1191 */ 1192 if (op == REQ_OP_DISCARD) 1193 rbi->bi_vcnt = 0; 1194 if (conf->mddev->gendisk) 1195 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 1196 rbi, disk_devt(conf->mddev->gendisk), 1197 sh->dev[i].sector); 1198 if (should_defer && op_is_write(op)) 1199 bio_list_add(&pending_bios, rbi); 1200 else 1201 generic_make_request(rbi); 1202 } 1203 if (!rdev && !rrdev) { 1204 if (op_is_write(op)) 1205 set_bit(STRIPE_DEGRADED, &sh->state); 1206 pr_debug("skip op %d on disc %d for sector %llu\n", 1207 bi->bi_opf, i, (unsigned long long)sh->sector); 1208 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1209 set_bit(STRIPE_HANDLE, &sh->state); 1210 } 1211 1212 if (!head_sh->batch_head) 1213 continue; 1214 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1215 batch_list); 1216 if (sh != head_sh) 1217 goto again; 1218 } 1219 1220 if (should_defer && !bio_list_empty(&pending_bios)) 1221 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1222 } 1223 1224 static struct dma_async_tx_descriptor * 1225 async_copy_data(int frombio, struct bio *bio, struct page **page, 1226 sector_t sector, struct dma_async_tx_descriptor *tx, 1227 struct stripe_head *sh, int no_skipcopy) 1228 { 1229 struct bio_vec bvl; 1230 struct bvec_iter iter; 1231 struct page *bio_page; 1232 int page_offset; 1233 struct async_submit_ctl submit; 1234 enum async_tx_flags flags = 0; 1235 1236 if (bio->bi_iter.bi_sector >= sector) 1237 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1238 else 1239 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1240 1241 if (frombio) 1242 flags |= ASYNC_TX_FENCE; 1243 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1244 1245 bio_for_each_segment(bvl, bio, iter) { 1246 int len = bvl.bv_len; 1247 int clen; 1248 int b_offset = 0; 1249 1250 if (page_offset < 0) { 1251 b_offset = -page_offset; 1252 page_offset += b_offset; 1253 len -= b_offset; 1254 } 1255 1256 if (len > 0 && page_offset + len > STRIPE_SIZE) 1257 clen = STRIPE_SIZE - page_offset; 1258 else 1259 clen = len; 1260 1261 if (clen > 0) { 1262 b_offset += bvl.bv_offset; 1263 bio_page = bvl.bv_page; 1264 if (frombio) { 1265 if (sh->raid_conf->skip_copy && 1266 b_offset == 0 && page_offset == 0 && 1267 clen == STRIPE_SIZE && 1268 !no_skipcopy) 1269 *page = bio_page; 1270 else 1271 tx = async_memcpy(*page, bio_page, page_offset, 1272 b_offset, clen, &submit); 1273 } else 1274 tx = async_memcpy(bio_page, *page, b_offset, 1275 page_offset, clen, &submit); 1276 } 1277 /* chain the operations */ 1278 submit.depend_tx = tx; 1279 1280 if (clen < len) /* hit end of page */ 1281 break; 1282 page_offset += len; 1283 } 1284 1285 return tx; 1286 } 1287 1288 static void ops_complete_biofill(void *stripe_head_ref) 1289 { 1290 struct stripe_head *sh = stripe_head_ref; 1291 int i; 1292 1293 pr_debug("%s: stripe %llu\n", __func__, 1294 (unsigned long long)sh->sector); 1295 1296 /* clear completed biofills */ 1297 for (i = sh->disks; i--; ) { 1298 struct r5dev *dev = &sh->dev[i]; 1299 1300 /* acknowledge completion of a biofill operation */ 1301 /* and check if we need to reply to a read request, 1302 * new R5_Wantfill requests are held off until 1303 * !STRIPE_BIOFILL_RUN 1304 */ 1305 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1306 struct bio *rbi, *rbi2; 1307 1308 BUG_ON(!dev->read); 1309 rbi = dev->read; 1310 dev->read = NULL; 1311 while (rbi && rbi->bi_iter.bi_sector < 1312 dev->sector + STRIPE_SECTORS) { 1313 rbi2 = r5_next_bio(rbi, 
dev->sector); 1314 bio_endio(rbi); 1315 rbi = rbi2; 1316 } 1317 } 1318 } 1319 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1320 1321 set_bit(STRIPE_HANDLE, &sh->state); 1322 raid5_release_stripe(sh); 1323 } 1324 1325 static void ops_run_biofill(struct stripe_head *sh) 1326 { 1327 struct dma_async_tx_descriptor *tx = NULL; 1328 struct async_submit_ctl submit; 1329 int i; 1330 1331 BUG_ON(sh->batch_head); 1332 pr_debug("%s: stripe %llu\n", __func__, 1333 (unsigned long long)sh->sector); 1334 1335 for (i = sh->disks; i--; ) { 1336 struct r5dev *dev = &sh->dev[i]; 1337 if (test_bit(R5_Wantfill, &dev->flags)) { 1338 struct bio *rbi; 1339 spin_lock_irq(&sh->stripe_lock); 1340 dev->read = rbi = dev->toread; 1341 dev->toread = NULL; 1342 spin_unlock_irq(&sh->stripe_lock); 1343 while (rbi && rbi->bi_iter.bi_sector < 1344 dev->sector + STRIPE_SECTORS) { 1345 tx = async_copy_data(0, rbi, &dev->page, 1346 dev->sector, tx, sh, 0); 1347 rbi = r5_next_bio(rbi, dev->sector); 1348 } 1349 } 1350 } 1351 1352 atomic_inc(&sh->count); 1353 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1354 async_trigger_callback(&submit); 1355 } 1356 1357 static void mark_target_uptodate(struct stripe_head *sh, int target) 1358 { 1359 struct r5dev *tgt; 1360 1361 if (target < 0) 1362 return; 1363 1364 tgt = &sh->dev[target]; 1365 set_bit(R5_UPTODATE, &tgt->flags); 1366 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1367 clear_bit(R5_Wantcompute, &tgt->flags); 1368 } 1369 1370 static void ops_complete_compute(void *stripe_head_ref) 1371 { 1372 struct stripe_head *sh = stripe_head_ref; 1373 1374 pr_debug("%s: stripe %llu\n", __func__, 1375 (unsigned long long)sh->sector); 1376 1377 /* mark the computed target(s) as uptodate */ 1378 mark_target_uptodate(sh, sh->ops.target); 1379 mark_target_uptodate(sh, sh->ops.target2); 1380 1381 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1382 if (sh->check_state == check_state_compute_run) 1383 sh->check_state = check_state_compute_result; 1384 set_bit(STRIPE_HANDLE, &sh->state); 1385 raid5_release_stripe(sh); 1386 } 1387 1388 /* return a pointer to the address conversion region of the scribble buffer */ 1389 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1390 struct raid5_percpu *percpu, int i) 1391 { 1392 void *addr; 1393 1394 addr = flex_array_get(percpu->scribble, i); 1395 return addr + sizeof(struct page *) * (sh->disks + 2); 1396 } 1397 1398 /* return a pointer to the address conversion region of the scribble buffer */ 1399 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1400 { 1401 void *addr; 1402 1403 addr = flex_array_get(percpu->scribble, i); 1404 return addr; 1405 } 1406 1407 static struct dma_async_tx_descriptor * 1408 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1409 { 1410 int disks = sh->disks; 1411 struct page **xor_srcs = to_addr_page(percpu, 0); 1412 int target = sh->ops.target; 1413 struct r5dev *tgt = &sh->dev[target]; 1414 struct page *xor_dest = tgt->page; 1415 int count = 0; 1416 struct dma_async_tx_descriptor *tx; 1417 struct async_submit_ctl submit; 1418 int i; 1419 1420 BUG_ON(sh->batch_head); 1421 1422 pr_debug("%s: stripe %llu block: %d\n", 1423 __func__, (unsigned long long)sh->sector, target); 1424 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1425 1426 for (i = disks; i--; ) 1427 if (i != target) 1428 xor_srcs[count++] = sh->dev[i].page; 1429 1430 atomic_inc(&sh->count); 1431 1432 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1433 
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1434 if (unlikely(count == 1)) 1435 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1436 else 1437 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1438 1439 return tx; 1440 } 1441 1442 /* set_syndrome_sources - populate source buffers for gen_syndrome 1443 * @srcs - (struct page *) array of size sh->disks 1444 * @sh - stripe_head to parse 1445 * 1446 * Populates srcs in proper layout order for the stripe and returns the 1447 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1448 * destination buffer is recorded in srcs[count] and the Q destination 1449 * is recorded in srcs[count+1]]. 1450 */ 1451 static int set_syndrome_sources(struct page **srcs, 1452 struct stripe_head *sh, 1453 int srctype) 1454 { 1455 int disks = sh->disks; 1456 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1457 int d0_idx = raid6_d0(sh); 1458 int count; 1459 int i; 1460 1461 for (i = 0; i < disks; i++) 1462 srcs[i] = NULL; 1463 1464 count = 0; 1465 i = d0_idx; 1466 do { 1467 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1468 struct r5dev *dev = &sh->dev[i]; 1469 1470 if (i == sh->qd_idx || i == sh->pd_idx || 1471 (srctype == SYNDROME_SRC_ALL) || 1472 (srctype == SYNDROME_SRC_WANT_DRAIN && 1473 (test_bit(R5_Wantdrain, &dev->flags) || 1474 test_bit(R5_InJournal, &dev->flags))) || 1475 (srctype == SYNDROME_SRC_WRITTEN && 1476 (dev->written || 1477 test_bit(R5_InJournal, &dev->flags)))) { 1478 if (test_bit(R5_InJournal, &dev->flags)) 1479 srcs[slot] = sh->dev[i].orig_page; 1480 else 1481 srcs[slot] = sh->dev[i].page; 1482 } 1483 i = raid6_next_disk(i, disks); 1484 } while (i != d0_idx); 1485 1486 return syndrome_disks; 1487 } 1488 1489 static struct dma_async_tx_descriptor * 1490 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1491 { 1492 int disks = sh->disks; 1493 struct page **blocks = to_addr_page(percpu, 0); 1494 int target; 1495 int qd_idx = sh->qd_idx; 1496 struct dma_async_tx_descriptor *tx; 1497 struct async_submit_ctl submit; 1498 struct r5dev *tgt; 1499 struct page *dest; 1500 int i; 1501 int count; 1502 1503 BUG_ON(sh->batch_head); 1504 if (sh->ops.target < 0) 1505 target = sh->ops.target2; 1506 else if (sh->ops.target2 < 0) 1507 target = sh->ops.target; 1508 else 1509 /* we should only have one valid target */ 1510 BUG(); 1511 BUG_ON(target < 0); 1512 pr_debug("%s: stripe %llu block: %d\n", 1513 __func__, (unsigned long long)sh->sector, target); 1514 1515 tgt = &sh->dev[target]; 1516 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1517 dest = tgt->page; 1518 1519 atomic_inc(&sh->count); 1520 1521 if (target == qd_idx) { 1522 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1523 blocks[count] = NULL; /* regenerating p is not necessary */ 1524 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1525 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1526 ops_complete_compute, sh, 1527 to_addr_conv(sh, percpu, 0)); 1528 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1529 } else { 1530 /* Compute any data- or p-drive using XOR */ 1531 count = 0; 1532 for (i = disks; i-- ; ) { 1533 if (i == target || i == qd_idx) 1534 continue; 1535 blocks[count++] = sh->dev[i].page; 1536 } 1537 1538 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1539 NULL, ops_complete_compute, sh, 1540 to_addr_conv(sh, percpu, 0)); 1541 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1542 } 1543 1544 
return tx; 1545 } 1546 1547 static struct dma_async_tx_descriptor * 1548 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1549 { 1550 int i, count, disks = sh->disks; 1551 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1552 int d0_idx = raid6_d0(sh); 1553 int faila = -1, failb = -1; 1554 int target = sh->ops.target; 1555 int target2 = sh->ops.target2; 1556 struct r5dev *tgt = &sh->dev[target]; 1557 struct r5dev *tgt2 = &sh->dev[target2]; 1558 struct dma_async_tx_descriptor *tx; 1559 struct page **blocks = to_addr_page(percpu, 0); 1560 struct async_submit_ctl submit; 1561 1562 BUG_ON(sh->batch_head); 1563 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1564 __func__, (unsigned long long)sh->sector, target, target2); 1565 BUG_ON(target < 0 || target2 < 0); 1566 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1567 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1568 1569 /* we need to open-code set_syndrome_sources to handle the 1570 * slot number conversion for 'faila' and 'failb' 1571 */ 1572 for (i = 0; i < disks ; i++) 1573 blocks[i] = NULL; 1574 count = 0; 1575 i = d0_idx; 1576 do { 1577 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1578 1579 blocks[slot] = sh->dev[i].page; 1580 1581 if (i == target) 1582 faila = slot; 1583 if (i == target2) 1584 failb = slot; 1585 i = raid6_next_disk(i, disks); 1586 } while (i != d0_idx); 1587 1588 BUG_ON(faila == failb); 1589 if (failb < faila) 1590 swap(faila, failb); 1591 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1592 __func__, (unsigned long long)sh->sector, faila, failb); 1593 1594 atomic_inc(&sh->count); 1595 1596 if (failb == syndrome_disks+1) { 1597 /* Q disk is one of the missing disks */ 1598 if (faila == syndrome_disks) { 1599 /* Missing P+Q, just recompute */ 1600 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1601 ops_complete_compute, sh, 1602 to_addr_conv(sh, percpu, 0)); 1603 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1604 STRIPE_SIZE, &submit); 1605 } else { 1606 struct page *dest; 1607 int data_target; 1608 int qd_idx = sh->qd_idx; 1609 1610 /* Missing D+Q: recompute D from P, then recompute Q */ 1611 if (target == qd_idx) 1612 data_target = target2; 1613 else 1614 data_target = target; 1615 1616 count = 0; 1617 for (i = disks; i-- ; ) { 1618 if (i == data_target || i == qd_idx) 1619 continue; 1620 blocks[count++] = sh->dev[i].page; 1621 } 1622 dest = sh->dev[data_target].page; 1623 init_async_submit(&submit, 1624 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1625 NULL, NULL, NULL, 1626 to_addr_conv(sh, percpu, 0)); 1627 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1628 &submit); 1629 1630 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1631 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1632 ops_complete_compute, sh, 1633 to_addr_conv(sh, percpu, 0)); 1634 return async_gen_syndrome(blocks, 0, count+2, 1635 STRIPE_SIZE, &submit); 1636 } 1637 } else { 1638 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1639 ops_complete_compute, sh, 1640 to_addr_conv(sh, percpu, 0)); 1641 if (failb == syndrome_disks) { 1642 /* We're missing D+P. */ 1643 return async_raid6_datap_recov(syndrome_disks+2, 1644 STRIPE_SIZE, faila, 1645 blocks, &submit); 1646 } else { 1647 /* We're missing D+D. 
*/ 1648 return async_raid6_2data_recov(syndrome_disks+2, 1649 STRIPE_SIZE, faila, failb, 1650 blocks, &submit); 1651 } 1652 } 1653 } 1654 1655 static void ops_complete_prexor(void *stripe_head_ref) 1656 { 1657 struct stripe_head *sh = stripe_head_ref; 1658 1659 pr_debug("%s: stripe %llu\n", __func__, 1660 (unsigned long long)sh->sector); 1661 1662 if (r5c_is_writeback(sh->raid_conf->log)) 1663 /* 1664 * raid5-cache write back uses orig_page during prexor. 1665 * After prexor, it is time to free orig_page 1666 */ 1667 r5c_release_extra_page(sh); 1668 } 1669 1670 static struct dma_async_tx_descriptor * 1671 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1672 struct dma_async_tx_descriptor *tx) 1673 { 1674 int disks = sh->disks; 1675 struct page **xor_srcs = to_addr_page(percpu, 0); 1676 int count = 0, pd_idx = sh->pd_idx, i; 1677 struct async_submit_ctl submit; 1678 1679 /* existing parity data subtracted */ 1680 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1681 1682 BUG_ON(sh->batch_head); 1683 pr_debug("%s: stripe %llu\n", __func__, 1684 (unsigned long long)sh->sector); 1685 1686 for (i = disks; i--; ) { 1687 struct r5dev *dev = &sh->dev[i]; 1688 /* Only process blocks that are known to be uptodate */ 1689 if (test_bit(R5_InJournal, &dev->flags)) 1690 xor_srcs[count++] = dev->orig_page; 1691 else if (test_bit(R5_Wantdrain, &dev->flags)) 1692 xor_srcs[count++] = dev->page; 1693 } 1694 1695 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1696 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1697 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1698 1699 return tx; 1700 } 1701 1702 static struct dma_async_tx_descriptor * 1703 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1704 struct dma_async_tx_descriptor *tx) 1705 { 1706 struct page **blocks = to_addr_page(percpu, 0); 1707 int count; 1708 struct async_submit_ctl submit; 1709 1710 pr_debug("%s: stripe %llu\n", __func__, 1711 (unsigned long long)sh->sector); 1712 1713 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1714 1715 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1716 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1717 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1718 1719 return tx; 1720 } 1721 1722 static struct dma_async_tx_descriptor * 1723 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1724 { 1725 struct r5conf *conf = sh->raid_conf; 1726 int disks = sh->disks; 1727 int i; 1728 struct stripe_head *head_sh = sh; 1729 1730 pr_debug("%s: stripe %llu\n", __func__, 1731 (unsigned long long)sh->sector); 1732 1733 for (i = disks; i--; ) { 1734 struct r5dev *dev; 1735 struct bio *chosen; 1736 1737 sh = head_sh; 1738 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1739 struct bio *wbi; 1740 1741 again: 1742 dev = &sh->dev[i]; 1743 /* 1744 * clear R5_InJournal, so when rewriting a page in 1745 * journal, it is not skipped by r5l_log_stripe() 1746 */ 1747 clear_bit(R5_InJournal, &dev->flags); 1748 spin_lock_irq(&sh->stripe_lock); 1749 chosen = dev->towrite; 1750 dev->towrite = NULL; 1751 sh->overwrite_disks = 0; 1752 BUG_ON(dev->written); 1753 wbi = dev->written = chosen; 1754 spin_unlock_irq(&sh->stripe_lock); 1755 WARN_ON(dev->page != dev->orig_page); 1756 1757 while (wbi && wbi->bi_iter.bi_sector < 1758 dev->sector + STRIPE_SECTORS) { 1759 if (wbi->bi_opf & REQ_FUA) 1760 set_bit(R5_WantFUA, &dev->flags); 1761 if 
(wbi->bi_opf & REQ_SYNC) 1762 set_bit(R5_SyncIO, &dev->flags); 1763 if (bio_op(wbi) == REQ_OP_DISCARD) 1764 set_bit(R5_Discard, &dev->flags); 1765 else { 1766 tx = async_copy_data(1, wbi, &dev->page, 1767 dev->sector, tx, sh, 1768 r5c_is_writeback(conf->log)); 1769 if (dev->page != dev->orig_page && 1770 !r5c_is_writeback(conf->log)) { 1771 set_bit(R5_SkipCopy, &dev->flags); 1772 clear_bit(R5_UPTODATE, &dev->flags); 1773 clear_bit(R5_OVERWRITE, &dev->flags); 1774 } 1775 } 1776 wbi = r5_next_bio(wbi, dev->sector); 1777 } 1778 1779 if (head_sh->batch_head) { 1780 sh = list_first_entry(&sh->batch_list, 1781 struct stripe_head, 1782 batch_list); 1783 if (sh == head_sh) 1784 continue; 1785 goto again; 1786 } 1787 } 1788 } 1789 1790 return tx; 1791 } 1792 1793 static void ops_complete_reconstruct(void *stripe_head_ref) 1794 { 1795 struct stripe_head *sh = stripe_head_ref; 1796 int disks = sh->disks; 1797 int pd_idx = sh->pd_idx; 1798 int qd_idx = sh->qd_idx; 1799 int i; 1800 bool fua = false, sync = false, discard = false; 1801 1802 pr_debug("%s: stripe %llu\n", __func__, 1803 (unsigned long long)sh->sector); 1804 1805 for (i = disks; i--; ) { 1806 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1807 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1808 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1809 } 1810 1811 for (i = disks; i--; ) { 1812 struct r5dev *dev = &sh->dev[i]; 1813 1814 if (dev->written || i == pd_idx || i == qd_idx) { 1815 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1816 set_bit(R5_UPTODATE, &dev->flags); 1817 if (fua) 1818 set_bit(R5_WantFUA, &dev->flags); 1819 if (sync) 1820 set_bit(R5_SyncIO, &dev->flags); 1821 } 1822 } 1823 1824 if (sh->reconstruct_state == reconstruct_state_drain_run) 1825 sh->reconstruct_state = reconstruct_state_drain_result; 1826 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1827 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1828 else { 1829 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1830 sh->reconstruct_state = reconstruct_state_result; 1831 } 1832 1833 set_bit(STRIPE_HANDLE, &sh->state); 1834 raid5_release_stripe(sh); 1835 } 1836 1837 static void 1838 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1839 struct dma_async_tx_descriptor *tx) 1840 { 1841 int disks = sh->disks; 1842 struct page **xor_srcs; 1843 struct async_submit_ctl submit; 1844 int count, pd_idx = sh->pd_idx, i; 1845 struct page *xor_dest; 1846 int prexor = 0; 1847 unsigned long flags; 1848 int j = 0; 1849 struct stripe_head *head_sh = sh; 1850 int last_stripe; 1851 1852 pr_debug("%s: stripe %llu\n", __func__, 1853 (unsigned long long)sh->sector); 1854 1855 for (i = 0; i < sh->disks; i++) { 1856 if (pd_idx == i) 1857 continue; 1858 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1859 break; 1860 } 1861 if (i >= sh->disks) { 1862 atomic_inc(&sh->count); 1863 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1864 ops_complete_reconstruct(sh); 1865 return; 1866 } 1867 again: 1868 count = 0; 1869 xor_srcs = to_addr_page(percpu, j); 1870 /* check if prexor is active which means only process blocks 1871 * that are part of a read-modify-write (written) 1872 */ 1873 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1874 prexor = 1; 1875 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1876 for (i = disks; i--; ) { 1877 struct r5dev *dev = &sh->dev[i]; 1878 if (head_sh->dev[i].written || 1879 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1880 xor_srcs[count++] = dev->page; 1881 } 
1882 } else { 1883 xor_dest = sh->dev[pd_idx].page; 1884 for (i = disks; i--; ) { 1885 struct r5dev *dev = &sh->dev[i]; 1886 if (i != pd_idx) 1887 xor_srcs[count++] = dev->page; 1888 } 1889 } 1890 1891 /* 1/ if we prexor'd then the dest is reused as a source 1892 * 2/ if we did not prexor then we are redoing the parity 1893 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1894 * for the synchronous xor case 1895 */ 1896 last_stripe = !head_sh->batch_head || 1897 list_first_entry(&sh->batch_list, 1898 struct stripe_head, batch_list) == head_sh; 1899 if (last_stripe) { 1900 flags = ASYNC_TX_ACK | 1901 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1902 1903 atomic_inc(&head_sh->count); 1904 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1905 to_addr_conv(sh, percpu, j)); 1906 } else { 1907 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1908 init_async_submit(&submit, flags, tx, NULL, NULL, 1909 to_addr_conv(sh, percpu, j)); 1910 } 1911 1912 if (unlikely(count == 1)) 1913 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1914 else 1915 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1916 if (!last_stripe) { 1917 j++; 1918 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1919 batch_list); 1920 goto again; 1921 } 1922 } 1923 1924 static void 1925 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1926 struct dma_async_tx_descriptor *tx) 1927 { 1928 struct async_submit_ctl submit; 1929 struct page **blocks; 1930 int count, i, j = 0; 1931 struct stripe_head *head_sh = sh; 1932 int last_stripe; 1933 int synflags; 1934 unsigned long txflags; 1935 1936 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1937 1938 for (i = 0; i < sh->disks; i++) { 1939 if (sh->pd_idx == i || sh->qd_idx == i) 1940 continue; 1941 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1942 break; 1943 } 1944 if (i >= sh->disks) { 1945 atomic_inc(&sh->count); 1946 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1947 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1948 ops_complete_reconstruct(sh); 1949 return; 1950 } 1951 1952 again: 1953 blocks = to_addr_page(percpu, j); 1954 1955 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1956 synflags = SYNDROME_SRC_WRITTEN; 1957 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1958 } else { 1959 synflags = SYNDROME_SRC_ALL; 1960 txflags = ASYNC_TX_ACK; 1961 } 1962 1963 count = set_syndrome_sources(blocks, sh, synflags); 1964 last_stripe = !head_sh->batch_head || 1965 list_first_entry(&sh->batch_list, 1966 struct stripe_head, batch_list) == head_sh; 1967 1968 if (last_stripe) { 1969 atomic_inc(&head_sh->count); 1970 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1971 head_sh, to_addr_conv(sh, percpu, j)); 1972 } else 1973 init_async_submit(&submit, 0, tx, NULL, NULL, 1974 to_addr_conv(sh, percpu, j)); 1975 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1976 if (!last_stripe) { 1977 j++; 1978 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1979 batch_list); 1980 goto again; 1981 } 1982 } 1983 1984 static void ops_complete_check(void *stripe_head_ref) 1985 { 1986 struct stripe_head *sh = stripe_head_ref; 1987 1988 pr_debug("%s: stripe %llu\n", __func__, 1989 (unsigned long long)sh->sector); 1990 1991 sh->check_state = check_state_check_result; 1992 set_bit(STRIPE_HANDLE, &sh->state); 1993 raid5_release_stripe(sh); 1994 } 1995 1996 static void ops_run_check_p(struct 
stripe_head *sh, struct raid5_percpu *percpu) 1997 { 1998 int disks = sh->disks; 1999 int pd_idx = sh->pd_idx; 2000 int qd_idx = sh->qd_idx; 2001 struct page *xor_dest; 2002 struct page **xor_srcs = to_addr_page(percpu, 0); 2003 struct dma_async_tx_descriptor *tx; 2004 struct async_submit_ctl submit; 2005 int count; 2006 int i; 2007 2008 pr_debug("%s: stripe %llu\n", __func__, 2009 (unsigned long long)sh->sector); 2010 2011 BUG_ON(sh->batch_head); 2012 count = 0; 2013 xor_dest = sh->dev[pd_idx].page; 2014 xor_srcs[count++] = xor_dest; 2015 for (i = disks; i--; ) { 2016 if (i == pd_idx || i == qd_idx) 2017 continue; 2018 xor_srcs[count++] = sh->dev[i].page; 2019 } 2020 2021 init_async_submit(&submit, 0, NULL, NULL, NULL, 2022 to_addr_conv(sh, percpu, 0)); 2023 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 2024 &sh->ops.zero_sum_result, &submit); 2025 2026 atomic_inc(&sh->count); 2027 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2028 tx = async_trigger_callback(&submit); 2029 } 2030 2031 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2032 { 2033 struct page **srcs = to_addr_page(percpu, 0); 2034 struct async_submit_ctl submit; 2035 int count; 2036 2037 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2038 (unsigned long long)sh->sector, checkp); 2039 2040 BUG_ON(sh->batch_head); 2041 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 2042 if (!checkp) 2043 srcs[count] = NULL; 2044 2045 atomic_inc(&sh->count); 2046 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2047 sh, to_addr_conv(sh, percpu, 0)); 2048 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 2049 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 2050 } 2051 2052 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2053 { 2054 int overlap_clear = 0, i, disks = sh->disks; 2055 struct dma_async_tx_descriptor *tx = NULL; 2056 struct r5conf *conf = sh->raid_conf; 2057 int level = conf->level; 2058 struct raid5_percpu *percpu; 2059 unsigned long cpu; 2060 2061 cpu = get_cpu(); 2062 percpu = per_cpu_ptr(conf->percpu, cpu); 2063 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2064 ops_run_biofill(sh); 2065 overlap_clear++; 2066 } 2067 2068 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2069 if (level < 6) 2070 tx = ops_run_compute5(sh, percpu); 2071 else { 2072 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2073 tx = ops_run_compute6_1(sh, percpu); 2074 else 2075 tx = ops_run_compute6_2(sh, percpu); 2076 } 2077 /* terminate the chain if reconstruct is not set to be run */ 2078 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2079 async_tx_ack(tx); 2080 } 2081 2082 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2083 if (level < 6) 2084 tx = ops_run_prexor5(sh, percpu, tx); 2085 else 2086 tx = ops_run_prexor6(sh, percpu, tx); 2087 } 2088 2089 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2090 tx = ops_run_partial_parity(sh, percpu, tx); 2091 2092 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2093 tx = ops_run_biodrain(sh, tx); 2094 overlap_clear++; 2095 } 2096 2097 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2098 if (level < 6) 2099 ops_run_reconstruct5(sh, percpu, tx); 2100 else 2101 ops_run_reconstruct6(sh, percpu, tx); 2102 } 2103 2104 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2105 if (sh->check_state == check_state_run) 2106 ops_run_check_p(sh, percpu); 2107 else if (sh->check_state == check_state_run_q) 2108 ops_run_check_pq(sh, percpu, 0); 
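/* checkp==0 above verifies Q only;
 * checkp==1 below verifies both P and Q */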
2109 else if (sh->check_state == check_state_run_pq) 2110 ops_run_check_pq(sh, percpu, 1); 2111 else 2112 BUG(); 2113 } 2114 2115 if (overlap_clear && !sh->batch_head) 2116 for (i = disks; i--; ) { 2117 struct r5dev *dev = &sh->dev[i]; 2118 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2119 wake_up(&sh->raid_conf->wait_for_overlap); 2120 } 2121 put_cpu(); 2122 } 2123 2124 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2125 { 2126 if (sh->ppl_page) 2127 __free_page(sh->ppl_page); 2128 kmem_cache_free(sc, sh); 2129 } 2130 2131 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2132 int disks, struct r5conf *conf) 2133 { 2134 struct stripe_head *sh; 2135 int i; 2136 2137 sh = kmem_cache_zalloc(sc, gfp); 2138 if (sh) { 2139 spin_lock_init(&sh->stripe_lock); 2140 spin_lock_init(&sh->batch_lock); 2141 INIT_LIST_HEAD(&sh->batch_list); 2142 INIT_LIST_HEAD(&sh->lru); 2143 INIT_LIST_HEAD(&sh->r5c); 2144 INIT_LIST_HEAD(&sh->log_list); 2145 atomic_set(&sh->count, 1); 2146 sh->raid_conf = conf; 2147 sh->log_start = MaxSector; 2148 for (i = 0; i < disks; i++) { 2149 struct r5dev *dev = &sh->dev[i]; 2150 2151 bio_init(&dev->req, &dev->vec, 1); 2152 bio_init(&dev->rreq, &dev->rvec, 1); 2153 } 2154 2155 if (raid5_has_ppl(conf)) { 2156 sh->ppl_page = alloc_page(gfp); 2157 if (!sh->ppl_page) { 2158 free_stripe(sc, sh); 2159 sh = NULL; 2160 } 2161 } 2162 } 2163 return sh; 2164 } 2165 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2166 { 2167 struct stripe_head *sh; 2168 2169 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); 2170 if (!sh) 2171 return 0; 2172 2173 if (grow_buffers(sh, gfp)) { 2174 shrink_buffers(sh); 2175 free_stripe(conf->slab_cache, sh); 2176 return 0; 2177 } 2178 sh->hash_lock_index = 2179 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2180 /* we just created an active stripe so... */ 2181 atomic_inc(&conf->active_stripes); 2182 2183 raid5_release_stripe(sh); 2184 conf->max_nr_stripes++; 2185 return 1; 2186 } 2187 2188 static int grow_stripes(struct r5conf *conf, int num) 2189 { 2190 struct kmem_cache *sc; 2191 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2192 2193 if (conf->mddev->gendisk) 2194 sprintf(conf->cache_name[0], 2195 "raid%d-%s", conf->level, mdname(conf->mddev)); 2196 else 2197 sprintf(conf->cache_name[0], 2198 "raid%d-%p", conf->level, conf->mddev); 2199 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2200 2201 conf->active_name = 0; 2202 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2203 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2204 0, 0, NULL); 2205 if (!sc) 2206 return 1; 2207 conf->slab_cache = sc; 2208 conf->pool_size = devs; 2209 while (num--) 2210 if (!grow_one_stripe(conf, GFP_KERNEL)) 2211 return 1; 2212 2213 return 0; 2214 } 2215 2216 /** 2217 * scribble_len - return the required size of the scribble region 2218 * @num - total number of disks in the array 2219 * 2220 * The size must be enough to contain: 2221 * 1/ a struct page pointer for each device in the array +2 2222 * 2/ room to convert each entry in (1) to its corresponding dma 2223 * (dma_map_page()) or page (page_address()) address. 2224 * 2225 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2226 * calculate over all devices (not just the data blocks), using zeros in place 2227 * of the P and Q blocks. 
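*
* As a rough example: on a typical 64-bit build where both a struct
* page pointer and an addr_conv_t are 8 bytes, a hypothetical array
* with num == 8 devices needs (8 + 2) * (8 + 8) = 160 bytes per
* stripe handled at once; scribble_alloc() below sizes each
* flex_array element exactly this way.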
2228 */ 2229 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2230 { 2231 struct flex_array *ret; 2232 size_t len; 2233 2234 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2235 ret = flex_array_alloc(len, cnt, flags); 2236 if (!ret) 2237 return NULL; 2238 /* always prealloc all elements, so no locking is required */ 2239 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2240 flex_array_free(ret); 2241 return NULL; 2242 } 2243 return ret; 2244 } 2245 2246 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2247 { 2248 unsigned long cpu; 2249 int err = 0; 2250 2251 /* 2252 * Never shrink. And mddev_suspend() could deadlock if this is called 2253 * from raid5d. In that case, scribble_disks and scribble_sectors 2254 * should equal to new_disks and new_sectors 2255 */ 2256 if (conf->scribble_disks >= new_disks && 2257 conf->scribble_sectors >= new_sectors) 2258 return 0; 2259 mddev_suspend(conf->mddev); 2260 get_online_cpus(); 2261 for_each_present_cpu(cpu) { 2262 struct raid5_percpu *percpu; 2263 struct flex_array *scribble; 2264 2265 percpu = per_cpu_ptr(conf->percpu, cpu); 2266 scribble = scribble_alloc(new_disks, 2267 new_sectors / STRIPE_SECTORS, 2268 GFP_NOIO); 2269 2270 if (scribble) { 2271 flex_array_free(percpu->scribble); 2272 percpu->scribble = scribble; 2273 } else { 2274 err = -ENOMEM; 2275 break; 2276 } 2277 } 2278 put_online_cpus(); 2279 mddev_resume(conf->mddev); 2280 if (!err) { 2281 conf->scribble_disks = new_disks; 2282 conf->scribble_sectors = new_sectors; 2283 } 2284 return err; 2285 } 2286 2287 static int resize_stripes(struct r5conf *conf, int newsize) 2288 { 2289 /* Make all the stripes able to hold 'newsize' devices. 2290 * New slots in each stripe get 'page' set to a new page. 2291 * 2292 * This happens in stages: 2293 * 1/ create a new kmem_cache and allocate the required number of 2294 * stripe_heads. 2295 * 2/ gather all the old stripe_heads and transfer the pages across 2296 * to the new stripe_heads. This will have the side effect of 2297 * freezing the array as once all stripe_heads have been collected, 2298 * no IO will be possible. Old stripe heads are freed once their 2299 * pages have been transferred over, and the old kmem_cache is 2300 * freed when all stripes are done. 2301 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 2302 * we simple return a failure status - no need to clean anything up. 2303 * 4/ allocate new pages for the new slots in the new stripe_heads. 2304 * If this fails, we don't bother trying the shrink the 2305 * stripe_heads down again, we just leave them as they are. 2306 * As each stripe_head is processed the new one is released into 2307 * active service. 2308 * 2309 * Once step2 is started, we cannot afford to wait for a write, 2310 * so we use GFP_NOIO allocations. 
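*
* (GFP_NOIO matters because, once we start hoarding stripe_heads,
* the array cannot make progress; an allocation that entered reclaim
* and issued writeback through this same array would deadlock
* waiting for a stripe we are holding.)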
2311 */ 2312 struct stripe_head *osh, *nsh; 2313 LIST_HEAD(newstripes); 2314 struct disk_info *ndisks; 2315 int err; 2316 struct kmem_cache *sc; 2317 int i; 2318 int hash, cnt; 2319 2320 err = md_allow_write(conf->mddev); 2321 if (err) 2322 return err; 2323 2324 /* Step 1 */ 2325 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2326 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2327 0, 0, NULL); 2328 if (!sc) 2329 return -ENOMEM; 2330 2331 /* Need to ensure auto-resizing doesn't interfere */ 2332 mutex_lock(&conf->cache_size_mutex); 2333 2334 for (i = conf->max_nr_stripes; i; i--) { 2335 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); 2336 if (!nsh) 2337 break; 2338 2339 list_add(&nsh->lru, &newstripes); 2340 } 2341 if (i) { 2342 /* didn't get enough, give up */ 2343 while (!list_empty(&newstripes)) { 2344 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2345 list_del(&nsh->lru); 2346 free_stripe(sc, nsh); 2347 } 2348 kmem_cache_destroy(sc); 2349 mutex_unlock(&conf->cache_size_mutex); 2350 return -ENOMEM; 2351 } 2352 /* Step 2 - Must use GFP_NOIO now. 2353 * OK, we have enough stripes, start collecting inactive 2354 * stripes and copying them over 2355 */ 2356 hash = 0; 2357 cnt = 0; 2358 list_for_each_entry(nsh, &newstripes, lru) { 2359 lock_device_hash_lock(conf, hash); 2360 wait_event_cmd(conf->wait_for_stripe, 2361 !list_empty(conf->inactive_list + hash), 2362 unlock_device_hash_lock(conf, hash), 2363 lock_device_hash_lock(conf, hash)); 2364 osh = get_free_stripe(conf, hash); 2365 unlock_device_hash_lock(conf, hash); 2366 2367 for(i=0; i<conf->pool_size; i++) { 2368 nsh->dev[i].page = osh->dev[i].page; 2369 nsh->dev[i].orig_page = osh->dev[i].page; 2370 } 2371 nsh->hash_lock_index = hash; 2372 free_stripe(conf->slab_cache, osh); 2373 cnt++; 2374 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2375 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2376 hash++; 2377 cnt = 0; 2378 } 2379 } 2380 kmem_cache_destroy(conf->slab_cache); 2381 2382 /* Step 3. 
2383 * At this point, we are holding all the stripes so the array 2384 * is completely stalled, so now is a good time to resize 2385 * conf->disks and the scribble region 2386 */ 2387 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2388 if (ndisks) { 2389 for (i = 0; i < conf->pool_size; i++) 2390 ndisks[i] = conf->disks[i]; 2391 2392 for (i = conf->pool_size; i < newsize; i++) { 2393 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2394 if (!ndisks[i].extra_page) 2395 err = -ENOMEM; 2396 } 2397 2398 if (err) { 2399 for (i = conf->pool_size; i < newsize; i++) 2400 if (ndisks[i].extra_page) 2401 put_page(ndisks[i].extra_page); 2402 kfree(ndisks); 2403 } else { 2404 kfree(conf->disks); 2405 conf->disks = ndisks; 2406 } 2407 } else 2408 err = -ENOMEM; 2409 2410 mutex_unlock(&conf->cache_size_mutex); 2411 2412 conf->slab_cache = sc; 2413 conf->active_name = 1-conf->active_name; 2414 2415 /* Step 4, return new stripes to service */ 2416 while(!list_empty(&newstripes)) { 2417 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2418 list_del_init(&nsh->lru); 2419 2420 for (i=conf->raid_disks; i < newsize; i++) 2421 if (nsh->dev[i].page == NULL) { 2422 struct page *p = alloc_page(GFP_NOIO); 2423 nsh->dev[i].page = p; 2424 nsh->dev[i].orig_page = p; 2425 if (!p) 2426 err = -ENOMEM; 2427 } 2428 raid5_release_stripe(nsh); 2429 } 2430 /* critical section pass, GFP_NOIO no longer needed */ 2431 2432 if (!err) 2433 conf->pool_size = newsize; 2434 return err; 2435 } 2436 2437 static int drop_one_stripe(struct r5conf *conf) 2438 { 2439 struct stripe_head *sh; 2440 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2441 2442 spin_lock_irq(conf->hash_locks + hash); 2443 sh = get_free_stripe(conf, hash); 2444 spin_unlock_irq(conf->hash_locks + hash); 2445 if (!sh) 2446 return 0; 2447 BUG_ON(atomic_read(&sh->count)); 2448 shrink_buffers(sh); 2449 free_stripe(conf->slab_cache, sh); 2450 atomic_dec(&conf->active_stripes); 2451 conf->max_nr_stripes--; 2452 return 1; 2453 } 2454 2455 static void shrink_stripes(struct r5conf *conf) 2456 { 2457 while (conf->max_nr_stripes && 2458 drop_one_stripe(conf)) 2459 ; 2460 2461 kmem_cache_destroy(conf->slab_cache); 2462 conf->slab_cache = NULL; 2463 } 2464 2465 static void raid5_end_read_request(struct bio * bi) 2466 { 2467 struct stripe_head *sh = bi->bi_private; 2468 struct r5conf *conf = sh->raid_conf; 2469 int disks = sh->disks, i; 2470 char b[BDEVNAME_SIZE]; 2471 struct md_rdev *rdev = NULL; 2472 sector_t s; 2473 2474 for (i=0 ; i<disks; i++) 2475 if (bi == &sh->dev[i].req) 2476 break; 2477 2478 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2479 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2480 bi->bi_error); 2481 if (i == disks) { 2482 bio_reset(bi); 2483 BUG(); 2484 return; 2485 } 2486 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2487 /* If replacement finished while this request was outstanding, 2488 * 'replacement' might be NULL already. 2489 * In that case it moved down to 'rdev'. 2490 * rdev is not removed until all requests are finished. 2491 */ 2492 rdev = conf->disks[i].replacement; 2493 if (!rdev) 2494 rdev = conf->disks[i].rdev; 2495 2496 if (use_new_offset(conf, sh)) 2497 s = sh->sector + rdev->new_data_offset; 2498 else 2499 s = sh->sector + rdev->data_offset; 2500 if (!bi->bi_error) { 2501 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2502 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2503 /* Note that this cannot happen on a 2504 * replacement device. 
We just fail those on 2505 * any error 2506 */ 2507 pr_info_ratelimited( 2508 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2509 mdname(conf->mddev), STRIPE_SECTORS, 2510 (unsigned long long)s, 2511 bdevname(rdev->bdev, b)); 2512 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2513 clear_bit(R5_ReadError, &sh->dev[i].flags); 2514 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2515 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2516 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2517 2518 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2519 /* 2520 * end read for a page in journal, this 2521 * must be preparing for prexor in rmw 2522 */ 2523 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2524 2525 if (atomic_read(&rdev->read_errors)) 2526 atomic_set(&rdev->read_errors, 0); 2527 } else { 2528 const char *bdn = bdevname(rdev->bdev, b); 2529 int retry = 0; 2530 int set_bad = 0; 2531 2532 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2533 atomic_inc(&rdev->read_errors); 2534 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2535 pr_warn_ratelimited( 2536 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2537 mdname(conf->mddev), 2538 (unsigned long long)s, 2539 bdn); 2540 else if (conf->mddev->degraded >= conf->max_degraded) { 2541 set_bad = 1; 2542 pr_warn_ratelimited( 2543 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2544 mdname(conf->mddev), 2545 (unsigned long long)s, 2546 bdn); 2547 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2548 /* Oh, no!!! */ 2549 set_bad = 1; 2550 pr_warn_ratelimited( 2551 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", 2552 mdname(conf->mddev), 2553 (unsigned long long)s, 2554 bdn); 2555 } else if (atomic_read(&rdev->read_errors) 2556 > conf->max_nr_stripes) 2557 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2558 mdname(conf->mddev), bdn); 2559 else 2560 retry = 1; 2561 if (set_bad && test_bit(In_sync, &rdev->flags) 2562 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2563 retry = 1; 2564 if (retry) 2565 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2566 set_bit(R5_ReadError, &sh->dev[i].flags); 2567 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2568 } else 2569 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2570 else { 2571 clear_bit(R5_ReadError, &sh->dev[i].flags); 2572 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2573 if (!(set_bad 2574 && test_bit(In_sync, &rdev->flags) 2575 && rdev_set_badblocks( 2576 rdev, sh->sector, STRIPE_SECTORS, 0))) 2577 md_error(conf->mddev, rdev); 2578 } 2579 } 2580 rdev_dec_pending(rdev, conf->mddev); 2581 bio_reset(bi); 2582 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2583 set_bit(STRIPE_HANDLE, &sh->state); 2584 raid5_release_stripe(sh); 2585 } 2586 2587 static void raid5_end_write_request(struct bio *bi) 2588 { 2589 struct stripe_head *sh = bi->bi_private; 2590 struct r5conf *conf = sh->raid_conf; 2591 int disks = sh->disks, i; 2592 struct md_rdev *uninitialized_var(rdev); 2593 sector_t first_bad; 2594 int bad_sectors; 2595 int replacement = 0; 2596 2597 for (i = 0 ; i < disks; i++) { 2598 if (bi == &sh->dev[i].req) { 2599 rdev = conf->disks[i].rdev; 2600 break; 2601 } 2602 if (bi == &sh->dev[i].rreq) { 2603 rdev = conf->disks[i].replacement; 2604 if (rdev) 2605 replacement = 1; 2606 else 2607 /* rdev was removed and 'replacement' 2608 * replaced it. rdev is not removed 2609 * until all requests are finished. 
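* This mirrors the R5_ReadRepl handling in
* raid5_end_read_request() above.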
2610 */ 2611 rdev = conf->disks[i].rdev; 2612 break; 2613 } 2614 } 2615 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2616 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2617 bi->bi_error); 2618 if (i == disks) { 2619 bio_reset(bi); 2620 BUG(); 2621 return; 2622 } 2623 2624 if (replacement) { 2625 if (bi->bi_error) 2626 md_error(conf->mddev, rdev); 2627 else if (is_badblock(rdev, sh->sector, 2628 STRIPE_SECTORS, 2629 &first_bad, &bad_sectors)) 2630 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2631 } else { 2632 if (bi->bi_error) { 2633 set_bit(STRIPE_DEGRADED, &sh->state); 2634 set_bit(WriteErrorSeen, &rdev->flags); 2635 set_bit(R5_WriteError, &sh->dev[i].flags); 2636 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2637 set_bit(MD_RECOVERY_NEEDED, 2638 &rdev->mddev->recovery); 2639 } else if (is_badblock(rdev, sh->sector, 2640 STRIPE_SECTORS, 2641 &first_bad, &bad_sectors)) { 2642 set_bit(R5_MadeGood, &sh->dev[i].flags); 2643 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2644 /* That was a successful write so make 2645 * sure it looks like we already did 2646 * a re-write. 2647 */ 2648 set_bit(R5_ReWrite, &sh->dev[i].flags); 2649 } 2650 } 2651 rdev_dec_pending(rdev, conf->mddev); 2652 2653 if (sh->batch_head && bi->bi_error && !replacement) 2654 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2655 2656 bio_reset(bi); 2657 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2658 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2659 set_bit(STRIPE_HANDLE, &sh->state); 2660 raid5_release_stripe(sh); 2661 2662 if (sh->batch_head && sh != sh->batch_head) 2663 raid5_release_stripe(sh->batch_head); 2664 } 2665 2666 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2667 { 2668 struct r5dev *dev = &sh->dev[i]; 2669 2670 dev->flags = 0; 2671 dev->sector = raid5_compute_blocknr(sh, i, previous); 2672 } 2673 2674 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2675 { 2676 char b[BDEVNAME_SIZE]; 2677 struct r5conf *conf = mddev->private; 2678 unsigned long flags; 2679 pr_debug("raid456: error called\n"); 2680 2681 spin_lock_irqsave(&conf->device_lock, flags); 2682 clear_bit(In_sync, &rdev->flags); 2683 mddev->degraded = raid5_calc_degraded(conf); 2684 spin_unlock_irqrestore(&conf->device_lock, flags); 2685 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2686 2687 set_bit(Blocked, &rdev->flags); 2688 set_bit(Faulty, &rdev->flags); 2689 set_mask_bits(&mddev->sb_flags, 0, 2690 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2691 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2692 "md/raid:%s: Operation continuing on %d devices.\n", 2693 mdname(mddev), 2694 bdevname(rdev->bdev, b), 2695 mdname(mddev), 2696 conf->raid_disks - mddev->degraded); 2697 r5c_update_on_rdev_error(mddev); 2698 } 2699 2700 /* 2701 * Input: a 'big' sector number, 2702 * Output: index of the data and parity disk, and the sector # in them. 2703 */ 2704 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2705 int previous, int *dd_idx, 2706 struct stripe_head *sh) 2707 { 2708 sector_t stripe, stripe2; 2709 sector_t chunk_number; 2710 unsigned int chunk_offset; 2711 int pd_idx, qd_idx; 2712 int ddf_layout = 0; 2713 sector_t new_sector; 2714 int algorithm = previous ? conf->prev_algo 2715 : conf->algorithm; 2716 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2717 : conf->chunk_sectors; 2718 int raid_disks = previous ? 
conf->previous_raid_disks 2719 : conf->raid_disks; 2720 int data_disks = raid_disks - conf->max_degraded; 2721 2722 /* First compute the information on this sector */ 2723 2724 /* 2725 * Compute the chunk number and the sector offset inside the chunk 2726 */ 2727 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2728 chunk_number = r_sector; 2729 2730 /* 2731 * Compute the stripe number 2732 */ 2733 stripe = chunk_number; 2734 *dd_idx = sector_div(stripe, data_disks); 2735 stripe2 = stripe; 2736 /* 2737 * Select the parity disk based on the user selected algorithm. 2738 */ 2739 pd_idx = qd_idx = -1; 2740 switch(conf->level) { 2741 case 4: 2742 pd_idx = data_disks; 2743 break; 2744 case 5: 2745 switch (algorithm) { 2746 case ALGORITHM_LEFT_ASYMMETRIC: 2747 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2748 if (*dd_idx >= pd_idx) 2749 (*dd_idx)++; 2750 break; 2751 case ALGORITHM_RIGHT_ASYMMETRIC: 2752 pd_idx = sector_div(stripe2, raid_disks); 2753 if (*dd_idx >= pd_idx) 2754 (*dd_idx)++; 2755 break; 2756 case ALGORITHM_LEFT_SYMMETRIC: 2757 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2758 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2759 break; 2760 case ALGORITHM_RIGHT_SYMMETRIC: 2761 pd_idx = sector_div(stripe2, raid_disks); 2762 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2763 break; 2764 case ALGORITHM_PARITY_0: 2765 pd_idx = 0; 2766 (*dd_idx)++; 2767 break; 2768 case ALGORITHM_PARITY_N: 2769 pd_idx = data_disks; 2770 break; 2771 default: 2772 BUG(); 2773 } 2774 break; 2775 case 6: 2776 2777 switch (algorithm) { 2778 case ALGORITHM_LEFT_ASYMMETRIC: 2779 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2780 qd_idx = pd_idx + 1; 2781 if (pd_idx == raid_disks-1) { 2782 (*dd_idx)++; /* Q D D D P */ 2783 qd_idx = 0; 2784 } else if (*dd_idx >= pd_idx) 2785 (*dd_idx) += 2; /* D D P Q D */ 2786 break; 2787 case ALGORITHM_RIGHT_ASYMMETRIC: 2788 pd_idx = sector_div(stripe2, raid_disks); 2789 qd_idx = pd_idx + 1; 2790 if (pd_idx == raid_disks-1) { 2791 (*dd_idx)++; /* Q D D D P */ 2792 qd_idx = 0; 2793 } else if (*dd_idx >= pd_idx) 2794 (*dd_idx) += 2; /* D D P Q D */ 2795 break; 2796 case ALGORITHM_LEFT_SYMMETRIC: 2797 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2798 qd_idx = (pd_idx + 1) % raid_disks; 2799 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2800 break; 2801 case ALGORITHM_RIGHT_SYMMETRIC: 2802 pd_idx = sector_div(stripe2, raid_disks); 2803 qd_idx = (pd_idx + 1) % raid_disks; 2804 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2805 break; 2806 2807 case ALGORITHM_PARITY_0: 2808 pd_idx = 0; 2809 qd_idx = 1; 2810 (*dd_idx) += 2; 2811 break; 2812 case ALGORITHM_PARITY_N: 2813 pd_idx = data_disks; 2814 qd_idx = data_disks + 1; 2815 break; 2816 2817 case ALGORITHM_ROTATING_ZERO_RESTART: 2818 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2819 * of blocks for computing Q is different. 
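* (In the DDF layouts the syndrome is computed over the devices in
* their physical slot order, with zero blocks standing in for the P
* and Q positions, so the ordering of blocks fed to the Q computation
* differs even though the parity placement itself is identical.)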
2820 */ 2821 pd_idx = sector_div(stripe2, raid_disks); 2822 qd_idx = pd_idx + 1; 2823 if (pd_idx == raid_disks-1) { 2824 (*dd_idx)++; /* Q D D D P */ 2825 qd_idx = 0; 2826 } else if (*dd_idx >= pd_idx) 2827 (*dd_idx) += 2; /* D D P Q D */ 2828 ddf_layout = 1; 2829 break; 2830 2831 case ALGORITHM_ROTATING_N_RESTART: 2832 /* Same a left_asymmetric, by first stripe is 2833 * D D D P Q rather than 2834 * Q D D D P 2835 */ 2836 stripe2 += 1; 2837 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2838 qd_idx = pd_idx + 1; 2839 if (pd_idx == raid_disks-1) { 2840 (*dd_idx)++; /* Q D D D P */ 2841 qd_idx = 0; 2842 } else if (*dd_idx >= pd_idx) 2843 (*dd_idx) += 2; /* D D P Q D */ 2844 ddf_layout = 1; 2845 break; 2846 2847 case ALGORITHM_ROTATING_N_CONTINUE: 2848 /* Same as left_symmetric but Q is before P */ 2849 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2850 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2851 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2852 ddf_layout = 1; 2853 break; 2854 2855 case ALGORITHM_LEFT_ASYMMETRIC_6: 2856 /* RAID5 left_asymmetric, with Q on last device */ 2857 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2858 if (*dd_idx >= pd_idx) 2859 (*dd_idx)++; 2860 qd_idx = raid_disks - 1; 2861 break; 2862 2863 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2864 pd_idx = sector_div(stripe2, raid_disks-1); 2865 if (*dd_idx >= pd_idx) 2866 (*dd_idx)++; 2867 qd_idx = raid_disks - 1; 2868 break; 2869 2870 case ALGORITHM_LEFT_SYMMETRIC_6: 2871 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2872 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2873 qd_idx = raid_disks - 1; 2874 break; 2875 2876 case ALGORITHM_RIGHT_SYMMETRIC_6: 2877 pd_idx = sector_div(stripe2, raid_disks-1); 2878 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2879 qd_idx = raid_disks - 1; 2880 break; 2881 2882 case ALGORITHM_PARITY_0_6: 2883 pd_idx = 0; 2884 (*dd_idx)++; 2885 qd_idx = raid_disks - 1; 2886 break; 2887 2888 default: 2889 BUG(); 2890 } 2891 break; 2892 } 2893 2894 if (sh) { 2895 sh->pd_idx = pd_idx; 2896 sh->qd_idx = qd_idx; 2897 sh->ddf_layout = ddf_layout; 2898 } 2899 /* 2900 * Finally, compute the new sector number 2901 */ 2902 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2903 return new_sector; 2904 } 2905 2906 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2907 { 2908 struct r5conf *conf = sh->raid_conf; 2909 int raid_disks = sh->disks; 2910 int data_disks = raid_disks - conf->max_degraded; 2911 sector_t new_sector = sh->sector, check; 2912 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2913 : conf->chunk_sectors; 2914 int algorithm = previous ? 
conf->prev_algo 2915 : conf->algorithm; 2916 sector_t stripe; 2917 int chunk_offset; 2918 sector_t chunk_number; 2919 int dummy1, dd_idx = i; 2920 sector_t r_sector; 2921 struct stripe_head sh2; 2922 2923 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2924 stripe = new_sector; 2925 2926 if (i == sh->pd_idx) 2927 return 0; 2928 switch(conf->level) { 2929 case 4: break; 2930 case 5: 2931 switch (algorithm) { 2932 case ALGORITHM_LEFT_ASYMMETRIC: 2933 case ALGORITHM_RIGHT_ASYMMETRIC: 2934 if (i > sh->pd_idx) 2935 i--; 2936 break; 2937 case ALGORITHM_LEFT_SYMMETRIC: 2938 case ALGORITHM_RIGHT_SYMMETRIC: 2939 if (i < sh->pd_idx) 2940 i += raid_disks; 2941 i -= (sh->pd_idx + 1); 2942 break; 2943 case ALGORITHM_PARITY_0: 2944 i -= 1; 2945 break; 2946 case ALGORITHM_PARITY_N: 2947 break; 2948 default: 2949 BUG(); 2950 } 2951 break; 2952 case 6: 2953 if (i == sh->qd_idx) 2954 return 0; /* It is the Q disk */ 2955 switch (algorithm) { 2956 case ALGORITHM_LEFT_ASYMMETRIC: 2957 case ALGORITHM_RIGHT_ASYMMETRIC: 2958 case ALGORITHM_ROTATING_ZERO_RESTART: 2959 case ALGORITHM_ROTATING_N_RESTART: 2960 if (sh->pd_idx == raid_disks-1) 2961 i--; /* Q D D D P */ 2962 else if (i > sh->pd_idx) 2963 i -= 2; /* D D P Q D */ 2964 break; 2965 case ALGORITHM_LEFT_SYMMETRIC: 2966 case ALGORITHM_RIGHT_SYMMETRIC: 2967 if (sh->pd_idx == raid_disks-1) 2968 i--; /* Q D D D P */ 2969 else { 2970 /* D D P Q D */ 2971 if (i < sh->pd_idx) 2972 i += raid_disks; 2973 i -= (sh->pd_idx + 2); 2974 } 2975 break; 2976 case ALGORITHM_PARITY_0: 2977 i -= 2; 2978 break; 2979 case ALGORITHM_PARITY_N: 2980 break; 2981 case ALGORITHM_ROTATING_N_CONTINUE: 2982 /* Like left_symmetric, but P is before Q */ 2983 if (sh->pd_idx == 0) 2984 i--; /* P D D D Q */ 2985 else { 2986 /* D D Q P D */ 2987 if (i < sh->pd_idx) 2988 i += raid_disks; 2989 i -= (sh->pd_idx + 1); 2990 } 2991 break; 2992 case ALGORITHM_LEFT_ASYMMETRIC_6: 2993 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2994 if (i > sh->pd_idx) 2995 i--; 2996 break; 2997 case ALGORITHM_LEFT_SYMMETRIC_6: 2998 case ALGORITHM_RIGHT_SYMMETRIC_6: 2999 if (i < sh->pd_idx) 3000 i += data_disks + 1; 3001 i -= (sh->pd_idx + 1); 3002 break; 3003 case ALGORITHM_PARITY_0_6: 3004 i -= 1; 3005 break; 3006 default: 3007 BUG(); 3008 } 3009 break; 3010 } 3011 3012 chunk_number = stripe * data_disks + i; 3013 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3014 3015 check = raid5_compute_sector(conf, r_sector, 3016 previous, &dummy1, &sh2); 3017 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3018 || sh2.qd_idx != sh->qd_idx) { 3019 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3020 mdname(conf->mddev)); 3021 return 0; 3022 } 3023 return r_sector; 3024 } 3025 3026 /* 3027 * There are cases where we want handle_stripe_dirtying() and 3028 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3029 * 3030 * This function checks whether we want to delay the towrite. Specifically, 3031 * we delay the towrite when: 3032 * 3033 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3034 * stripe has data in journal (for other devices). 3035 * 3036 * In this case, when reading data for the non-overwrite dev, it is 3037 * necessary to handle complex rmw of write back cache (prexor with 3038 * orig_page, and xor with page). To keep read path simple, we would 3039 * like to flush data in journal to RAID disks first, so complex rmw 3040 * is handled in the write patch (handle_stripe_dirtying). 3041 * 3042 * 2. 
when journal space is critical (R5C_LOG_CRITICAL=1) 3043 * 3044 * It is important to be able to flush all stripes in raid5-cache. 3045 * Therefore, we need reserve some space on the journal device for 3046 * these flushes. If flush operation includes pending writes to the 3047 * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe 3048 * for the flush out. If we exclude these pending writes from flush 3049 * operation, we only need (conf->max_degraded + 1) pages per stripe. 3050 * Therefore, excluding pending writes in these cases enables more 3051 * efficient use of the journal device. 3052 * 3053 * Note: To make sure the stripe makes progress, we only delay 3054 * towrite for stripes with data already in journal (injournal > 0). 3055 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3056 * no_space_stripes list. 3057 * 3058 */ 3059 static inline bool delay_towrite(struct r5conf *conf, 3060 struct r5dev *dev, 3061 struct stripe_head_state *s) 3062 { 3063 /* case 1 above */ 3064 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3065 !test_bit(R5_Insync, &dev->flags) && s->injournal) 3066 return true; 3067 /* case 2 above */ 3068 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3069 s->injournal > 0) 3070 return true; 3071 return false; 3072 } 3073 3074 static void 3075 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 3076 int rcw, int expand) 3077 { 3078 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 3079 struct r5conf *conf = sh->raid_conf; 3080 int level = conf->level; 3081 3082 if (rcw) { 3083 /* 3084 * In some cases, handle_stripe_dirtying initially decided to 3085 * run rmw and allocates extra page for prexor. However, rcw is 3086 * cheaper later on. We need to free the extra page now, 3087 * because we won't be able to do that in ops_complete_prexor(). 
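* (With rcw no STRIPE_OP_PREXOR is scheduled below, so
* ops_complete_prexor() never runs for this stripe.)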
3088 */ 3089 r5c_release_extra_page(sh); 3090 3091 for (i = disks; i--; ) { 3092 struct r5dev *dev = &sh->dev[i]; 3093 3094 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3095 set_bit(R5_LOCKED, &dev->flags); 3096 set_bit(R5_Wantdrain, &dev->flags); 3097 if (!expand) 3098 clear_bit(R5_UPTODATE, &dev->flags); 3099 s->locked++; 3100 } else if (test_bit(R5_InJournal, &dev->flags)) { 3101 set_bit(R5_LOCKED, &dev->flags); 3102 s->locked++; 3103 } 3104 } 3105 /* if we are not expanding this is a proper write request, and 3106 * there will be bios with new data to be drained into the 3107 * stripe cache 3108 */ 3109 if (!expand) { 3110 if (!s->locked) 3111 /* False alarm, nothing to do */ 3112 return; 3113 sh->reconstruct_state = reconstruct_state_drain_run; 3114 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3115 } else 3116 sh->reconstruct_state = reconstruct_state_run; 3117 3118 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3119 3120 if (s->locked + conf->max_degraded == disks) 3121 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3122 atomic_inc(&conf->pending_full_writes); 3123 } else { 3124 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3125 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3126 BUG_ON(level == 6 && 3127 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3128 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3129 3130 for (i = disks; i--; ) { 3131 struct r5dev *dev = &sh->dev[i]; 3132 if (i == pd_idx || i == qd_idx) 3133 continue; 3134 3135 if (dev->towrite && 3136 (test_bit(R5_UPTODATE, &dev->flags) || 3137 test_bit(R5_Wantcompute, &dev->flags))) { 3138 set_bit(R5_Wantdrain, &dev->flags); 3139 set_bit(R5_LOCKED, &dev->flags); 3140 clear_bit(R5_UPTODATE, &dev->flags); 3141 s->locked++; 3142 } else if (test_bit(R5_InJournal, &dev->flags)) { 3143 set_bit(R5_LOCKED, &dev->flags); 3144 s->locked++; 3145 } 3146 } 3147 if (!s->locked) 3148 /* False alarm - nothing to do */ 3149 return; 3150 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3151 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3152 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3153 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3154 } 3155 3156 /* keep the parity disk(s) locked while asynchronous operations 3157 * are in flight 3158 */ 3159 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3160 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3161 s->locked++; 3162 3163 if (level == 6) { 3164 int qd_idx = sh->qd_idx; 3165 struct r5dev *dev = &sh->dev[qd_idx]; 3166 3167 set_bit(R5_LOCKED, &dev->flags); 3168 clear_bit(R5_UPTODATE, &dev->flags); 3169 s->locked++; 3170 } 3171 3172 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && 3173 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3174 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3175 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3176 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3177 3178 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3179 __func__, (unsigned long long)sh->sector, 3180 s->locked, s->ops_request); 3181 } 3182 3183 /* 3184 * Each stripe/dev can have one or more bion attached. 3185 * toread/towrite point to the first in a chain. 3186 * The bi_next chain must be in order. 
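* Keeping each chain sorted by bi_iter.bi_sector lets add_stripe_bio()
* find the insertion point and detect overlapping requests in a single
* walk of the list.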
3187 */ 3188 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3189 int forwrite, int previous) 3190 { 3191 struct bio **bip; 3192 struct r5conf *conf = sh->raid_conf; 3193 int firstwrite=0; 3194 3195 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3196 (unsigned long long)bi->bi_iter.bi_sector, 3197 (unsigned long long)sh->sector); 3198 3199 spin_lock_irq(&sh->stripe_lock); 3200 /* Don't allow new IO added to stripes in batch list */ 3201 if (sh->batch_head) 3202 goto overlap; 3203 if (forwrite) { 3204 bip = &sh->dev[dd_idx].towrite; 3205 if (*bip == NULL) 3206 firstwrite = 1; 3207 } else 3208 bip = &sh->dev[dd_idx].toread; 3209 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3210 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3211 goto overlap; 3212 bip = & (*bip)->bi_next; 3213 } 3214 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3215 goto overlap; 3216 3217 if (forwrite && raid5_has_ppl(conf)) { 3218 /* 3219 * With PPL only writes to consecutive data chunks within a 3220 * stripe are allowed because for a single stripe_head we can 3221 * only have one PPL entry at a time, which describes one data 3222 * range. Not really an overlap, but wait_for_overlap can be 3223 * used to handle this. 3224 */ 3225 sector_t sector; 3226 sector_t first = 0; 3227 sector_t last = 0; 3228 int count = 0; 3229 int i; 3230 3231 for (i = 0; i < sh->disks; i++) { 3232 if (i != sh->pd_idx && 3233 (i == dd_idx || sh->dev[i].towrite)) { 3234 sector = sh->dev[i].sector; 3235 if (count == 0 || sector < first) 3236 first = sector; 3237 if (sector > last) 3238 last = sector; 3239 count++; 3240 } 3241 } 3242 3243 if (first + conf->chunk_sectors * (count - 1) != last) 3244 goto overlap; 3245 } 3246 3247 if (!forwrite || previous) 3248 clear_bit(STRIPE_BATCH_READY, &sh->state); 3249 3250 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3251 if (*bip) 3252 bi->bi_next = *bip; 3253 *bip = bi; 3254 bio_inc_remaining(bi); 3255 md_write_inc(conf->mddev, bi); 3256 3257 if (forwrite) { 3258 /* check if page is covered */ 3259 sector_t sector = sh->dev[dd_idx].sector; 3260 for (bi=sh->dev[dd_idx].towrite; 3261 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3262 bi && bi->bi_iter.bi_sector <= sector; 3263 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3264 if (bio_end_sector(bi) >= sector) 3265 sector = bio_end_sector(bi); 3266 } 3267 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3268 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3269 sh->overwrite_disks++; 3270 } 3271 3272 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3273 (unsigned long long)(*bip)->bi_iter.bi_sector, 3274 (unsigned long long)sh->sector, dd_idx); 3275 3276 if (conf->mddev->bitmap && firstwrite) { 3277 /* Cannot hold spinlock over bitmap_startwrite, 3278 * but must ensure this isn't added to a batch until 3279 * we have added to the bitmap and set bm_seq. 3280 * So set STRIPE_BITMAP_PENDING to prevent 3281 * batching. 3282 * If multiple add_stripe_bio() calls race here they 3283 * much all set STRIPE_BITMAP_PENDING. So only the first one 3284 * to complete "bitmap_startwrite" gets to set 3285 * STRIPE_BIT_DELAY. This is important as once a stripe 3286 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3287 * any more. 
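* (stripe_can_batch() will not batch a stripe while
* STRIPE_BITMAP_PENDING is set, which is what makes this safe.)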
3288 */ 3289 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3290 spin_unlock_irq(&sh->stripe_lock); 3291 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3292 STRIPE_SECTORS, 0); 3293 spin_lock_irq(&sh->stripe_lock); 3294 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3295 if (!sh->batch_head) { 3296 sh->bm_seq = conf->seq_flush+1; 3297 set_bit(STRIPE_BIT_DELAY, &sh->state); 3298 } 3299 } 3300 spin_unlock_irq(&sh->stripe_lock); 3301 3302 if (stripe_can_batch(sh)) 3303 stripe_add_to_batch_list(conf, sh); 3304 return 1; 3305 3306 overlap: 3307 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3308 spin_unlock_irq(&sh->stripe_lock); 3309 return 0; 3310 } 3311 3312 static void end_reshape(struct r5conf *conf); 3313 3314 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3315 struct stripe_head *sh) 3316 { 3317 int sectors_per_chunk = 3318 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3319 int dd_idx; 3320 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3321 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3322 3323 raid5_compute_sector(conf, 3324 stripe * (disks - conf->max_degraded) 3325 *sectors_per_chunk + chunk_offset, 3326 previous, 3327 &dd_idx, sh); 3328 } 3329 3330 static void 3331 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3332 struct stripe_head_state *s, int disks) 3333 { 3334 int i; 3335 BUG_ON(sh->batch_head); 3336 for (i = disks; i--; ) { 3337 struct bio *bi; 3338 int bitmap_end = 0; 3339 3340 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3341 struct md_rdev *rdev; 3342 rcu_read_lock(); 3343 rdev = rcu_dereference(conf->disks[i].rdev); 3344 if (rdev && test_bit(In_sync, &rdev->flags) && 3345 !test_bit(Faulty, &rdev->flags)) 3346 atomic_inc(&rdev->nr_pending); 3347 else 3348 rdev = NULL; 3349 rcu_read_unlock(); 3350 if (rdev) { 3351 if (!rdev_set_badblocks( 3352 rdev, 3353 sh->sector, 3354 STRIPE_SECTORS, 0)) 3355 md_error(conf->mddev, rdev); 3356 rdev_dec_pending(rdev, conf->mddev); 3357 } 3358 } 3359 spin_lock_irq(&sh->stripe_lock); 3360 /* fail all writes first */ 3361 bi = sh->dev[i].towrite; 3362 sh->dev[i].towrite = NULL; 3363 sh->overwrite_disks = 0; 3364 spin_unlock_irq(&sh->stripe_lock); 3365 if (bi) 3366 bitmap_end = 1; 3367 3368 log_stripe_write_finished(sh); 3369 3370 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3371 wake_up(&conf->wait_for_overlap); 3372 3373 while (bi && bi->bi_iter.bi_sector < 3374 sh->dev[i].sector + STRIPE_SECTORS) { 3375 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3376 3377 bi->bi_error = -EIO; 3378 md_write_end(conf->mddev); 3379 bio_endio(bi); 3380 bi = nextbi; 3381 } 3382 if (bitmap_end) 3383 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3384 STRIPE_SECTORS, 0, 0); 3385 bitmap_end = 0; 3386 /* and fail all 'written' */ 3387 bi = sh->dev[i].written; 3388 sh->dev[i].written = NULL; 3389 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3390 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3391 sh->dev[i].page = sh->dev[i].orig_page; 3392 } 3393 3394 if (bi) bitmap_end = 1; 3395 while (bi && bi->bi_iter.bi_sector < 3396 sh->dev[i].sector + STRIPE_SECTORS) { 3397 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3398 3399 bi->bi_error = -EIO; 3400 md_write_end(conf->mddev); 3401 bio_endio(bi); 3402 bi = bi2; 3403 } 3404 3405 /* fail any reads if this device is non-operational and 3406 * the data has not reached the cache yet. 
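* Reads that already have R5_Wantfill set are left alone: they will
* be completed from the stripe cache by the biofill path instead.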
3407 */ 3408 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3409 s->failed > conf->max_degraded && 3410 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3411 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3412 spin_lock_irq(&sh->stripe_lock); 3413 bi = sh->dev[i].toread; 3414 sh->dev[i].toread = NULL; 3415 spin_unlock_irq(&sh->stripe_lock); 3416 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3417 wake_up(&conf->wait_for_overlap); 3418 if (bi) 3419 s->to_read--; 3420 while (bi && bi->bi_iter.bi_sector < 3421 sh->dev[i].sector + STRIPE_SECTORS) { 3422 struct bio *nextbi = 3423 r5_next_bio(bi, sh->dev[i].sector); 3424 3425 bi->bi_error = -EIO; 3426 bio_endio(bi); 3427 bi = nextbi; 3428 } 3429 } 3430 if (bitmap_end) 3431 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3432 STRIPE_SECTORS, 0, 0); 3433 /* If we were in the middle of a write the parity block might 3434 * still be locked - so just clear all R5_LOCKED flags 3435 */ 3436 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3437 } 3438 s->to_write = 0; 3439 s->written = 0; 3440 3441 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3442 if (atomic_dec_and_test(&conf->pending_full_writes)) 3443 md_wakeup_thread(conf->mddev->thread); 3444 } 3445 3446 static void 3447 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3448 struct stripe_head_state *s) 3449 { 3450 int abort = 0; 3451 int i; 3452 3453 BUG_ON(sh->batch_head); 3454 clear_bit(STRIPE_SYNCING, &sh->state); 3455 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3456 wake_up(&conf->wait_for_overlap); 3457 s->syncing = 0; 3458 s->replacing = 0; 3459 /* There is nothing more to do for sync/check/repair. 3460 * Don't even need to abort as that is handled elsewhere 3461 * if needed, and not always wanted e.g. if there is a known 3462 * bad block here. 
3463 * For recover/replace we need to record a bad block on all 3464 * non-sync devices, or abort the recovery 3465 */ 3466 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3467 /* During recovery devices cannot be removed, so 3468 * locking and refcounting of rdevs is not needed 3469 */ 3470 rcu_read_lock(); 3471 for (i = 0; i < conf->raid_disks; i++) { 3472 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3473 if (rdev 3474 && !test_bit(Faulty, &rdev->flags) 3475 && !test_bit(In_sync, &rdev->flags) 3476 && !rdev_set_badblocks(rdev, sh->sector, 3477 STRIPE_SECTORS, 0)) 3478 abort = 1; 3479 rdev = rcu_dereference(conf->disks[i].replacement); 3480 if (rdev 3481 && !test_bit(Faulty, &rdev->flags) 3482 && !test_bit(In_sync, &rdev->flags) 3483 && !rdev_set_badblocks(rdev, sh->sector, 3484 STRIPE_SECTORS, 0)) 3485 abort = 1; 3486 } 3487 rcu_read_unlock(); 3488 if (abort) 3489 conf->recovery_disabled = 3490 conf->mddev->recovery_disabled; 3491 } 3492 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3493 } 3494 3495 static int want_replace(struct stripe_head *sh, int disk_idx) 3496 { 3497 struct md_rdev *rdev; 3498 int rv = 0; 3499 3500 rcu_read_lock(); 3501 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3502 if (rdev 3503 && !test_bit(Faulty, &rdev->flags) 3504 && !test_bit(In_sync, &rdev->flags) 3505 && (rdev->recovery_offset <= sh->sector 3506 || rdev->mddev->recovery_cp <= sh->sector)) 3507 rv = 1; 3508 rcu_read_unlock(); 3509 return rv; 3510 } 3511 3512 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3513 int disk_idx, int disks) 3514 { 3515 struct r5dev *dev = &sh->dev[disk_idx]; 3516 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3517 &sh->dev[s->failed_num[1]] }; 3518 int i; 3519 3520 3521 if (test_bit(R5_LOCKED, &dev->flags) || 3522 test_bit(R5_UPTODATE, &dev->flags)) 3523 /* No point reading this as we already have it or have 3524 * decided to get it. 3525 */ 3526 return 0; 3527 3528 if (dev->toread || 3529 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3530 /* We need this block to directly satisfy a request */ 3531 return 1; 3532 3533 if (s->syncing || s->expanding || 3534 (s->replacing && want_replace(sh, disk_idx))) 3535 /* When syncing, or expanding we read everything. 3536 * When replacing, we need the replaced block. 3537 */ 3538 return 1; 3539 3540 if ((s->failed >= 1 && fdev[0]->toread) || 3541 (s->failed >= 2 && fdev[1]->toread)) 3542 /* If we want to read from a failed device, then 3543 * we need to actually read every other device. 3544 */ 3545 return 1; 3546 3547 /* Sometimes neither read-modify-write nor reconstruct-write 3548 * cycles can work. In those cases we read every block we 3549 * can. Then the parity-update is certain to have enough to 3550 * work with. 3551 * This can only be a problem when we need to write something, 3552 * and some device has failed. If either of those tests 3553 * fail we need look no further. 3554 */ 3555 if (!s->failed || !s->to_write) 3556 return 0; 3557 3558 if (test_bit(R5_Insync, &dev->flags) && 3559 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3560 /* Pre-reads at not permitted until after short delay 3561 * to gather multiple requests. However if this 3562 * device is no Insync, the block could only be computed 3563 * and there is no need to delay that. 
3564 */ 3565 return 0; 3566 3567 for (i = 0; i < s->failed && i < 2; i++) { 3568 if (fdev[i]->towrite && 3569 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3570 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3571 /* If we have a partial write to a failed 3572 * device, then we will need to reconstruct 3573 * the content of that device, so all other 3574 * devices must be read. 3575 */ 3576 return 1; 3577 } 3578 3579 /* If we are forced to do a reconstruct-write, either because 3580 * the current RAID6 implementation only supports that, or 3581 * because parity cannot be trusted and we are currently 3582 * recovering it, there is extra need to be careful. 3583 * If one of the devices that we would need to read, because 3584 * it is not being overwritten (and maybe not written at all) 3585 * is missing/faulty, then we need to read everything we can. 3586 */ 3587 if (sh->raid_conf->level != 6 && 3588 sh->sector < sh->raid_conf->mddev->recovery_cp) 3589 /* reconstruct-write isn't being forced */ 3590 return 0; 3591 for (i = 0; i < s->failed && i < 2; i++) { 3592 if (s->failed_num[i] != sh->pd_idx && 3593 s->failed_num[i] != sh->qd_idx && 3594 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3595 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3596 return 1; 3597 } 3598 3599 return 0; 3600 } 3601 3602 /* fetch_block - checks the given member device to see if its data needs 3603 * to be read or computed to satisfy a request. 3604 * 3605 * Returns 1 when no more member devices need to be checked, otherwise returns 3606 * 0 to tell the loop in handle_stripe_fill to continue 3607 */ 3608 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3609 int disk_idx, int disks) 3610 { 3611 struct r5dev *dev = &sh->dev[disk_idx]; 3612 3613 /* is the data in this block needed, and can we get it? */ 3614 if (need_this_block(sh, s, disk_idx, disks)) { 3615 /* we would like to get this block, possibly by computing it, 3616 * otherwise read it if the backing disk is insync 3617 */ 3618 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3619 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3620 BUG_ON(sh->batch_head); 3621 3622 /* 3623 * In the raid6 case if the only non-uptodate disk is P 3624 * then we already trusted P to compute the other failed 3625 * drives. It is safe to compute rather than re-read P. 3626 * In other cases we only compute blocks from failed 3627 * devices, otherwise check/repair might fail to detect 3628 * a real inconsistency. 3629 */ 3630 3631 if ((s->uptodate == disks - 1) && 3632 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3633 (s->failed && (disk_idx == s->failed_num[0] || 3634 disk_idx == s->failed_num[1])))) { 3635 /* have disk failed, and we're requested to fetch it; 3636 * do compute it 3637 */ 3638 pr_debug("Computing stripe %llu block %d\n", 3639 (unsigned long long)sh->sector, disk_idx); 3640 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3641 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3642 set_bit(R5_Wantcompute, &dev->flags); 3643 sh->ops.target = disk_idx; 3644 sh->ops.target2 = -1; /* no 2nd target */ 3645 s->req_compute = 1; 3646 /* Careful: from this point on 'uptodate' is in the eye 3647 * of raid_run_ops which services 'compute' operations 3648 * before writes. R5_Wantcompute flags a block that will 3649 * be R5_UPTODATE by the time it is needed for a 3650 * subsequent operation. 
3651 */ 3652 s->uptodate++; 3653 return 1; 3654 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3655 /* Computing 2-failure is *very* expensive; only 3656 * do it if failed >= 2 3657 */ 3658 int other; 3659 for (other = disks; other--; ) { 3660 if (other == disk_idx) 3661 continue; 3662 if (!test_bit(R5_UPTODATE, 3663 &sh->dev[other].flags)) 3664 break; 3665 } 3666 BUG_ON(other < 0); 3667 pr_debug("Computing stripe %llu blocks %d,%d\n", 3668 (unsigned long long)sh->sector, 3669 disk_idx, other); 3670 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3671 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3672 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3673 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3674 sh->ops.target = disk_idx; 3675 sh->ops.target2 = other; 3676 s->uptodate += 2; 3677 s->req_compute = 1; 3678 return 1; 3679 } else if (test_bit(R5_Insync, &dev->flags)) { 3680 set_bit(R5_LOCKED, &dev->flags); 3681 set_bit(R5_Wantread, &dev->flags); 3682 s->locked++; 3683 pr_debug("Reading block %d (sync=%d)\n", 3684 disk_idx, s->syncing); 3685 } 3686 } 3687 3688 return 0; 3689 } 3690 3691 /** 3692 * handle_stripe_fill - read or compute data to satisfy pending requests. 3693 */ 3694 static void handle_stripe_fill(struct stripe_head *sh, 3695 struct stripe_head_state *s, 3696 int disks) 3697 { 3698 int i; 3699 3700 /* look for blocks to read/compute, skip this if a compute 3701 * is already in flight, or if the stripe contents are in the 3702 * midst of changing due to a write 3703 */ 3704 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3705 !sh->reconstruct_state) { 3706 3707 /* 3708 * For degraded stripe with data in journal, do not handle 3709 * read requests yet, instead, flush the stripe to raid 3710 * disks first, this avoids handling complex rmw of write 3711 * back cache (prexor with orig_page, and then xor with 3712 * page) in the read path 3713 */ 3714 if (s->injournal && s->failed) { 3715 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3716 r5c_make_stripe_write_out(sh); 3717 goto out; 3718 } 3719 3720 for (i = disks; i--; ) 3721 if (fetch_block(sh, s, i, disks)) 3722 break; 3723 } 3724 out: 3725 set_bit(STRIPE_HANDLE, &sh->state); 3726 } 3727 3728 static void break_stripe_batch_list(struct stripe_head *head_sh, 3729 unsigned long handle_flags); 3730 /* handle_stripe_clean_event 3731 * any written block on an uptodate or failed drive can be returned. 3732 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3733 * never LOCKED, so we don't need to test 'failed' directly. 
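* Completed writes are returned by ending every bio chained on
* dev->written and clearing the matching bitmap range; a fully
* discarded stripe is also unhashed so that it gets reinitialised
* before its next use.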
3734 */ 3735 static void handle_stripe_clean_event(struct r5conf *conf, 3736 struct stripe_head *sh, int disks) 3737 { 3738 int i; 3739 struct r5dev *dev; 3740 int discard_pending = 0; 3741 struct stripe_head *head_sh = sh; 3742 bool do_endio = false; 3743 3744 for (i = disks; i--; ) 3745 if (sh->dev[i].written) { 3746 dev = &sh->dev[i]; 3747 if (!test_bit(R5_LOCKED, &dev->flags) && 3748 (test_bit(R5_UPTODATE, &dev->flags) || 3749 test_bit(R5_Discard, &dev->flags) || 3750 test_bit(R5_SkipCopy, &dev->flags))) { 3751 /* We can return any write requests */ 3752 struct bio *wbi, *wbi2; 3753 pr_debug("Return write for disc %d\n", i); 3754 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3755 clear_bit(R5_UPTODATE, &dev->flags); 3756 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3757 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3758 } 3759 do_endio = true; 3760 3761 returnbi: 3762 dev->page = dev->orig_page; 3763 wbi = dev->written; 3764 dev->written = NULL; 3765 while (wbi && wbi->bi_iter.bi_sector < 3766 dev->sector + STRIPE_SECTORS) { 3767 wbi2 = r5_next_bio(wbi, dev->sector); 3768 md_write_end(conf->mddev); 3769 bio_endio(wbi); 3770 wbi = wbi2; 3771 } 3772 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3773 STRIPE_SECTORS, 3774 !test_bit(STRIPE_DEGRADED, &sh->state), 3775 0); 3776 if (head_sh->batch_head) { 3777 sh = list_first_entry(&sh->batch_list, 3778 struct stripe_head, 3779 batch_list); 3780 if (sh != head_sh) { 3781 dev = &sh->dev[i]; 3782 goto returnbi; 3783 } 3784 } 3785 sh = head_sh; 3786 dev = &sh->dev[i]; 3787 } else if (test_bit(R5_Discard, &dev->flags)) 3788 discard_pending = 1; 3789 } 3790 3791 log_stripe_write_finished(sh); 3792 3793 if (!discard_pending && 3794 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3795 int hash; 3796 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3797 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3798 if (sh->qd_idx >= 0) { 3799 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3800 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3801 } 3802 /* now that discard is done we can proceed with any sync */ 3803 clear_bit(STRIPE_DISCARD, &sh->state); 3804 /* 3805 * SCSI discard will change some bio fields and the stripe has 3806 * no updated data, so remove it from hash list and the stripe 3807 * will be reinitialized 3808 */ 3809 unhash: 3810 hash = sh->hash_lock_index; 3811 spin_lock_irq(conf->hash_locks + hash); 3812 remove_hash(sh); 3813 spin_unlock_irq(conf->hash_locks + hash); 3814 if (head_sh->batch_head) { 3815 sh = list_first_entry(&sh->batch_list, 3816 struct stripe_head, batch_list); 3817 if (sh != head_sh) 3818 goto unhash; 3819 } 3820 sh = head_sh; 3821 3822 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3823 set_bit(STRIPE_HANDLE, &sh->state); 3824 3825 } 3826 3827 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3828 if (atomic_dec_and_test(&conf->pending_full_writes)) 3829 md_wakeup_thread(conf->mddev->thread); 3830 3831 if (head_sh->batch_head && do_endio) 3832 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3833 } 3834 3835 /* 3836 * For RMW in write back cache, we need extra page in prexor to store the 3837 * old data. This page is stored in dev->orig_page. 3838 * 3839 * This function checks whether we have data for prexor. 
The exact logic 3840 * is: 3841 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3842 */ 3843 static inline bool uptodate_for_rmw(struct r5dev *dev) 3844 { 3845 return (test_bit(R5_UPTODATE, &dev->flags)) && 3846 (!test_bit(R5_InJournal, &dev->flags) || 3847 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3848 } 3849 3850 static int handle_stripe_dirtying(struct r5conf *conf, 3851 struct stripe_head *sh, 3852 struct stripe_head_state *s, 3853 int disks) 3854 { 3855 int rmw = 0, rcw = 0, i; 3856 sector_t recovery_cp = conf->mddev->recovery_cp; 3857 3858 /* Check whether resync is now happening or should start. 3859 * If yes, then the array is dirty (after unclean shutdown or 3860 * initial creation), so parity in some stripes might be inconsistent. 3861 * In this case, we need to always do reconstruct-write, to ensure 3862 * that in case of drive failure or read-error correction, we 3863 * generate correct data from the parity. 3864 */ 3865 if (conf->rmw_level == PARITY_DISABLE_RMW || 3866 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3867 s->failed == 0)) { 3868 /* Calculate the real rcw later - for now make it 3869 * look like rcw is cheaper 3870 */ 3871 rcw = 1; rmw = 2; 3872 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3873 conf->rmw_level, (unsigned long long)recovery_cp, 3874 (unsigned long long)sh->sector); 3875 } else for (i = disks; i--; ) { 3876 /* would I have to read this buffer for read_modify_write */ 3877 struct r5dev *dev = &sh->dev[i]; 3878 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3879 i == sh->pd_idx || i == sh->qd_idx || 3880 test_bit(R5_InJournal, &dev->flags)) && 3881 !test_bit(R5_LOCKED, &dev->flags) && 3882 !(uptodate_for_rmw(dev) || 3883 test_bit(R5_Wantcompute, &dev->flags))) { 3884 if (test_bit(R5_Insync, &dev->flags)) 3885 rmw++; 3886 else 3887 rmw += 2*disks; /* cannot read it */ 3888 } 3889 /* Would I have to read this buffer for reconstruct_write */ 3890 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3891 i != sh->pd_idx && i != sh->qd_idx && 3892 !test_bit(R5_LOCKED, &dev->flags) && 3893 !(test_bit(R5_UPTODATE, &dev->flags) || 3894 test_bit(R5_Wantcompute, &dev->flags))) { 3895 if (test_bit(R5_Insync, &dev->flags)) 3896 rcw++; 3897 else 3898 rcw += 2*disks; 3899 } 3900 } 3901 3902 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3903 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3904 set_bit(STRIPE_HANDLE, &sh->state); 3905 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3906 /* prefer read-modify-write, but need to get some data */ 3907 if (conf->mddev->queue) 3908 blk_add_trace_msg(conf->mddev->queue, 3909 "raid5 rmw %llu %d", 3910 (unsigned long long)sh->sector, rmw); 3911 for (i = disks; i--; ) { 3912 struct r5dev *dev = &sh->dev[i]; 3913 if (test_bit(R5_InJournal, &dev->flags) && 3914 dev->page == dev->orig_page && 3915 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3916 /* alloc page for prexor */ 3917 struct page *p = alloc_page(GFP_NOIO); 3918 3919 if (p) { 3920 dev->orig_page = p; 3921 continue; 3922 } 3923 3924 /* 3925 * alloc_page() failed, try use 3926 * disk_info->extra_page 3927 */ 3928 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3929 &conf->cache_state)) { 3930 r5c_use_extra_page(sh); 3931 break; 3932 } 3933 3934 /* extra_page in use, add to delayed_list */ 3935 set_bit(STRIPE_DELAYED, &sh->state); 3936 s->waiting_extra_page = 1; 3937 return -EAGAIN; 3938 } 3939 } 3940 3941 for (i = disks; i--; ) { 3942 struct r5dev *dev = 
&sh->dev[i]; 3943 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3944 i == sh->pd_idx || i == sh->qd_idx || 3945 test_bit(R5_InJournal, &dev->flags)) && 3946 !test_bit(R5_LOCKED, &dev->flags) && 3947 !(uptodate_for_rmw(dev) || 3948 test_bit(R5_Wantcompute, &dev->flags)) && 3949 test_bit(R5_Insync, &dev->flags)) { 3950 if (test_bit(STRIPE_PREREAD_ACTIVE, 3951 &sh->state)) { 3952 pr_debug("Read_old block %d for r-m-w\n", 3953 i); 3954 set_bit(R5_LOCKED, &dev->flags); 3955 set_bit(R5_Wantread, &dev->flags); 3956 s->locked++; 3957 } else { 3958 set_bit(STRIPE_DELAYED, &sh->state); 3959 set_bit(STRIPE_HANDLE, &sh->state); 3960 } 3961 } 3962 } 3963 } 3964 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3965 /* want reconstruct write, but need to get some data */ 3966 int qread =0; 3967 rcw = 0; 3968 for (i = disks; i--; ) { 3969 struct r5dev *dev = &sh->dev[i]; 3970 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3971 i != sh->pd_idx && i != sh->qd_idx && 3972 !test_bit(R5_LOCKED, &dev->flags) && 3973 !(test_bit(R5_UPTODATE, &dev->flags) || 3974 test_bit(R5_Wantcompute, &dev->flags))) { 3975 rcw++; 3976 if (test_bit(R5_Insync, &dev->flags) && 3977 test_bit(STRIPE_PREREAD_ACTIVE, 3978 &sh->state)) { 3979 pr_debug("Read_old block " 3980 "%d for Reconstruct\n", i); 3981 set_bit(R5_LOCKED, &dev->flags); 3982 set_bit(R5_Wantread, &dev->flags); 3983 s->locked++; 3984 qread++; 3985 } else { 3986 set_bit(STRIPE_DELAYED, &sh->state); 3987 set_bit(STRIPE_HANDLE, &sh->state); 3988 } 3989 } 3990 } 3991 if (rcw && conf->mddev->queue) 3992 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3993 (unsigned long long)sh->sector, 3994 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3995 } 3996 3997 if (rcw > disks && rmw > disks && 3998 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3999 set_bit(STRIPE_DELAYED, &sh->state); 4000 4001 /* now if nothing is locked, and if we have enough data, 4002 * we can start a write request 4003 */ 4004 /* since handle_stripe can be called at any time we need to handle the 4005 * case where a compute block operation has been submitted and then a 4006 * subsequent call wants to start a write request. raid_run_ops only 4007 * handles the case where compute block and reconstruct are requested 4008 * simultaneously. If this is not the case then new writes need to be 4009 * held off until the compute completes. 
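 *
 * Concretely, schedule_reconstruction() is only called below when this
 * pass queued the compute itself (s->req_compute) or no compute is
 * currently running (STRIPE_COMPUTE_RUN clear), and then only once
 * nothing is locked, one of rmw/rcw needs no further reads, and the
 * stripe is not waiting on a bitmap update (STRIPE_BIT_DELAY).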
4010 */ 4011 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4012 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4013 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4014 schedule_reconstruction(sh, s, rcw == 0, 0); 4015 return 0; 4016 } 4017 4018 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4019 struct stripe_head_state *s, int disks) 4020 { 4021 struct r5dev *dev = NULL; 4022 4023 BUG_ON(sh->batch_head); 4024 set_bit(STRIPE_HANDLE, &sh->state); 4025 4026 switch (sh->check_state) { 4027 case check_state_idle: 4028 /* start a new check operation if there are no failures */ 4029 if (s->failed == 0) { 4030 BUG_ON(s->uptodate != disks); 4031 sh->check_state = check_state_run; 4032 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4033 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4034 s->uptodate--; 4035 break; 4036 } 4037 dev = &sh->dev[s->failed_num[0]]; 4038 /* fall through */ 4039 case check_state_compute_result: 4040 sh->check_state = check_state_idle; 4041 if (!dev) 4042 dev = &sh->dev[sh->pd_idx]; 4043 4044 /* check that a write has not made the stripe insync */ 4045 if (test_bit(STRIPE_INSYNC, &sh->state)) 4046 break; 4047 4048 /* either failed parity check, or recovery is happening */ 4049 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4050 BUG_ON(s->uptodate != disks); 4051 4052 set_bit(R5_LOCKED, &dev->flags); 4053 s->locked++; 4054 set_bit(R5_Wantwrite, &dev->flags); 4055 4056 clear_bit(STRIPE_DEGRADED, &sh->state); 4057 set_bit(STRIPE_INSYNC, &sh->state); 4058 break; 4059 case check_state_run: 4060 break; /* we will be called again upon completion */ 4061 case check_state_check_result: 4062 sh->check_state = check_state_idle; 4063 4064 /* if a failure occurred during the check operation, leave 4065 * STRIPE_INSYNC not set and let the stripe be handled again 4066 */ 4067 if (s->failed) 4068 break; 4069 4070 /* handle a successful check operation, if parity is correct 4071 * we are done. Otherwise update the mismatch count and repair 4072 * parity if !MD_RECOVERY_CHECK 4073 */ 4074 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4075 /* parity is correct (on disc, 4076 * not in buffer any more) 4077 */ 4078 set_bit(STRIPE_INSYNC, &sh->state); 4079 else { 4080 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4081 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4082 /* don't try to repair!! */ 4083 set_bit(STRIPE_INSYNC, &sh->state); 4084 else { 4085 sh->check_state = check_state_compute_run; 4086 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4087 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4088 set_bit(R5_Wantcompute, 4089 &sh->dev[sh->pd_idx].flags); 4090 sh->ops.target = sh->pd_idx; 4091 sh->ops.target2 = -1; 4092 s->uptodate++; 4093 } 4094 } 4095 break; 4096 case check_state_compute_run: 4097 break; 4098 default: 4099 pr_err("%s: unknown check_state: %d sector: %llu\n", 4100 __func__, sh->check_state, 4101 (unsigned long long) sh->sector); 4102 BUG(); 4103 } 4104 } 4105 4106 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4107 struct stripe_head_state *s, 4108 int disks) 4109 { 4110 int pd_idx = sh->pd_idx; 4111 int qd_idx = sh->qd_idx; 4112 struct r5dev *dev; 4113 4114 BUG_ON(sh->batch_head); 4115 set_bit(STRIPE_HANDLE, &sh->state); 4116 4117 BUG_ON(s->failed > 2); 4118 4119 /* Want to check and possibly repair P and Q. 
4120 * However there could be one 'failed' device, in which 4121 * case we can only check one of them, possibly using the 4122 * other to generate missing data 4123 */ 4124 4125 switch (sh->check_state) { 4126 case check_state_idle: 4127 /* start a new check operation if there are < 2 failures */ 4128 if (s->failed == s->q_failed) { 4129 /* The only possible failed device holds Q, so it 4130 * makes sense to check P (If anything else were failed, 4131 * we would have used P to recreate it). 4132 */ 4133 sh->check_state = check_state_run; 4134 } 4135 if (!s->q_failed && s->failed < 2) { 4136 /* Q is not failed, and we didn't use it to generate 4137 * anything, so it makes sense to check it 4138 */ 4139 if (sh->check_state == check_state_run) 4140 sh->check_state = check_state_run_pq; 4141 else 4142 sh->check_state = check_state_run_q; 4143 } 4144 4145 /* discard potentially stale zero_sum_result */ 4146 sh->ops.zero_sum_result = 0; 4147 4148 if (sh->check_state == check_state_run) { 4149 /* async_xor_zero_sum destroys the contents of P */ 4150 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4151 s->uptodate--; 4152 } 4153 if (sh->check_state >= check_state_run && 4154 sh->check_state <= check_state_run_pq) { 4155 /* async_syndrome_zero_sum preserves P and Q, so 4156 * no need to mark them !uptodate here 4157 */ 4158 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4159 break; 4160 } 4161 4162 /* we have 2-disk failure */ 4163 BUG_ON(s->failed != 2); 4164 /* fall through */ 4165 case check_state_compute_result: 4166 sh->check_state = check_state_idle; 4167 4168 /* check that a write has not made the stripe insync */ 4169 if (test_bit(STRIPE_INSYNC, &sh->state)) 4170 break; 4171 4172 /* now write out any block on a failed drive, 4173 * or P or Q if they were recomputed 4174 */ 4175 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4176 if (s->failed == 2) { 4177 dev = &sh->dev[s->failed_num[1]]; 4178 s->locked++; 4179 set_bit(R5_LOCKED, &dev->flags); 4180 set_bit(R5_Wantwrite, &dev->flags); 4181 } 4182 if (s->failed >= 1) { 4183 dev = &sh->dev[s->failed_num[0]]; 4184 s->locked++; 4185 set_bit(R5_LOCKED, &dev->flags); 4186 set_bit(R5_Wantwrite, &dev->flags); 4187 } 4188 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4189 dev = &sh->dev[pd_idx]; 4190 s->locked++; 4191 set_bit(R5_LOCKED, &dev->flags); 4192 set_bit(R5_Wantwrite, &dev->flags); 4193 } 4194 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4195 dev = &sh->dev[qd_idx]; 4196 s->locked++; 4197 set_bit(R5_LOCKED, &dev->flags); 4198 set_bit(R5_Wantwrite, &dev->flags); 4199 } 4200 clear_bit(STRIPE_DEGRADED, &sh->state); 4201 4202 set_bit(STRIPE_INSYNC, &sh->state); 4203 break; 4204 case check_state_run: 4205 case check_state_run_q: 4206 case check_state_run_pq: 4207 break; /* we will be called again upon completion */ 4208 case check_state_check_result: 4209 sh->check_state = check_state_idle; 4210 4211 /* handle a successful check operation, if parity is correct 4212 * we are done. 
Otherwise update the mismatch count and repair 4213 * parity if !MD_RECOVERY_CHECK 4214 */ 4215 if (sh->ops.zero_sum_result == 0) { 4216 /* both parities are correct */ 4217 if (!s->failed) 4218 set_bit(STRIPE_INSYNC, &sh->state); 4219 else { 4220 /* in contrast to the raid5 case we can validate 4221 * parity, but still have a failure to write 4222 * back 4223 */ 4224 sh->check_state = check_state_compute_result; 4225 /* Returning at this point means that we may go 4226 * off and bring p and/or q uptodate again so 4227 * we make sure to check zero_sum_result again 4228 * to verify if p or q need writeback 4229 */ 4230 } 4231 } else { 4232 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4233 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4234 /* don't try to repair!! */ 4235 set_bit(STRIPE_INSYNC, &sh->state); 4236 else { 4237 int *target = &sh->ops.target; 4238 4239 sh->ops.target = -1; 4240 sh->ops.target2 = -1; 4241 sh->check_state = check_state_compute_run; 4242 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4243 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4244 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4245 set_bit(R5_Wantcompute, 4246 &sh->dev[pd_idx].flags); 4247 *target = pd_idx; 4248 target = &sh->ops.target2; 4249 s->uptodate++; 4250 } 4251 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4252 set_bit(R5_Wantcompute, 4253 &sh->dev[qd_idx].flags); 4254 *target = qd_idx; 4255 s->uptodate++; 4256 } 4257 } 4258 } 4259 break; 4260 case check_state_compute_run: 4261 break; 4262 default: 4263 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4264 __func__, sh->check_state, 4265 (unsigned long long) sh->sector); 4266 BUG(); 4267 } 4268 } 4269 4270 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4271 { 4272 int i; 4273 4274 /* We have read all the blocks in this stripe and now we need to 4275 * copy some of them into a target stripe for expand. 4276 */ 4277 struct dma_async_tx_descriptor *tx = NULL; 4278 BUG_ON(sh->batch_head); 4279 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4280 for (i = 0; i < sh->disks; i++) 4281 if (i != sh->pd_idx && i != sh->qd_idx) { 4282 int dd_idx, j; 4283 struct stripe_head *sh2; 4284 struct async_submit_ctl submit; 4285 4286 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4287 sector_t s = raid5_compute_sector(conf, bn, 0, 4288 &dd_idx, NULL); 4289 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4290 if (sh2 == NULL) 4291 /* so far only the early blocks of this stripe 4292 * have been requested. 
When later blocks 4293 * get requested, we will try again 4294 */ 4295 continue; 4296 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4297 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4298 /* must have already done this block */ 4299 raid5_release_stripe(sh2); 4300 continue; 4301 } 4302 4303 /* place all the copies on one channel */ 4304 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4305 tx = async_memcpy(sh2->dev[dd_idx].page, 4306 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4307 &submit); 4308 4309 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4310 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4311 for (j = 0; j < conf->raid_disks; j++) 4312 if (j != sh2->pd_idx && 4313 j != sh2->qd_idx && 4314 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4315 break; 4316 if (j == conf->raid_disks) { 4317 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4318 set_bit(STRIPE_HANDLE, &sh2->state); 4319 } 4320 raid5_release_stripe(sh2); 4321 4322 } 4323 /* done submitting copies, wait for them to complete */ 4324 async_tx_quiesce(&tx); 4325 } 4326 4327 /* 4328 * handle_stripe - do things to a stripe. 4329 * 4330 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4331 * state of various bits to see what needs to be done. 4332 * Possible results: 4333 * return some read requests which now have data 4334 * return some write requests which are safely on storage 4335 * schedule a read on some buffers 4336 * schedule a write of some buffers 4337 * return confirmation of parity correctness 4338 * 4339 */ 4340 4341 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4342 { 4343 struct r5conf *conf = sh->raid_conf; 4344 int disks = sh->disks; 4345 struct r5dev *dev; 4346 int i; 4347 int do_recovery = 0; 4348 4349 memset(s, 0, sizeof(*s)); 4350 4351 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4352 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4353 s->failed_num[0] = -1; 4354 s->failed_num[1] = -1; 4355 s->log_failed = r5l_log_disk_error(conf); 4356 4357 /* Now to look around and see what can be done */ 4358 rcu_read_lock(); 4359 for (i=disks; i--; ) { 4360 struct md_rdev *rdev; 4361 sector_t first_bad; 4362 int bad_sectors; 4363 int is_bad = 0; 4364 4365 dev = &sh->dev[i]; 4366 4367 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4368 i, dev->flags, 4369 dev->toread, dev->towrite, dev->written); 4370 /* maybe we can reply to a read 4371 * 4372 * new wantfill requests are only permitted while 4373 * ops_complete_biofill is guaranteed to be inactive 4374 */ 4375 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4376 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4377 set_bit(R5_Wantfill, &dev->flags); 4378 4379 /* now count some things */ 4380 if (test_bit(R5_LOCKED, &dev->flags)) 4381 s->locked++; 4382 if (test_bit(R5_UPTODATE, &dev->flags)) 4383 s->uptodate++; 4384 if (test_bit(R5_Wantcompute, &dev->flags)) { 4385 s->compute++; 4386 BUG_ON(s->compute > 2); 4387 } 4388 4389 if (test_bit(R5_Wantfill, &dev->flags)) 4390 s->to_fill++; 4391 else if (dev->toread) 4392 s->to_read++; 4393 if (dev->towrite) { 4394 s->to_write++; 4395 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4396 s->non_overwrite++; 4397 } 4398 if (dev->written) 4399 s->written++; 4400 /* Prefer to use the replacement for reads, but only 4401 * if it is recovered enough and has no bad blocks. 
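 *
 * i.e. R5_ReadRepl is set only when the replacement already covers this
 * stripe (recovery_offset beyond sh->sector + STRIPE_SECTORS) and
 * is_badblock() reports the range clean; otherwise reads fall back to
 * the original rdev and R5_NeedReplace records that a copy-out to the
 * replacement is still wanted.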
4402 */ 4403 rdev = rcu_dereference(conf->disks[i].replacement); 4404 if (rdev && !test_bit(Faulty, &rdev->flags) && 4405 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4406 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4407 &first_bad, &bad_sectors)) 4408 set_bit(R5_ReadRepl, &dev->flags); 4409 else { 4410 if (rdev && !test_bit(Faulty, &rdev->flags)) 4411 set_bit(R5_NeedReplace, &dev->flags); 4412 else 4413 clear_bit(R5_NeedReplace, &dev->flags); 4414 rdev = rcu_dereference(conf->disks[i].rdev); 4415 clear_bit(R5_ReadRepl, &dev->flags); 4416 } 4417 if (rdev && test_bit(Faulty, &rdev->flags)) 4418 rdev = NULL; 4419 if (rdev) { 4420 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4421 &first_bad, &bad_sectors); 4422 if (s->blocked_rdev == NULL 4423 && (test_bit(Blocked, &rdev->flags) 4424 || is_bad < 0)) { 4425 if (is_bad < 0) 4426 set_bit(BlockedBadBlocks, 4427 &rdev->flags); 4428 s->blocked_rdev = rdev; 4429 atomic_inc(&rdev->nr_pending); 4430 } 4431 } 4432 clear_bit(R5_Insync, &dev->flags); 4433 if (!rdev) 4434 /* Not in-sync */; 4435 else if (is_bad) { 4436 /* also not in-sync */ 4437 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4438 test_bit(R5_UPTODATE, &dev->flags)) { 4439 /* treat as in-sync, but with a read error 4440 * which we can now try to correct 4441 */ 4442 set_bit(R5_Insync, &dev->flags); 4443 set_bit(R5_ReadError, &dev->flags); 4444 } 4445 } else if (test_bit(In_sync, &rdev->flags)) 4446 set_bit(R5_Insync, &dev->flags); 4447 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4448 /* in sync if before recovery_offset */ 4449 set_bit(R5_Insync, &dev->flags); 4450 else if (test_bit(R5_UPTODATE, &dev->flags) && 4451 test_bit(R5_Expanded, &dev->flags)) 4452 /* If we've reshaped into here, we assume it is Insync. 4453 * We will shortly update recovery_offset to make 4454 * it official. 
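 * (Without this, the device would look !R5_Insync for this stripe and
 * be counted as failed below, even though the data written during the
 * reshape is perfectly valid.)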
4455 */ 4456 set_bit(R5_Insync, &dev->flags); 4457 4458 if (test_bit(R5_WriteError, &dev->flags)) { 4459 /* This flag does not apply to '.replacement', 4460 * only to '.rdev', so make sure to check that. */ 4461 struct md_rdev *rdev2 = rcu_dereference( 4462 conf->disks[i].rdev); 4463 if (rdev2 == rdev) 4464 clear_bit(R5_Insync, &dev->flags); 4465 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4466 s->handle_bad_blocks = 1; 4467 atomic_inc(&rdev2->nr_pending); 4468 } else 4469 clear_bit(R5_WriteError, &dev->flags); 4470 } 4471 if (test_bit(R5_MadeGood, &dev->flags)) { 4472 /* This flag does not apply to '.replacement', 4473 * only to '.rdev', so make sure to check that. */ 4474 struct md_rdev *rdev2 = rcu_dereference( 4475 conf->disks[i].rdev); 4476 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4477 s->handle_bad_blocks = 1; 4478 atomic_inc(&rdev2->nr_pending); 4479 } else 4480 clear_bit(R5_MadeGood, &dev->flags); 4481 } 4482 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4483 struct md_rdev *rdev2 = rcu_dereference( 4484 conf->disks[i].replacement); 4485 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4486 s->handle_bad_blocks = 1; 4487 atomic_inc(&rdev2->nr_pending); 4488 } else 4489 clear_bit(R5_MadeGoodRepl, &dev->flags); 4490 } 4491 if (!test_bit(R5_Insync, &dev->flags)) { 4492 /* The ReadError flag will just be confusing now */ 4493 clear_bit(R5_ReadError, &dev->flags); 4494 clear_bit(R5_ReWrite, &dev->flags); 4495 } 4496 if (test_bit(R5_ReadError, &dev->flags)) 4497 clear_bit(R5_Insync, &dev->flags); 4498 if (!test_bit(R5_Insync, &dev->flags)) { 4499 if (s->failed < 2) 4500 s->failed_num[s->failed] = i; 4501 s->failed++; 4502 if (rdev && !test_bit(Faulty, &rdev->flags)) 4503 do_recovery = 1; 4504 } 4505 4506 if (test_bit(R5_InJournal, &dev->flags)) 4507 s->injournal++; 4508 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4509 s->just_cached++; 4510 } 4511 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4512 /* If there is a failed device being replaced, 4513 * we must be recovering; 4514 * else if we are after recovery_cp, we must be syncing; 4515 * else if MD_RECOVERY_REQUESTED is set, we are also syncing; 4516 * else we can only be replacing. 4517 * Sync and recovery both need to read all devices, and so 4518 * use the same flag. 4519 */ 4520 if (do_recovery || 4521 sh->sector >= conf->mddev->recovery_cp || 4522 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4523 s->syncing = 1; 4524 else 4525 s->replacing = 1; 4526 } 4527 rcu_read_unlock(); 4528 } 4529 4530 static int clear_batch_ready(struct stripe_head *sh) 4531 { 4532 /* Return '1' if this is a member of a batch, or 4533 * '0' if it is a lone stripe or a head which can now be 4534 * handled. 4535 */ 4536 struct stripe_head *tmp; 4537 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4538 return (sh->batch_head && sh->batch_head != sh); 4539 spin_lock(&sh->stripe_lock); 4540 if (!sh->batch_head) { 4541 spin_unlock(&sh->stripe_lock); 4542 return 0; 4543 } 4544 4545 /* 4546 * this stripe could have been added to a batch list before we 4547 * checked BATCH_READY; skip it 4548 */ 4549 if (sh->batch_head != sh) { 4550 spin_unlock(&sh->stripe_lock); 4551 return 1; 4552 } 4553 spin_lock(&sh->batch_lock); 4554 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4555 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4556 spin_unlock(&sh->batch_lock); 4557 spin_unlock(&sh->stripe_lock); 4558 4559 /* 4560 * BATCH_READY is cleared, no new stripes can be added.
4561 * batch_list can be accessed without lock 4562 */ 4563 return 0; 4564 } 4565 4566 static void break_stripe_batch_list(struct stripe_head *head_sh, 4567 unsigned long handle_flags) 4568 { 4569 struct stripe_head *sh, *next; 4570 int i; 4571 int do_wakeup = 0; 4572 4573 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4574 4575 list_del_init(&sh->batch_list); 4576 4577 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4578 (1 << STRIPE_SYNCING) | 4579 (1 << STRIPE_REPLACED) | 4580 (1 << STRIPE_DELAYED) | 4581 (1 << STRIPE_BIT_DELAY) | 4582 (1 << STRIPE_FULL_WRITE) | 4583 (1 << STRIPE_BIOFILL_RUN) | 4584 (1 << STRIPE_COMPUTE_RUN) | 4585 (1 << STRIPE_OPS_REQ_PENDING) | 4586 (1 << STRIPE_DISCARD) | 4587 (1 << STRIPE_BATCH_READY) | 4588 (1 << STRIPE_BATCH_ERR) | 4589 (1 << STRIPE_BITMAP_PENDING)), 4590 "stripe state: %lx\n", sh->state); 4591 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4592 (1 << STRIPE_REPLACED)), 4593 "head stripe state: %lx\n", head_sh->state); 4594 4595 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4596 (1 << STRIPE_PREREAD_ACTIVE) | 4597 (1 << STRIPE_DEGRADED)), 4598 head_sh->state & (1 << STRIPE_INSYNC)); 4599 4600 sh->check_state = head_sh->check_state; 4601 sh->reconstruct_state = head_sh->reconstruct_state; 4602 for (i = 0; i < sh->disks; i++) { 4603 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4604 do_wakeup = 1; 4605 sh->dev[i].flags = head_sh->dev[i].flags & 4606 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4607 } 4608 spin_lock_irq(&sh->stripe_lock); 4609 sh->batch_head = NULL; 4610 spin_unlock_irq(&sh->stripe_lock); 4611 if (handle_flags == 0 || 4612 sh->state & handle_flags) 4613 set_bit(STRIPE_HANDLE, &sh->state); 4614 raid5_release_stripe(sh); 4615 } 4616 spin_lock_irq(&head_sh->stripe_lock); 4617 head_sh->batch_head = NULL; 4618 spin_unlock_irq(&head_sh->stripe_lock); 4619 for (i = 0; i < head_sh->disks; i++) 4620 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4621 do_wakeup = 1; 4622 if (head_sh->state & handle_flags) 4623 set_bit(STRIPE_HANDLE, &head_sh->state); 4624 4625 if (do_wakeup) 4626 wake_up(&head_sh->raid_conf->wait_for_overlap); 4627 } 4628 4629 static void handle_stripe(struct stripe_head *sh) 4630 { 4631 struct stripe_head_state s; 4632 struct r5conf *conf = sh->raid_conf; 4633 int i; 4634 int prexor; 4635 int disks = sh->disks; 4636 struct r5dev *pdev, *qdev; 4637 4638 clear_bit(STRIPE_HANDLE, &sh->state); 4639 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4640 /* already being handled, ensure it gets handled 4641 * again when the current action finishes */ 4642 set_bit(STRIPE_HANDLE, &sh->state); 4643 return; 4644 } 4645 4646 if (clear_batch_ready(sh)) { 4647 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4648 return; 4649 } 4650 4651 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4652 break_stripe_batch_list(sh, 0); 4653 4654 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4655 spin_lock(&sh->stripe_lock); 4656 /* Cannot process 'sync' concurrently with 'discard' */ 4657 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4658 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4659 set_bit(STRIPE_SYNCING, &sh->state); 4660 clear_bit(STRIPE_INSYNC, &sh->state); 4661 clear_bit(STRIPE_REPLACED, &sh->state); 4662 } 4663 spin_unlock(&sh->stripe_lock); 4664 } 4665 clear_bit(STRIPE_DELAYED, &sh->state); 4666 4667 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4668 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 4669 (unsigned long
long)sh->sector, sh->state, 4670 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4671 sh->check_state, sh->reconstruct_state); 4672 4673 analyse_stripe(sh, &s); 4674 4675 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4676 goto finish; 4677 4678 if (s.handle_bad_blocks || 4679 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4680 set_bit(STRIPE_HANDLE, &sh->state); 4681 goto finish; 4682 } 4683 4684 if (unlikely(s.blocked_rdev)) { 4685 if (s.syncing || s.expanding || s.expanded || 4686 s.replacing || s.to_write || s.written) { 4687 set_bit(STRIPE_HANDLE, &sh->state); 4688 goto finish; 4689 } 4690 /* There is nothing for the blocked_rdev to block */ 4691 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4692 s.blocked_rdev = NULL; 4693 } 4694 4695 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4696 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4697 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4698 } 4699 4700 pr_debug("locked=%d uptodate=%d to_read=%d" 4701 " to_write=%d failed=%d failed_num=%d,%d\n", 4702 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4703 s.failed_num[0], s.failed_num[1]); 4704 /* check if the array has lost more than max_degraded devices and, 4705 * if so, some requests might need to be failed. 4706 */ 4707 if (s.failed > conf->max_degraded || s.log_failed) { 4708 sh->check_state = 0; 4709 sh->reconstruct_state = 0; 4710 break_stripe_batch_list(sh, 0); 4711 if (s.to_read+s.to_write+s.written) 4712 handle_failed_stripe(conf, sh, &s, disks); 4713 if (s.syncing + s.replacing) 4714 handle_failed_sync(conf, sh, &s); 4715 } 4716 4717 /* Now we check to see if any write operations have recently 4718 * completed 4719 */ 4720 prexor = 0; 4721 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4722 prexor = 1; 4723 if (sh->reconstruct_state == reconstruct_state_drain_result || 4724 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4725 sh->reconstruct_state = reconstruct_state_idle; 4726 4727 /* All the 'written' buffers and the parity block are ready to 4728 * be written back to disk 4729 */ 4730 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4731 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4732 BUG_ON(sh->qd_idx >= 0 && 4733 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4734 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4735 for (i = disks; i--; ) { 4736 struct r5dev *dev = &sh->dev[i]; 4737 if (test_bit(R5_LOCKED, &dev->flags) && 4738 (i == sh->pd_idx || i == sh->qd_idx || 4739 dev->written || test_bit(R5_InJournal, 4740 &dev->flags))) { 4741 pr_debug("Writing block %d\n", i); 4742 set_bit(R5_Wantwrite, &dev->flags); 4743 if (prexor) 4744 continue; 4745 if (s.failed > 1) 4746 continue; 4747 if (!test_bit(R5_Insync, &dev->flags) || 4748 ((i == sh->pd_idx || i == sh->qd_idx) && 4749 s.failed == 0)) 4750 set_bit(STRIPE_INSYNC, &sh->state); 4751 } 4752 } 4753 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4754 s.dec_preread_active = 1; 4755 } 4756 4757 /* 4758 * might be able to return some write requests if the parity blocks 4759 * are safe, or on a failed drive 4760 */ 4761 pdev = &sh->dev[sh->pd_idx]; 4762 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4763 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4764 qdev = &sh->dev[sh->qd_idx]; 4765 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4766 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4767 || conf->level < 6; 4768 4769 if (s.written && 4770 (s.p_failed || ((test_bit(R5_Insync, 
&pdev->flags) 4771 && !test_bit(R5_LOCKED, &pdev->flags) 4772 && (test_bit(R5_UPTODATE, &pdev->flags) || 4773 test_bit(R5_Discard, &pdev->flags))))) && 4774 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4775 && !test_bit(R5_LOCKED, &qdev->flags) 4776 && (test_bit(R5_UPTODATE, &qdev->flags) || 4777 test_bit(R5_Discard, &qdev->flags)))))) 4778 handle_stripe_clean_event(conf, sh, disks); 4779 4780 if (s.just_cached) 4781 r5c_handle_cached_data_endio(conf, sh, disks); 4782 log_stripe_write_finished(sh); 4783 4784 /* Now we might consider reading some blocks, either to check/generate 4785 * parity, or to satisfy requests 4786 * or to load a block that is being partially written. 4787 */ 4788 if (s.to_read || s.non_overwrite 4789 || (conf->level == 6 && s.to_write && s.failed) 4790 || (s.syncing && (s.uptodate + s.compute < disks)) 4791 || s.replacing 4792 || s.expanding) 4793 handle_stripe_fill(sh, &s, disks); 4794 4795 /* 4796 * When the stripe finishes full journal write cycle (write to journal 4797 * and raid disk), this is the clean up procedure so it is ready for 4798 * next operation. 4799 */ 4800 r5c_finish_stripe_write_out(conf, sh, &s); 4801 4802 /* 4803 * Now to consider new write requests, cache write back and what else, 4804 * if anything should be read. We do not handle new writes when: 4805 * 1/ A 'write' operation (copy+xor) is already in flight. 4806 * 2/ A 'check' operation is in flight, as it may clobber the parity 4807 * block. 4808 * 3/ A r5c cache log write is in flight. 4809 */ 4810 4811 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4812 if (!r5c_is_writeback(conf->log)) { 4813 if (s.to_write) 4814 handle_stripe_dirtying(conf, sh, &s, disks); 4815 } else { /* write back cache */ 4816 int ret = 0; 4817 4818 /* First, try handle writes in caching phase */ 4819 if (s.to_write) 4820 ret = r5c_try_caching_write(conf, sh, &s, 4821 disks); 4822 /* 4823 * If caching phase failed: ret == -EAGAIN 4824 * OR 4825 * stripe under reclaim: !caching && injournal 4826 * 4827 * fall back to handle_stripe_dirtying() 4828 */ 4829 if (ret == -EAGAIN || 4830 /* stripe under reclaim: !caching && injournal */ 4831 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4832 s.injournal > 0)) { 4833 ret = handle_stripe_dirtying(conf, sh, &s, 4834 disks); 4835 if (ret == -EAGAIN) 4836 goto finish; 4837 } 4838 } 4839 } 4840 4841 /* maybe we need to check and possibly fix the parity for this stripe 4842 * Any reads will already have been scheduled, so we just see if enough 4843 * data is available. The parity check is held off while parity 4844 * dependent operations are in flight. 
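 *
 * Concretely, the check runs below only when a check state machine is
 * already active, or when we are syncing with nothing locked, no
 * compute in flight (STRIPE_COMPUTE_RUN clear) and the stripe not yet
 * marked STRIPE_INSYNC.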
4845 */ 4846 if (sh->check_state || 4847 (s.syncing && s.locked == 0 && 4848 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4849 !test_bit(STRIPE_INSYNC, &sh->state))) { 4850 if (conf->level == 6) 4851 handle_parity_checks6(conf, sh, &s, disks); 4852 else 4853 handle_parity_checks5(conf, sh, &s, disks); 4854 } 4855 4856 if ((s.replacing || s.syncing) && s.locked == 0 4857 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4858 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4859 /* Write out to replacement devices where possible */ 4860 for (i = 0; i < conf->raid_disks; i++) 4861 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4862 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4863 set_bit(R5_WantReplace, &sh->dev[i].flags); 4864 set_bit(R5_LOCKED, &sh->dev[i].flags); 4865 s.locked++; 4866 } 4867 if (s.replacing) 4868 set_bit(STRIPE_INSYNC, &sh->state); 4869 set_bit(STRIPE_REPLACED, &sh->state); 4870 } 4871 if ((s.syncing || s.replacing) && s.locked == 0 && 4872 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4873 test_bit(STRIPE_INSYNC, &sh->state)) { 4874 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4875 clear_bit(STRIPE_SYNCING, &sh->state); 4876 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4877 wake_up(&conf->wait_for_overlap); 4878 } 4879 4880 /* If the failed drives are just a ReadError, then we might need 4881 * to progress the repair/check process 4882 */ 4883 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4884 for (i = 0; i < s.failed; i++) { 4885 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4886 if (test_bit(R5_ReadError, &dev->flags) 4887 && !test_bit(R5_LOCKED, &dev->flags) 4888 && test_bit(R5_UPTODATE, &dev->flags) 4889 ) { 4890 if (!test_bit(R5_ReWrite, &dev->flags)) { 4891 set_bit(R5_Wantwrite, &dev->flags); 4892 set_bit(R5_ReWrite, &dev->flags); 4893 set_bit(R5_LOCKED, &dev->flags); 4894 s.locked++; 4895 } else { 4896 /* let's read it back */ 4897 set_bit(R5_Wantread, &dev->flags); 4898 set_bit(R5_LOCKED, &dev->flags); 4899 s.locked++; 4900 } 4901 } 4902 } 4903 4904 /* Finish reconstruct operations initiated by the expansion process */ 4905 if (sh->reconstruct_state == reconstruct_state_result) { 4906 struct stripe_head *sh_src 4907 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4908 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4909 /* sh cannot be written until sh_src has been read. 
4910 * so arrange for sh to be delayed a little 4911 */ 4912 set_bit(STRIPE_DELAYED, &sh->state); 4913 set_bit(STRIPE_HANDLE, &sh->state); 4914 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4915 &sh_src->state)) 4916 atomic_inc(&conf->preread_active_stripes); 4917 raid5_release_stripe(sh_src); 4918 goto finish; 4919 } 4920 if (sh_src) 4921 raid5_release_stripe(sh_src); 4922 4923 sh->reconstruct_state = reconstruct_state_idle; 4924 clear_bit(STRIPE_EXPANDING, &sh->state); 4925 for (i = conf->raid_disks; i--; ) { 4926 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4927 set_bit(R5_LOCKED, &sh->dev[i].flags); 4928 s.locked++; 4929 } 4930 } 4931 4932 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4933 !sh->reconstruct_state) { 4934 /* Need to write out all blocks after computing parity */ 4935 sh->disks = conf->raid_disks; 4936 stripe_set_idx(sh->sector, conf, 0, sh); 4937 schedule_reconstruction(sh, &s, 1, 1); 4938 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4939 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4940 atomic_dec(&conf->reshape_stripes); 4941 wake_up(&conf->wait_for_overlap); 4942 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4943 } 4944 4945 if (s.expanding && s.locked == 0 && 4946 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4947 handle_stripe_expansion(conf, sh); 4948 4949 finish: 4950 /* wait for this device to become unblocked */ 4951 if (unlikely(s.blocked_rdev)) { 4952 if (conf->mddev->external) 4953 md_wait_for_blocked_rdev(s.blocked_rdev, 4954 conf->mddev); 4955 else 4956 /* Internal metadata will immediately 4957 * be written by raid5d, so we don't 4958 * need to wait here. 4959 */ 4960 rdev_dec_pending(s.blocked_rdev, 4961 conf->mddev); 4962 } 4963 4964 if (s.handle_bad_blocks) 4965 for (i = disks; i--; ) { 4966 struct md_rdev *rdev; 4967 struct r5dev *dev = &sh->dev[i]; 4968 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4969 /* We own a safe reference to the rdev */ 4970 rdev = conf->disks[i].rdev; 4971 if (!rdev_set_badblocks(rdev, sh->sector, 4972 STRIPE_SECTORS, 0)) 4973 md_error(conf->mddev, rdev); 4974 rdev_dec_pending(rdev, conf->mddev); 4975 } 4976 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4977 rdev = conf->disks[i].rdev; 4978 rdev_clear_badblocks(rdev, sh->sector, 4979 STRIPE_SECTORS, 0); 4980 rdev_dec_pending(rdev, conf->mddev); 4981 } 4982 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 4983 rdev = conf->disks[i].replacement; 4984 if (!rdev) 4985 /* rdev have been moved down */ 4986 rdev = conf->disks[i].rdev; 4987 rdev_clear_badblocks(rdev, sh->sector, 4988 STRIPE_SECTORS, 0); 4989 rdev_dec_pending(rdev, conf->mddev); 4990 } 4991 } 4992 4993 if (s.ops_request) 4994 raid_run_ops(sh, s.ops_request); 4995 4996 ops_run_io(sh, &s); 4997 4998 if (s.dec_preread_active) { 4999 /* We delay this until after ops_run_io so that if make_request 5000 * is waiting on a flush, it won't continue until the writes 5001 * have actually been submitted. 
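 * (At this point ops_run_io() has already issued the bios, so dropping
 * preread_active_stripes below IO_THRESHOLD and waking the raid5d
 * thread cannot let a waiting flush proceed ahead of the writes it is
 * waiting for.)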
5002 */ 5003 atomic_dec(&conf->preread_active_stripes); 5004 if (atomic_read(&conf->preread_active_stripes) < 5005 IO_THRESHOLD) 5006 md_wakeup_thread(conf->mddev->thread); 5007 } 5008 5009 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5010 } 5011 5012 static void raid5_activate_delayed(struct r5conf *conf) 5013 { 5014 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5015 while (!list_empty(&conf->delayed_list)) { 5016 struct list_head *l = conf->delayed_list.next; 5017 struct stripe_head *sh; 5018 sh = list_entry(l, struct stripe_head, lru); 5019 list_del_init(l); 5020 clear_bit(STRIPE_DELAYED, &sh->state); 5021 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5022 atomic_inc(&conf->preread_active_stripes); 5023 list_add_tail(&sh->lru, &conf->hold_list); 5024 raid5_wakeup_stripe_thread(sh); 5025 } 5026 } 5027 } 5028 5029 static void activate_bit_delay(struct r5conf *conf, 5030 struct list_head *temp_inactive_list) 5031 { 5032 /* device_lock is held */ 5033 struct list_head head; 5034 list_add(&head, &conf->bitmap_list); 5035 list_del_init(&conf->bitmap_list); 5036 while (!list_empty(&head)) { 5037 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5038 int hash; 5039 list_del_init(&sh->lru); 5040 atomic_inc(&sh->count); 5041 hash = sh->hash_lock_index; 5042 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5043 } 5044 } 5045 5046 static int raid5_congested(struct mddev *mddev, int bits) 5047 { 5048 struct r5conf *conf = mddev->private; 5049 5050 /* No difference between reads and writes. Just check 5051 * how busy the stripe_cache is 5052 */ 5053 5054 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 5055 return 1; 5056 5057 /* Also checks whether there is pressure on r5cache log space */ 5058 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 5059 return 1; 5060 if (conf->quiesce) 5061 return 1; 5062 if (atomic_read(&conf->empty_inactive_list_nr)) 5063 return 1; 5064 5065 return 0; 5066 } 5067 5068 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5069 { 5070 struct r5conf *conf = mddev->private; 5071 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 5072 unsigned int chunk_sectors; 5073 unsigned int bio_sectors = bio_sectors(bio); 5074 5075 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5076 return chunk_sectors >= 5077 ((sector & (chunk_sectors - 1)) + bio_sectors); 5078 } 5079 5080 /* 5081 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5082 * later sampled by raid5d. 
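 * The list is a singly-linked LIFO threaded through bi->bi_next;
 * remove_bio_from_retry() below pops entries one at a time from
 * raid5d context.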
5083 */ 5084 static void add_bio_to_retry(struct bio *bi, struct r5conf *conf) 5085 { 5086 unsigned long flags; 5087 5088 spin_lock_irqsave(&conf->device_lock, flags); 5089 5090 bi->bi_next = conf->retry_read_aligned_list; 5091 conf->retry_read_aligned_list = bi; 5092 5093 spin_unlock_irqrestore(&conf->device_lock, flags); 5094 md_wakeup_thread(conf->mddev->thread); 5095 } 5096 5097 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5098 unsigned int *offset) 5099 { 5100 struct bio *bi; 5101 5102 bi = conf->retry_read_aligned; 5103 if (bi) { 5104 *offset = conf->retry_read_offset; 5105 conf->retry_read_aligned = NULL; 5106 return bi; 5107 } 5108 bi = conf->retry_read_aligned_list; 5109 if (bi) { 5110 conf->retry_read_aligned_list = bi->bi_next; 5111 bi->bi_next = NULL; 5112 *offset = 0; 5113 } 5114 5115 return bi; 5116 } 5117 5118 /* 5119 * raid5_align_endio() checks whether the read succeeded and, if it did, 5120 * calls bio_endio on the original bio (having bio_put the new bio first). 5121 * If the read failed, the original bio is added to the retry list via 5122 * add_bio_to_retry() and handled later by raid5d. 5123 */ 5124 static void raid5_align_endio(struct bio *bi) 5125 { 5126 struct bio *raid_bi = bi->bi_private; 5127 struct mddev *mddev; 5128 struct r5conf *conf; 5129 struct md_rdev *rdev; 5130 int error = bi->bi_error; 5131 5132 bio_put(bi); 5133 5134 rdev = (void*)raid_bi->bi_next; 5135 raid_bi->bi_next = NULL; 5136 mddev = rdev->mddev; 5137 conf = mddev->private; 5138 5139 rdev_dec_pending(rdev, conf->mddev); 5140 5141 if (!error) { 5142 bio_endio(raid_bi); 5143 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5144 wake_up(&conf->wait_for_quiescent); 5145 return; 5146 } 5147 5148 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5149 5150 add_bio_to_retry(raid_bi, conf); 5151 } 5152 5153 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5154 { 5155 struct r5conf *conf = mddev->private; 5156 int dd_idx; 5157 struct bio *align_bi; 5158 struct md_rdev *rdev; 5159 sector_t end_sector; 5160 5161 if (!in_chunk_boundary(mddev, raid_bio)) { 5162 pr_debug("%s: non aligned\n", __func__); 5163 return 0; 5164 } 5165 /* 5166 * use bio_clone_fast to make a copy of the bio 5167 */ 5168 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); 5169 if (!align_bi) 5170 return 0; 5171 /* 5172 * set bi_end_io to a new function, and set bi_private to the 5173 * original bio.
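 *
 * raid5_align_endio() above relies on this: bi_private recovers the
 * original bio, and the original bio's bi_next (pointed at the chosen
 * rdev further down) tells it which device's nr_pending count to drop.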
5174 */ 5175 align_bi->bi_end_io = raid5_align_endio; 5176 align_bi->bi_private = raid_bio; 5177 /* 5178 * compute position 5179 */ 5180 align_bi->bi_iter.bi_sector = 5181 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5182 0, &dd_idx, NULL); 5183 5184 end_sector = bio_end_sector(align_bi); 5185 rcu_read_lock(); 5186 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5187 if (!rdev || test_bit(Faulty, &rdev->flags) || 5188 rdev->recovery_offset < end_sector) { 5189 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5190 if (rdev && 5191 (test_bit(Faulty, &rdev->flags) || 5192 !(test_bit(In_sync, &rdev->flags) || 5193 rdev->recovery_offset >= end_sector))) 5194 rdev = NULL; 5195 } 5196 5197 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5198 rcu_read_unlock(); 5199 bio_put(align_bi); 5200 return 0; 5201 } 5202 5203 if (rdev) { 5204 sector_t first_bad; 5205 int bad_sectors; 5206 5207 atomic_inc(&rdev->nr_pending); 5208 rcu_read_unlock(); 5209 raid_bio->bi_next = (void*)rdev; 5210 align_bi->bi_bdev = rdev->bdev; 5211 bio_clear_flag(align_bi, BIO_SEG_VALID); 5212 5213 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5214 bio_sectors(align_bi), 5215 &first_bad, &bad_sectors)) { 5216 bio_put(align_bi); 5217 rdev_dec_pending(rdev, mddev); 5218 return 0; 5219 } 5220 5221 /* No reshape active, so we can trust rdev->data_offset */ 5222 align_bi->bi_iter.bi_sector += rdev->data_offset; 5223 5224 spin_lock_irq(&conf->device_lock); 5225 wait_event_lock_irq(conf->wait_for_quiescent, 5226 conf->quiesce == 0, 5227 conf->device_lock); 5228 atomic_inc(&conf->active_aligned_reads); 5229 spin_unlock_irq(&conf->device_lock); 5230 5231 if (mddev->gendisk) 5232 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 5233 align_bi, disk_devt(mddev->gendisk), 5234 raid_bio->bi_iter.bi_sector); 5235 generic_make_request(align_bi); 5236 return 1; 5237 } else { 5238 rcu_read_unlock(); 5239 bio_put(align_bi); 5240 return 0; 5241 } 5242 } 5243 5244 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5245 { 5246 struct bio *split; 5247 sector_t sector = raid_bio->bi_iter.bi_sector; 5248 unsigned chunk_sects = mddev->chunk_sectors; 5249 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5250 5251 if (sectors < bio_sectors(raid_bio)) { 5252 struct r5conf *conf = mddev->private; 5253 split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); 5254 bio_chain(split, raid_bio); 5255 generic_make_request(raid_bio); 5256 raid_bio = split; 5257 } 5258 5259 if (!raid5_read_one_chunk(mddev, raid_bio)) 5260 return raid_bio; 5261 5262 return NULL; 5263 } 5264 5265 /* __get_priority_stripe - get the next stripe to process 5266 * 5267 * Full stripe writes are allowed to pass preread active stripes up until 5268 * the bypass_threshold is exceeded. In general the bypass_count 5269 * increments when the handle_list is handled before the hold_list; however, it 5270 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5271 * stripe with in flight i/o. The bypass_count will be reset when the 5272 * head of the hold_list has changed, i.e. the head was promoted to the 5273 * handle_list. 
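 *
 * What follows is a hedged sketch (illustration only; this helper is
 * not part of the driver and its name is made up): the test used below
 * to decide whether a preread stripe may be taken from the hold_list
 * instead of the handle_list.
 */
static inline bool r5_example_may_service_hold_list(struct r5conf *conf)
{
	/* either enough handle_list passes have bypassed the hold_list,
	 * or no full-stripe write is pending that could justify bypassing
	 */
	return !list_empty(&conf->hold_list) &&
	       ((conf->bypass_threshold &&
		 conf->bypass_count > conf->bypass_threshold) ||
		atomic_read(&conf->pending_full_writes) == 0);
}
/* __get_priority_stripe (continued):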
5274 */ 5275 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5276 { 5277 struct stripe_head *sh, *tmp; 5278 struct list_head *handle_list = NULL; 5279 struct r5worker_group *wg; 5280 bool second_try = !r5c_is_writeback(conf->log); 5281 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); 5282 5283 again: 5284 wg = NULL; 5285 sh = NULL; 5286 if (conf->worker_cnt_per_group == 0) { 5287 handle_list = try_loprio ? &conf->loprio_list : 5288 &conf->handle_list; 5289 } else if (group != ANY_GROUP) { 5290 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5291 &conf->worker_groups[group].handle_list; 5292 wg = &conf->worker_groups[group]; 5293 } else { 5294 int i; 5295 for (i = 0; i < conf->group_cnt; i++) { 5296 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5297 &conf->worker_groups[i].handle_list; 5298 wg = &conf->worker_groups[i]; 5299 if (!list_empty(handle_list)) 5300 break; 5301 } 5302 } 5303 5304 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5305 __func__, 5306 list_empty(handle_list) ? "empty" : "busy", 5307 list_empty(&conf->hold_list) ? "empty" : "busy", 5308 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5309 5310 if (!list_empty(handle_list)) { 5311 sh = list_entry(handle_list->next, typeof(*sh), lru); 5312 5313 if (list_empty(&conf->hold_list)) 5314 conf->bypass_count = 0; 5315 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5316 if (conf->hold_list.next == conf->last_hold) 5317 conf->bypass_count++; 5318 else { 5319 conf->last_hold = conf->hold_list.next; 5320 conf->bypass_count -= conf->bypass_threshold; 5321 if (conf->bypass_count < 0) 5322 conf->bypass_count = 0; 5323 } 5324 } 5325 } else if (!list_empty(&conf->hold_list) && 5326 ((conf->bypass_threshold && 5327 conf->bypass_count > conf->bypass_threshold) || 5328 atomic_read(&conf->pending_full_writes) == 0)) { 5329 5330 list_for_each_entry(tmp, &conf->hold_list, lru) { 5331 if (conf->worker_cnt_per_group == 0 || 5332 group == ANY_GROUP || 5333 !cpu_online(tmp->cpu) || 5334 cpu_to_group(tmp->cpu) == group) { 5335 sh = tmp; 5336 break; 5337 } 5338 } 5339 5340 if (sh) { 5341 conf->bypass_count -= conf->bypass_threshold; 5342 if (conf->bypass_count < 0) 5343 conf->bypass_count = 0; 5344 } 5345 wg = NULL; 5346 } 5347 5348 if (!sh) { 5349 if (second_try) 5350 return NULL; 5351 second_try = true; 5352 try_loprio = !try_loprio; 5353 goto again; 5354 } 5355 5356 if (wg) { 5357 wg->stripes_cnt--; 5358 sh->group = NULL; 5359 } 5360 list_del_init(&sh->lru); 5361 BUG_ON(atomic_inc_return(&sh->count) != 1); 5362 return sh; 5363 } 5364 5365 struct raid5_plug_cb { 5366 struct blk_plug_cb cb; 5367 struct list_head list; 5368 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5369 }; 5370 5371 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5372 { 5373 struct raid5_plug_cb *cb = container_of( 5374 blk_cb, struct raid5_plug_cb, cb); 5375 struct stripe_head *sh; 5376 struct mddev *mddev = cb->cb.data; 5377 struct r5conf *conf = mddev->private; 5378 int cnt = 0; 5379 int hash; 5380 5381 if (cb->list.next && !list_empty(&cb->list)) { 5382 spin_lock_irq(&conf->device_lock); 5383 while (!list_empty(&cb->list)) { 5384 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5385 list_del_init(&sh->lru); 5386 /* 5387 * avoid race release_stripe_plug() sees 5388 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5389 * is still in our list 5390 */ 5391 smp_mb__before_atomic(); 5392 
clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5393 /* 5394 * STRIPE_ON_RELEASE_LIST could be set here. In that 5395 * case, the count is always > 1 here 5396 */ 5397 hash = sh->hash_lock_index; 5398 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5399 cnt++; 5400 } 5401 spin_unlock_irq(&conf->device_lock); 5402 } 5403 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5404 NR_STRIPE_HASH_LOCKS); 5405 if (mddev->queue) 5406 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5407 kfree(cb); 5408 } 5409 5410 static void release_stripe_plug(struct mddev *mddev, 5411 struct stripe_head *sh) 5412 { 5413 struct blk_plug_cb *blk_cb = blk_check_plugged( 5414 raid5_unplug, mddev, 5415 sizeof(struct raid5_plug_cb)); 5416 struct raid5_plug_cb *cb; 5417 5418 if (!blk_cb) { 5419 raid5_release_stripe(sh); 5420 return; 5421 } 5422 5423 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5424 5425 if (cb->list.next == NULL) { 5426 int i; 5427 INIT_LIST_HEAD(&cb->list); 5428 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5429 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5430 } 5431 5432 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5433 list_add_tail(&sh->lru, &cb->list); 5434 else 5435 raid5_release_stripe(sh); 5436 } 5437 5438 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5439 { 5440 struct r5conf *conf = mddev->private; 5441 sector_t logical_sector, last_sector; 5442 struct stripe_head *sh; 5443 int stripe_sectors; 5444 5445 if (mddev->reshape_position != MaxSector) 5446 /* Skip discard while reshape is happening */ 5447 return; 5448 5449 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5450 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5451 5452 bi->bi_next = NULL; 5453 md_write_start(mddev, bi); 5454 5455 stripe_sectors = conf->chunk_sectors * 5456 (conf->raid_disks - conf->max_degraded); 5457 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5458 stripe_sectors); 5459 sector_div(last_sector, stripe_sectors); 5460 5461 logical_sector *= conf->chunk_sectors; 5462 last_sector *= conf->chunk_sectors; 5463 5464 for (; logical_sector < last_sector; 5465 logical_sector += STRIPE_SECTORS) { 5466 DEFINE_WAIT(w); 5467 int d; 5468 again: 5469 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5470 prepare_to_wait(&conf->wait_for_overlap, &w, 5471 TASK_UNINTERRUPTIBLE); 5472 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5473 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5474 raid5_release_stripe(sh); 5475 schedule(); 5476 goto again; 5477 } 5478 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5479 spin_lock_irq(&sh->stripe_lock); 5480 for (d = 0; d < conf->raid_disks; d++) { 5481 if (d == sh->pd_idx || d == sh->qd_idx) 5482 continue; 5483 if (sh->dev[d].towrite || sh->dev[d].toread) { 5484 set_bit(R5_Overlap, &sh->dev[d].flags); 5485 spin_unlock_irq(&sh->stripe_lock); 5486 raid5_release_stripe(sh); 5487 schedule(); 5488 goto again; 5489 } 5490 } 5491 set_bit(STRIPE_DISCARD, &sh->state); 5492 finish_wait(&conf->wait_for_overlap, &w); 5493 sh->overwrite_disks = 0; 5494 for (d = 0; d < conf->raid_disks; d++) { 5495 if (d == sh->pd_idx || d == sh->qd_idx) 5496 continue; 5497 sh->dev[d].towrite = bi; 5498 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5499 bio_inc_remaining(bi); 5500 md_write_inc(mddev, bi); 5501 sh->overwrite_disks++; 5502 } 5503 spin_unlock_irq(&sh->stripe_lock); 5504 if (conf->mddev->bitmap) { 5505 for (d = 0; 5506 d < conf->raid_disks - conf->max_degraded; 5507 d++) 5508 
bitmap_startwrite(mddev->bitmap, 5509 sh->sector, 5510 STRIPE_SECTORS, 5511 0); 5512 sh->bm_seq = conf->seq_flush + 1; 5513 set_bit(STRIPE_BIT_DELAY, &sh->state); 5514 } 5515 5516 set_bit(STRIPE_HANDLE, &sh->state); 5517 clear_bit(STRIPE_DELAYED, &sh->state); 5518 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5519 atomic_inc(&conf->preread_active_stripes); 5520 release_stripe_plug(mddev, sh); 5521 } 5522 5523 md_write_end(mddev); 5524 bio_endio(bi); 5525 } 5526 5527 static void raid5_make_request(struct mddev *mddev, struct bio * bi) 5528 { 5529 struct r5conf *conf = mddev->private; 5530 int dd_idx; 5531 sector_t new_sector; 5532 sector_t logical_sector, last_sector; 5533 struct stripe_head *sh; 5534 const int rw = bio_data_dir(bi); 5535 DEFINE_WAIT(w); 5536 bool do_prepare; 5537 bool do_flush = false; 5538 5539 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5540 int ret = r5l_handle_flush_request(conf->log, bi); 5541 5542 if (ret == 0) 5543 return; 5544 if (ret == -ENODEV) { 5545 md_flush_request(mddev, bi); 5546 return; 5547 } 5548 /* ret == -EAGAIN, fallback */ 5549 /* 5550 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5551 * we need to flush journal device 5552 */ 5553 do_flush = bi->bi_opf & REQ_PREFLUSH; 5554 } 5555 5556 /* 5557 * If array is degraded, better not do chunk aligned read because 5558 * later we might have to read it again in order to reconstruct 5559 * data on failed drives. 5560 */ 5561 if (rw == READ && mddev->degraded == 0 && 5562 mddev->reshape_position == MaxSector) { 5563 bi = chunk_aligned_read(mddev, bi); 5564 if (!bi) 5565 return; 5566 } 5567 5568 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5569 make_discard_request(mddev, bi); 5570 return; 5571 } 5572 5573 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5574 last_sector = bio_end_sector(bi); 5575 bi->bi_next = NULL; 5576 md_write_start(mddev, bi); 5577 5578 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5579 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5580 int previous; 5581 int seq; 5582 5583 do_prepare = false; 5584 retry: 5585 seq = read_seqcount_begin(&conf->gen_lock); 5586 previous = 0; 5587 if (do_prepare) 5588 prepare_to_wait(&conf->wait_for_overlap, &w, 5589 TASK_UNINTERRUPTIBLE); 5590 if (unlikely(conf->reshape_progress != MaxSector)) { 5591 /* spinlock is needed as reshape_progress may be 5592 * 64bit on a 32bit platform, and so it might be 5593 * possible to see a half-updated value 5594 * Of course reshape_progress could change after 5595 * the lock is dropped, so once we get a reference 5596 * to the stripe that we think it is, we will have 5597 * to check again. 5598 */ 5599 spin_lock_irq(&conf->device_lock); 5600 if (mddev->reshape_backwards 5601 ? logical_sector < conf->reshape_progress 5602 : logical_sector >= conf->reshape_progress) { 5603 previous = 1; 5604 } else { 5605 if (mddev->reshape_backwards 5606 ? 
logical_sector < conf->reshape_safe 5607 : logical_sector >= conf->reshape_safe) { 5608 spin_unlock_irq(&conf->device_lock); 5609 schedule(); 5610 do_prepare = true; 5611 goto retry; 5612 } 5613 } 5614 spin_unlock_irq(&conf->device_lock); 5615 } 5616 5617 new_sector = raid5_compute_sector(conf, logical_sector, 5618 previous, 5619 &dd_idx, NULL); 5620 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5621 (unsigned long long)new_sector, 5622 (unsigned long long)logical_sector); 5623 5624 sh = raid5_get_active_stripe(conf, new_sector, previous, 5625 (bi->bi_opf & REQ_RAHEAD), 0); 5626 if (sh) { 5627 if (unlikely(previous)) { 5628 /* expansion might have moved on while waiting for a 5629 * stripe, so we must do the range check again. 5630 * Expansion could still move past after this 5631 * test, but as we are holding a reference to 5632 * 'sh', we know that if that happens, 5633 * STRIPE_EXPANDING will get set and the expansion 5634 * won't proceed until we finish with the stripe. 5635 */ 5636 int must_retry = 0; 5637 spin_lock_irq(&conf->device_lock); 5638 if (mddev->reshape_backwards 5639 ? logical_sector >= conf->reshape_progress 5640 : logical_sector < conf->reshape_progress) 5641 /* mismatch, need to try again */ 5642 must_retry = 1; 5643 spin_unlock_irq(&conf->device_lock); 5644 if (must_retry) { 5645 raid5_release_stripe(sh); 5646 schedule(); 5647 do_prepare = true; 5648 goto retry; 5649 } 5650 } 5651 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5652 /* Might have got the wrong stripe_head 5653 * by accident 5654 */ 5655 raid5_release_stripe(sh); 5656 goto retry; 5657 } 5658 5659 if (rw == WRITE && 5660 logical_sector >= mddev->suspend_lo && 5661 logical_sector < mddev->suspend_hi) { 5662 raid5_release_stripe(sh); 5663 /* As the suspend_* range is controlled by 5664 * userspace, we want an interruptible 5665 * wait. 5666 */ 5667 flush_signals(current); 5668 prepare_to_wait(&conf->wait_for_overlap, 5669 &w, TASK_INTERRUPTIBLE); 5670 if (logical_sector >= mddev->suspend_lo && 5671 logical_sector < mddev->suspend_hi) { 5672 schedule(); 5673 do_prepare = true; 5674 } 5675 goto retry; 5676 } 5677 5678 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5679 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5680 /* Stripe is busy expanding or 5681 * add failed due to overlap. Flush everything 5682 * and wait a while 5683 */ 5684 md_wakeup_thread(mddev->thread); 5685 raid5_release_stripe(sh); 5686 schedule(); 5687 do_prepare = true; 5688 goto retry; 5689 } 5690 if (do_flush) { 5691 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5692 /* we only need flush for one stripe */ 5693 do_flush = false; 5694 } 5695 5696 set_bit(STRIPE_HANDLE, &sh->state); 5697 clear_bit(STRIPE_DELAYED, &sh->state); 5698 if ((!sh->batch_head || sh == sh->batch_head) && 5699 (bi->bi_opf & REQ_SYNC) && 5700 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5701 atomic_inc(&conf->preread_active_stripes); 5702 release_stripe_plug(mddev, sh); 5703 } else { 5704 /* cannot get stripe for read-ahead, just give-up */ 5705 bi->bi_error = -EIO; 5706 break; 5707 } 5708 } 5709 finish_wait(&conf->wait_for_overlap, &w); 5710 5711 if (rw == WRITE) 5712 md_write_end(mddev); 5713 bio_endio(bi); 5714 } 5715 5716 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5717 5718 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5719 { 5720 /* reshaping is quite different to recovery/resync so it is 5721 * handled quite separately ... here. 
5722 * 5723 * On each call to sync_request, we gather one chunk worth of 5724 * destination stripes and flag them as expanding. 5725 * Then we find all the source stripes and request reads. 5726 * As the reads complete, handle_stripe will copy the data 5727 * into the destination stripe and release that stripe. 5728 */ 5729 struct r5conf *conf = mddev->private; 5730 struct stripe_head *sh; 5731 sector_t first_sector, last_sector; 5732 int raid_disks = conf->previous_raid_disks; 5733 int data_disks = raid_disks - conf->max_degraded; 5734 int new_data_disks = conf->raid_disks - conf->max_degraded; 5735 int i; 5736 int dd_idx; 5737 sector_t writepos, readpos, safepos; 5738 sector_t stripe_addr; 5739 int reshape_sectors; 5740 struct list_head stripes; 5741 sector_t retn; 5742 5743 if (sector_nr == 0) { 5744 /* If restarting in the middle, skip the initial sectors */ 5745 if (mddev->reshape_backwards && 5746 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5747 sector_nr = raid5_size(mddev, 0, 0) 5748 - conf->reshape_progress; 5749 } else if (mddev->reshape_backwards && 5750 conf->reshape_progress == MaxSector) { 5751 /* shouldn't happen, but just in case, finish up.*/ 5752 sector_nr = MaxSector; 5753 } else if (!mddev->reshape_backwards && 5754 conf->reshape_progress > 0) 5755 sector_nr = conf->reshape_progress; 5756 sector_div(sector_nr, new_data_disks); 5757 if (sector_nr) { 5758 mddev->curr_resync_completed = sector_nr; 5759 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5760 *skipped = 1; 5761 retn = sector_nr; 5762 goto finish; 5763 } 5764 } 5765 5766 /* We need to process a full chunk at a time. 5767 * If old and new chunk sizes differ, we need to process the 5768 * largest of these 5769 */ 5770 5771 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5772 5773 /* We update the metadata at least every 10 seconds, or when 5774 * the data about to be copied would over-write the source of 5775 * the data at the front of the range. i.e. one new_stripe 5776 * along from reshape_progress new_maps to after where 5777 * reshape_safe old_maps to 5778 */ 5779 writepos = conf->reshape_progress; 5780 sector_div(writepos, new_data_disks); 5781 readpos = conf->reshape_progress; 5782 sector_div(readpos, data_disks); 5783 safepos = conf->reshape_safe; 5784 sector_div(safepos, data_disks); 5785 if (mddev->reshape_backwards) { 5786 BUG_ON(writepos < reshape_sectors); 5787 writepos -= reshape_sectors; 5788 readpos += reshape_sectors; 5789 safepos += reshape_sectors; 5790 } else { 5791 writepos += reshape_sectors; 5792 /* readpos and safepos are worst-case calculations. 5793 * A negative number is overly pessimistic, and causes 5794 * obvious problems for unsigned storage. So clip to 0. 5795 */ 5796 readpos -= min_t(sector_t, reshape_sectors, readpos); 5797 safepos -= min_t(sector_t, reshape_sectors, safepos); 5798 } 5799 5800 /* Having calculated the 'writepos' possibly use it 5801 * to set 'stripe_addr' which is where we will write to. 5802 */ 5803 if (mddev->reshape_backwards) { 5804 BUG_ON(conf->reshape_progress == 0); 5805 stripe_addr = writepos; 5806 BUG_ON((mddev->dev_sectors & 5807 ~((sector_t)reshape_sectors - 1)) 5808 - reshape_sectors - stripe_addr 5809 != sector_nr); 5810 } else { 5811 BUG_ON(writepos != sector_nr + reshape_sectors); 5812 stripe_addr = sector_nr; 5813 } 5814 5815 /* 'writepos' is the most advanced device address we might write. 5816 * 'readpos' is the least advanced device address we might read. 
5817 * 'safepos' is the least address recorded in the metadata as having 5818 * been reshaped. 5819 * If there is a min_offset_diff, these are adjusted either by 5820 * increasing the safepos/readpos if diff is negative, or 5821 * increasing writepos if diff is positive. 5822 * If 'readpos' is then behind 'writepos', there is no way that we can 5823 * ensure safety in the face of a crash - that must be done by userspace 5824 * making a backup of the data. So in that case there is no particular 5825 * rush to update metadata. 5826 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5827 * update the metadata to advance 'safepos' to match 'readpos' so that 5828 * we can be safe in the event of a crash. 5829 * So we insist on updating metadata if safepos is behind writepos and 5830 * readpos is beyond writepos. 5831 * In any case, update the metadata every 10 seconds. 5832 * Maybe that number should be configurable, but I'm not sure it is 5833 * worth it.... maybe it could be a multiple of safemode_delay??? 5834 */ 5835 if (conf->min_offset_diff < 0) { 5836 safepos += -conf->min_offset_diff; 5837 readpos += -conf->min_offset_diff; 5838 } else 5839 writepos += conf->min_offset_diff; 5840 5841 if ((mddev->reshape_backwards 5842 ? (safepos > writepos && readpos < writepos) 5843 : (safepos < writepos && readpos > writepos)) || 5844 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5845 /* Cannot proceed until we've updated the superblock... */ 5846 wait_event(conf->wait_for_overlap, 5847 atomic_read(&conf->reshape_stripes)==0 5848 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5849 if (atomic_read(&conf->reshape_stripes) != 0) 5850 return 0; 5851 mddev->reshape_position = conf->reshape_progress; 5852 mddev->curr_resync_completed = sector_nr; 5853 conf->reshape_checkpoint = jiffies; 5854 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5855 md_wakeup_thread(mddev->thread); 5856 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5857 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5858 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5859 return 0; 5860 spin_lock_irq(&conf->device_lock); 5861 conf->reshape_safe = mddev->reshape_position; 5862 spin_unlock_irq(&conf->device_lock); 5863 wake_up(&conf->wait_for_overlap); 5864 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5865 } 5866 5867 INIT_LIST_HEAD(&stripes); 5868 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5869 int j; 5870 int skipped_disk = 0; 5871 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5872 set_bit(STRIPE_EXPANDING, &sh->state); 5873 atomic_inc(&conf->reshape_stripes); 5874 /* If any of this stripe is beyond the end of the old 5875 * array, then we need to zero those blocks 5876 */ 5877 for (j=sh->disks; j--;) { 5878 sector_t s; 5879 if (j == sh->pd_idx) 5880 continue; 5881 if (conf->level == 6 && 5882 j == sh->qd_idx) 5883 continue; 5884 s = raid5_compute_blocknr(sh, j, 0); 5885 if (s < raid5_size(mddev, 0, 0)) { 5886 skipped_disk = 1; 5887 continue; 5888 } 5889 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5890 set_bit(R5_Expanded, &sh->dev[j].flags); 5891 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5892 } 5893 if (!skipped_disk) { 5894 set_bit(STRIPE_EXPAND_READY, &sh->state); 5895 set_bit(STRIPE_HANDLE, &sh->state); 5896 } 5897 list_add(&sh->lru, &stripes); 5898 } 5899 spin_lock_irq(&conf->device_lock); 5900 if (mddev->reshape_backwards) 5901 conf->reshape_progress -= reshape_sectors * new_data_disks; 5902 else 5903 conf->reshape_progress += reshape_sectors * 
new_data_disks; 5904 spin_unlock_irq(&conf->device_lock); 5905 /* Ok, those stripe are ready. We can start scheduling 5906 * reads on the source stripes. 5907 * The source stripes are determined by mapping the first and last 5908 * block on the destination stripes. 5909 */ 5910 first_sector = 5911 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5912 1, &dd_idx, NULL); 5913 last_sector = 5914 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5915 * new_data_disks - 1), 5916 1, &dd_idx, NULL); 5917 if (last_sector >= mddev->dev_sectors) 5918 last_sector = mddev->dev_sectors - 1; 5919 while (first_sector <= last_sector) { 5920 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5921 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5922 set_bit(STRIPE_HANDLE, &sh->state); 5923 raid5_release_stripe(sh); 5924 first_sector += STRIPE_SECTORS; 5925 } 5926 /* Now that the sources are clearly marked, we can release 5927 * the destination stripes 5928 */ 5929 while (!list_empty(&stripes)) { 5930 sh = list_entry(stripes.next, struct stripe_head, lru); 5931 list_del_init(&sh->lru); 5932 raid5_release_stripe(sh); 5933 } 5934 /* If this takes us to the resync_max point where we have to pause, 5935 * then we need to write out the superblock. 5936 */ 5937 sector_nr += reshape_sectors; 5938 retn = reshape_sectors; 5939 finish: 5940 if (mddev->curr_resync_completed > mddev->resync_max || 5941 (sector_nr - mddev->curr_resync_completed) * 2 5942 >= mddev->resync_max - mddev->curr_resync_completed) { 5943 /* Cannot proceed until we've updated the superblock... */ 5944 wait_event(conf->wait_for_overlap, 5945 atomic_read(&conf->reshape_stripes) == 0 5946 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5947 if (atomic_read(&conf->reshape_stripes) != 0) 5948 goto ret; 5949 mddev->reshape_position = conf->reshape_progress; 5950 mddev->curr_resync_completed = sector_nr; 5951 conf->reshape_checkpoint = jiffies; 5952 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5953 md_wakeup_thread(mddev->thread); 5954 wait_event(mddev->sb_wait, 5955 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 5956 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5957 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5958 goto ret; 5959 spin_lock_irq(&conf->device_lock); 5960 conf->reshape_safe = mddev->reshape_position; 5961 spin_unlock_irq(&conf->device_lock); 5962 wake_up(&conf->wait_for_overlap); 5963 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5964 } 5965 ret: 5966 return retn; 5967 } 5968 5969 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 5970 int *skipped) 5971 { 5972 struct r5conf *conf = mddev->private; 5973 struct stripe_head *sh; 5974 sector_t max_sector = mddev->dev_sectors; 5975 sector_t sync_blocks; 5976 int still_degraded = 0; 5977 int i; 5978 5979 if (sector_nr >= max_sector) { 5980 /* just being told to finish up .. 
nothing much to do */ 5981 5982 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 5983 end_reshape(conf); 5984 return 0; 5985 } 5986 5987 if (mddev->curr_resync < max_sector) /* aborted */ 5988 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 5989 &sync_blocks, 1); 5990 else /* completed sync */ 5991 conf->fullsync = 0; 5992 bitmap_close_sync(mddev->bitmap); 5993 5994 return 0; 5995 } 5996 5997 /* Allow raid5_quiesce to complete */ 5998 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 5999 6000 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6001 return reshape_request(mddev, sector_nr, skipped); 6002 6003 /* No need to check resync_max as we never do more than one 6004 * stripe, and as resync_max will always be on a chunk boundary, 6005 * if the check in md_do_sync didn't fire, there is no chance 6006 * of overstepping resync_max here 6007 */ 6008 6009 /* if there are too many failed drives and we are trying 6010 * to resync, then assert that we are finished, because there is 6011 * nothing we can do. 6012 */ 6013 if (mddev->degraded >= conf->max_degraded && 6014 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6015 sector_t rv = mddev->dev_sectors - sector_nr; 6016 *skipped = 1; 6017 return rv; 6018 } 6019 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6020 !conf->fullsync && 6021 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6022 sync_blocks >= STRIPE_SECTORS) { 6023 /* we can skip this block, and probably more */ 6024 sync_blocks /= STRIPE_SECTORS; 6025 *skipped = 1; 6026 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 6027 } 6028 6029 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6030 6031 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 6032 if (sh == NULL) { 6033 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 6034 /* make sure we don't swamp the stripe cache if someone else 6035 * is trying to get access 6036 */ 6037 schedule_timeout_uninterruptible(1); 6038 } 6039 /* Need to check if array will still be degraded after recovery/resync 6040 * Note that in case of > 1 drive failures it's possible we're rebuilding 6041 * one drive while leaving another faulty drive in the array. 6042 */ 6043 rcu_read_lock(); 6044 for (i = 0; i < conf->raid_disks; i++) { 6045 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); 6046 6047 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6048 still_degraded = 1; 6049 } 6050 rcu_read_unlock(); 6051 6052 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6053 6054 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6055 set_bit(STRIPE_HANDLE, &sh->state); 6056 6057 raid5_release_stripe(sh); 6058 6059 return STRIPE_SECTORS; 6060 } 6061 6062 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, 6063 unsigned int offset) 6064 { 6065 /* We may not be able to submit a whole bio at once as there 6066 * may not be enough stripe_heads available. 6067 * We cannot pre-allocate enough stripe_heads as we may need 6068 * more than exist in the cache (if we ever allow very large chunks). 6069 * So we do one stripe head at a time and record in 6070 * conf->retry_read_offset how many have been done. 6071 * 6072 * We *know* that this entire raid_bio is in one chunk, so 6073 * there is only one 'dd_idx' and we need only one call to raid5_compute_sector.
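 * For example (assuming hypothetical but typical values: STRIPE_SIZE of one 4KiB page, so STRIPE_SECTORS == 8, and a 512KiB chunk), a chunk-aligned read covering the whole chunk is processed as 128 stripe_heads, one STRIPE_SECTORS-sized piece per iteration; if the stripe cache runs dry after, say, 40 pieces, conf->retry_read_offset records 40 and the later retry from raid5d skips the pieces that were already handled.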
6074 */ 6075 struct stripe_head *sh; 6076 int dd_idx; 6077 sector_t sector, logical_sector, last_sector; 6078 int scnt = 0; 6079 int handled = 0; 6080 6081 logical_sector = raid_bio->bi_iter.bi_sector & 6082 ~((sector_t)STRIPE_SECTORS-1); 6083 sector = raid5_compute_sector(conf, logical_sector, 6084 0, &dd_idx, NULL); 6085 last_sector = bio_end_sector(raid_bio); 6086 6087 for (; logical_sector < last_sector; 6088 logical_sector += STRIPE_SECTORS, 6089 sector += STRIPE_SECTORS, 6090 scnt++) { 6091 6092 if (scnt < offset) 6093 /* already done this stripe */ 6094 continue; 6095 6096 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6097 6098 if (!sh) { 6099 /* failed to get a stripe - must wait */ 6100 conf->retry_read_aligned = raid_bio; 6101 conf->retry_read_offset = scnt; 6102 return handled; 6103 } 6104 6105 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6106 raid5_release_stripe(sh); 6107 conf->retry_read_aligned = raid_bio; 6108 conf->retry_read_offset = scnt; 6109 return handled; 6110 } 6111 6112 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6113 handle_stripe(sh); 6114 raid5_release_stripe(sh); 6115 handled++; 6116 } 6117 6118 bio_endio(raid_bio); 6119 6120 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6121 wake_up(&conf->wait_for_quiescent); 6122 return handled; 6123 } 6124 6125 static int handle_active_stripes(struct r5conf *conf, int group, 6126 struct r5worker *worker, 6127 struct list_head *temp_inactive_list) 6128 { 6129 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6130 int i, batch_size = 0, hash; 6131 bool release_inactive = false; 6132 6133 while (batch_size < MAX_STRIPE_BATCH && 6134 (sh = __get_priority_stripe(conf, group)) != NULL) 6135 batch[batch_size++] = sh; 6136 6137 if (batch_size == 0) { 6138 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6139 if (!list_empty(temp_inactive_list + i)) 6140 break; 6141 if (i == NR_STRIPE_HASH_LOCKS) { 6142 spin_unlock_irq(&conf->device_lock); 6143 r5l_flush_stripe_to_raid(conf->log); 6144 spin_lock_irq(&conf->device_lock); 6145 return batch_size; 6146 } 6147 release_inactive = true; 6148 } 6149 spin_unlock_irq(&conf->device_lock); 6150 6151 release_inactive_stripe_list(conf, temp_inactive_list, 6152 NR_STRIPE_HASH_LOCKS); 6153 6154 r5l_flush_stripe_to_raid(conf->log); 6155 if (release_inactive) { 6156 spin_lock_irq(&conf->device_lock); 6157 return 0; 6158 } 6159 6160 for (i = 0; i < batch_size; i++) 6161 handle_stripe(batch[i]); 6162 log_write_stripe_run(conf); 6163 6164 cond_resched(); 6165 6166 spin_lock_irq(&conf->device_lock); 6167 for (i = 0; i < batch_size; i++) { 6168 hash = batch[i]->hash_lock_index; 6169 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6170 } 6171 return batch_size; 6172 } 6173 6174 static void raid5_do_work(struct work_struct *work) 6175 { 6176 struct r5worker *worker = container_of(work, struct r5worker, work); 6177 struct r5worker_group *group = worker->group; 6178 struct r5conf *conf = group->conf; 6179 struct mddev *mddev = conf->mddev; 6180 int group_id = group - conf->worker_groups; 6181 int handled; 6182 struct blk_plug plug; 6183 6184 pr_debug("+++ raid5worker active\n"); 6185 6186 blk_start_plug(&plug); 6187 handled = 0; 6188 spin_lock_irq(&conf->device_lock); 6189 while (1) { 6190 int batch_size, released; 6191 6192 released = release_stripe_list(conf, worker->temp_inactive_list); 6193 6194 batch_size = handle_active_stripes(conf, group_id, worker, 6195 worker->temp_inactive_list); 6196 worker->working = false; 6197 if (!batch_size && !released) 6198 break; 6199 
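/* fold this batch into the running total, then wait for any pending superblock write (MD_SB_CHANGE_PENDING) to complete before handling more stripes */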
handled += batch_size; 6200 wait_event_lock_irq(mddev->sb_wait, 6201 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6202 conf->device_lock); 6203 } 6204 pr_debug("%d stripes handled\n", handled); 6205 6206 spin_unlock_irq(&conf->device_lock); 6207 blk_finish_plug(&plug); 6208 6209 pr_debug("--- raid5worker inactive\n"); 6210 } 6211 6212 /* 6213 * This is our raid5 kernel thread. 6214 * 6215 * We scan the hash table for stripes which can be handled now. 6216 * During the scan, completed stripes are saved for us by the interrupt 6217 * handler, so that they will not have to wait for our next wakeup. 6218 */ 6219 static void raid5d(struct md_thread *thread) 6220 { 6221 struct mddev *mddev = thread->mddev; 6222 struct r5conf *conf = mddev->private; 6223 int handled; 6224 struct blk_plug plug; 6225 6226 pr_debug("+++ raid5d active\n"); 6227 6228 md_check_recovery(mddev); 6229 6230 blk_start_plug(&plug); 6231 handled = 0; 6232 spin_lock_irq(&conf->device_lock); 6233 while (1) { 6234 struct bio *bio; 6235 int batch_size, released; 6236 unsigned int offset; 6237 6238 released = release_stripe_list(conf, conf->temp_inactive_list); 6239 if (released) 6240 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6241 6242 if ( 6243 !list_empty(&conf->bitmap_list)) { 6244 /* Now is a good time to flush some bitmap updates */ 6245 conf->seq_flush++; 6246 spin_unlock_irq(&conf->device_lock); 6247 bitmap_unplug(mddev->bitmap); 6248 spin_lock_irq(&conf->device_lock); 6249 conf->seq_write = conf->seq_flush; 6250 activate_bit_delay(conf, conf->temp_inactive_list); 6251 } 6252 raid5_activate_delayed(conf); 6253 6254 while ((bio = remove_bio_from_retry(conf, &offset))) { 6255 int ok; 6256 spin_unlock_irq(&conf->device_lock); 6257 ok = retry_aligned_read(conf, bio, offset); 6258 spin_lock_irq(&conf->device_lock); 6259 if (!ok) 6260 break; 6261 handled++; 6262 } 6263 6264 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6265 conf->temp_inactive_list); 6266 if (!batch_size && !released) 6267 break; 6268 handled += batch_size; 6269 6270 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6271 spin_unlock_irq(&conf->device_lock); 6272 md_check_recovery(mddev); 6273 spin_lock_irq(&conf->device_lock); 6274 } 6275 } 6276 pr_debug("%d stripes handled\n", handled); 6277 6278 spin_unlock_irq(&conf->device_lock); 6279 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6280 mutex_trylock(&conf->cache_size_mutex)) { 6281 grow_one_stripe(conf, __GFP_NOWARN); 6282 /* Set flag even if allocation failed. 
This helps 6283 * slow down allocation requests when mem is short 6284 */ 6285 set_bit(R5_DID_ALLOC, &conf->cache_state); 6286 mutex_unlock(&conf->cache_size_mutex); 6287 } 6288 6289 flush_deferred_bios(conf); 6290 6291 r5l_flush_stripe_to_raid(conf->log); 6292 6293 async_tx_issue_pending_all(); 6294 blk_finish_plug(&plug); 6295 6296 pr_debug("--- raid5d inactive\n"); 6297 } 6298 6299 static ssize_t 6300 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6301 { 6302 struct r5conf *conf; 6303 int ret = 0; 6304 spin_lock(&mddev->lock); 6305 conf = mddev->private; 6306 if (conf) 6307 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6308 spin_unlock(&mddev->lock); 6309 return ret; 6310 } 6311 6312 int 6313 raid5_set_cache_size(struct mddev *mddev, int size) 6314 { 6315 struct r5conf *conf = mddev->private; 6316 int err; 6317 6318 if (size <= 16 || size > 32768) 6319 return -EINVAL; 6320 6321 conf->min_nr_stripes = size; 6322 mutex_lock(&conf->cache_size_mutex); 6323 while (size < conf->max_nr_stripes && 6324 drop_one_stripe(conf)) 6325 ; 6326 mutex_unlock(&conf->cache_size_mutex); 6327 6328 6329 err = md_allow_write(mddev); 6330 if (err) 6331 return err; 6332 6333 mutex_lock(&conf->cache_size_mutex); 6334 while (size > conf->max_nr_stripes) 6335 if (!grow_one_stripe(conf, GFP_KERNEL)) 6336 break; 6337 mutex_unlock(&conf->cache_size_mutex); 6338 6339 return 0; 6340 } 6341 EXPORT_SYMBOL(raid5_set_cache_size); 6342 6343 static ssize_t 6344 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6345 { 6346 struct r5conf *conf; 6347 unsigned long new; 6348 int err; 6349 6350 if (len >= PAGE_SIZE) 6351 return -EINVAL; 6352 if (kstrtoul(page, 10, &new)) 6353 return -EINVAL; 6354 err = mddev_lock(mddev); 6355 if (err) 6356 return err; 6357 conf = mddev->private; 6358 if (!conf) 6359 err = -ENODEV; 6360 else 6361 err = raid5_set_cache_size(mddev, new); 6362 mddev_unlock(mddev); 6363 6364 return err ?: len; 6365 } 6366 6367 static struct md_sysfs_entry 6368 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6369 raid5_show_stripe_cache_size, 6370 raid5_store_stripe_cache_size); 6371 6372 static ssize_t 6373 raid5_show_rmw_level(struct mddev *mddev, char *page) 6374 { 6375 struct r5conf *conf = mddev->private; 6376 if (conf) 6377 return sprintf(page, "%d\n", conf->rmw_level); 6378 else 6379 return 0; 6380 } 6381 6382 static ssize_t 6383 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6384 { 6385 struct r5conf *conf = mddev->private; 6386 unsigned long new; 6387 6388 if (!conf) 6389 return -ENODEV; 6390 6391 if (len >= PAGE_SIZE) 6392 return -EINVAL; 6393 6394 if (kstrtoul(page, 10, &new)) 6395 return -EINVAL; 6396 6397 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6398 return -EINVAL; 6399 6400 if (new != PARITY_DISABLE_RMW && 6401 new != PARITY_ENABLE_RMW && 6402 new != PARITY_PREFER_RMW) 6403 return -EINVAL; 6404 6405 conf->rmw_level = new; 6406 return len; 6407 } 6408 6409 static struct md_sysfs_entry 6410 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6411 raid5_show_rmw_level, 6412 raid5_store_rmw_level); 6413 6414 6415 static ssize_t 6416 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6417 { 6418 struct r5conf *conf; 6419 int ret = 0; 6420 spin_lock(&mddev->lock); 6421 conf = mddev->private; 6422 if (conf) 6423 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6424 spin_unlock(&mddev->lock); 6425 return ret; 6426 } 6427 6428 static ssize_t 6429 
raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6430 { 6431 struct r5conf *conf; 6432 unsigned long new; 6433 int err; 6434 6435 if (len >= PAGE_SIZE) 6436 return -EINVAL; 6437 if (kstrtoul(page, 10, &new)) 6438 return -EINVAL; 6439 6440 err = mddev_lock(mddev); 6441 if (err) 6442 return err; 6443 conf = mddev->private; 6444 if (!conf) 6445 err = -ENODEV; 6446 else if (new > conf->min_nr_stripes) 6447 err = -EINVAL; 6448 else 6449 conf->bypass_threshold = new; 6450 mddev_unlock(mddev); 6451 return err ?: len; 6452 } 6453 6454 static struct md_sysfs_entry 6455 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6456 S_IRUGO | S_IWUSR, 6457 raid5_show_preread_threshold, 6458 raid5_store_preread_threshold); 6459 6460 static ssize_t 6461 raid5_show_skip_copy(struct mddev *mddev, char *page) 6462 { 6463 struct r5conf *conf; 6464 int ret = 0; 6465 spin_lock(&mddev->lock); 6466 conf = mddev->private; 6467 if (conf) 6468 ret = sprintf(page, "%d\n", conf->skip_copy); 6469 spin_unlock(&mddev->lock); 6470 return ret; 6471 } 6472 6473 static ssize_t 6474 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6475 { 6476 struct r5conf *conf; 6477 unsigned long new; 6478 int err; 6479 6480 if (len >= PAGE_SIZE) 6481 return -EINVAL; 6482 if (kstrtoul(page, 10, &new)) 6483 return -EINVAL; 6484 new = !!new; 6485 6486 err = mddev_lock(mddev); 6487 if (err) 6488 return err; 6489 conf = mddev->private; 6490 if (!conf) 6491 err = -ENODEV; 6492 else if (new != conf->skip_copy) { 6493 mddev_suspend(mddev); 6494 conf->skip_copy = new; 6495 if (new) 6496 mddev->queue->backing_dev_info->capabilities |= 6497 BDI_CAP_STABLE_WRITES; 6498 else 6499 mddev->queue->backing_dev_info->capabilities &= 6500 ~BDI_CAP_STABLE_WRITES; 6501 mddev_resume(mddev); 6502 } 6503 mddev_unlock(mddev); 6504 return err ?: len; 6505 } 6506 6507 static struct md_sysfs_entry 6508 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6509 raid5_show_skip_copy, 6510 raid5_store_skip_copy); 6511 6512 static ssize_t 6513 stripe_cache_active_show(struct mddev *mddev, char *page) 6514 { 6515 struct r5conf *conf = mddev->private; 6516 if (conf) 6517 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6518 else 6519 return 0; 6520 } 6521 6522 static struct md_sysfs_entry 6523 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6524 6525 static ssize_t 6526 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6527 { 6528 struct r5conf *conf; 6529 int ret = 0; 6530 spin_lock(&mddev->lock); 6531 conf = mddev->private; 6532 if (conf) 6533 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6534 spin_unlock(&mddev->lock); 6535 return ret; 6536 } 6537 6538 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6539 int *group_cnt, 6540 int *worker_cnt_per_group, 6541 struct r5worker_group **worker_groups); 6542 static ssize_t 6543 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6544 { 6545 struct r5conf *conf; 6546 unsigned long new; 6547 int err; 6548 struct r5worker_group *new_groups, *old_groups; 6549 int group_cnt, worker_cnt_per_group; 6550 6551 if (len >= PAGE_SIZE) 6552 return -EINVAL; 6553 if (kstrtoul(page, 10, &new)) 6554 return -EINVAL; 6555 6556 err = mddev_lock(mddev); 6557 if (err) 6558 return err; 6559 conf = mddev->private; 6560 if (!conf) 6561 err = -ENODEV; 6562 else if (new != conf->worker_cnt_per_group) { 6563 mddev_suspend(mddev); 6564 6565 old_groups = conf->worker_groups; 6566 if (old_groups) 6567 
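/* drain any work still running on the old groups before they are swapped out and freed below */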
flush_workqueue(raid5_wq); 6568 6569 err = alloc_thread_groups(conf, new, 6570 &group_cnt, &worker_cnt_per_group, 6571 &new_groups); 6572 if (!err) { 6573 spin_lock_irq(&conf->device_lock); 6574 conf->group_cnt = group_cnt; 6575 conf->worker_cnt_per_group = worker_cnt_per_group; 6576 conf->worker_groups = new_groups; 6577 spin_unlock_irq(&conf->device_lock); 6578 6579 if (old_groups) 6580 kfree(old_groups[0].workers); 6581 kfree(old_groups); 6582 } 6583 mddev_resume(mddev); 6584 } 6585 mddev_unlock(mddev); 6586 6587 return err ?: len; 6588 } 6589 6590 static struct md_sysfs_entry 6591 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6592 raid5_show_group_thread_cnt, 6593 raid5_store_group_thread_cnt); 6594 6595 static struct attribute *raid5_attrs[] = { 6596 &raid5_stripecache_size.attr, 6597 &raid5_stripecache_active.attr, 6598 &raid5_preread_bypass_threshold.attr, 6599 &raid5_group_thread_cnt.attr, 6600 &raid5_skip_copy.attr, 6601 &raid5_rmw_level.attr, 6602 &r5c_journal_mode.attr, 6603 NULL, 6604 }; 6605 static struct attribute_group raid5_attrs_group = { 6606 .name = NULL, 6607 .attrs = raid5_attrs, 6608 }; 6609 6610 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6611 int *group_cnt, 6612 int *worker_cnt_per_group, 6613 struct r5worker_group **worker_groups) 6614 { 6615 int i, j, k; 6616 ssize_t size; 6617 struct r5worker *workers; 6618 6619 *worker_cnt_per_group = cnt; 6620 if (cnt == 0) { 6621 *group_cnt = 0; 6622 *worker_groups = NULL; 6623 return 0; 6624 } 6625 *group_cnt = num_possible_nodes(); 6626 size = sizeof(struct r5worker) * cnt; 6627 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6628 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6629 *group_cnt, GFP_NOIO); 6630 if (!*worker_groups || !workers) { 6631 kfree(workers); 6632 kfree(*worker_groups); 6633 return -ENOMEM; 6634 } 6635 6636 for (i = 0; i < *group_cnt; i++) { 6637 struct r5worker_group *group; 6638 6639 group = &(*worker_groups)[i]; 6640 INIT_LIST_HEAD(&group->handle_list); 6641 INIT_LIST_HEAD(&group->loprio_list); 6642 group->conf = conf; 6643 group->workers = workers + i * cnt; 6644 6645 for (j = 0; j < cnt; j++) { 6646 struct r5worker *worker = group->workers + j; 6647 worker->group = group; 6648 INIT_WORK(&worker->work, raid5_do_work); 6649 6650 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6651 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6652 } 6653 } 6654 6655 return 0; 6656 } 6657 6658 static void free_thread_groups(struct r5conf *conf) 6659 { 6660 if (conf->worker_groups) 6661 kfree(conf->worker_groups[0].workers); 6662 kfree(conf->worker_groups); 6663 conf->worker_groups = NULL; 6664 } 6665 6666 static sector_t 6667 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6668 { 6669 struct r5conf *conf = mddev->private; 6670 6671 if (!sectors) 6672 sectors = mddev->dev_sectors; 6673 if (!raid_disks) 6674 /* size is defined by the smallest of previous and new size */ 6675 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6676 6677 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6678 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6679 return sectors * (raid_disks - conf->max_degraded); 6680 } 6681 6682 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6683 { 6684 safe_put_page(percpu->spare_page); 6685 if (percpu->scribble) 6686 flex_array_free(percpu->scribble); 6687 percpu->spare_page = NULL; 6688 percpu->scribble = NULL; 6689 } 6690 6691 static int alloc_scratch_buffer(struct r5conf *conf, struct 
raid5_percpu *percpu) 6692 { 6693 if (conf->level == 6 && !percpu->spare_page) 6694 percpu->spare_page = alloc_page(GFP_KERNEL); 6695 if (!percpu->scribble) 6696 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6697 conf->previous_raid_disks), 6698 max(conf->chunk_sectors, 6699 conf->prev_chunk_sectors) 6700 / STRIPE_SECTORS, 6701 GFP_KERNEL); 6702 6703 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6704 free_scratch_buffer(conf, percpu); 6705 return -ENOMEM; 6706 } 6707 6708 return 0; 6709 } 6710 6711 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6712 { 6713 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6714 6715 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6716 return 0; 6717 } 6718 6719 static void raid5_free_percpu(struct r5conf *conf) 6720 { 6721 if (!conf->percpu) 6722 return; 6723 6724 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6725 free_percpu(conf->percpu); 6726 } 6727 6728 static void free_conf(struct r5conf *conf) 6729 { 6730 int i; 6731 6732 log_exit(conf); 6733 6734 if (conf->shrinker.nr_deferred) 6735 unregister_shrinker(&conf->shrinker); 6736 6737 free_thread_groups(conf); 6738 shrink_stripes(conf); 6739 raid5_free_percpu(conf); 6740 for (i = 0; i < conf->pool_size; i++) 6741 if (conf->disks[i].extra_page) 6742 put_page(conf->disks[i].extra_page); 6743 kfree(conf->disks); 6744 if (conf->bio_split) 6745 bioset_free(conf->bio_split); 6746 kfree(conf->stripe_hashtbl); 6747 kfree(conf->pending_data); 6748 kfree(conf); 6749 } 6750 6751 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6752 { 6753 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6754 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6755 6756 if (alloc_scratch_buffer(conf, percpu)) { 6757 pr_warn("%s: failed memory allocation for cpu%u\n", 6758 __func__, cpu); 6759 return -ENOMEM; 6760 } 6761 return 0; 6762 } 6763 6764 static int raid5_alloc_percpu(struct r5conf *conf) 6765 { 6766 int err = 0; 6767 6768 conf->percpu = alloc_percpu(struct raid5_percpu); 6769 if (!conf->percpu) 6770 return -ENOMEM; 6771 6772 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6773 if (!err) { 6774 conf->scribble_disks = max(conf->raid_disks, 6775 conf->previous_raid_disks); 6776 conf->scribble_sectors = max(conf->chunk_sectors, 6777 conf->prev_chunk_sectors); 6778 } 6779 return err; 6780 } 6781 6782 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6783 struct shrink_control *sc) 6784 { 6785 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6786 unsigned long ret = SHRINK_STOP; 6787 6788 if (mutex_trylock(&conf->cache_size_mutex)) { 6789 ret= 0; 6790 while (ret < sc->nr_to_scan && 6791 conf->max_nr_stripes > conf->min_nr_stripes) { 6792 if (drop_one_stripe(conf) == 0) { 6793 ret = SHRINK_STOP; 6794 break; 6795 } 6796 ret++; 6797 } 6798 mutex_unlock(&conf->cache_size_mutex); 6799 } 6800 return ret; 6801 } 6802 6803 static unsigned long raid5_cache_count(struct shrinker *shrink, 6804 struct shrink_control *sc) 6805 { 6806 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6807 6808 if (conf->max_nr_stripes < conf->min_nr_stripes) 6809 /* unlikely, but not impossible */ 6810 return 0; 6811 return conf->max_nr_stripes - conf->min_nr_stripes; 6812 } 6813 6814 static struct r5conf *setup_conf(struct mddev *mddev) 6815 { 6816 struct r5conf *conf; 6817 int raid_disk, memory, max_disks; 6818 struct md_rdev 
*rdev; 6819 struct disk_info *disk; 6820 char pers_name[6]; 6821 int i; 6822 int group_cnt, worker_cnt_per_group; 6823 struct r5worker_group *new_group; 6824 6825 if (mddev->new_level != 5 6826 && mddev->new_level != 4 6827 && mddev->new_level != 6) { 6828 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6829 mdname(mddev), mddev->new_level); 6830 return ERR_PTR(-EIO); 6831 } 6832 if ((mddev->new_level == 5 6833 && !algorithm_valid_raid5(mddev->new_layout)) || 6834 (mddev->new_level == 6 6835 && !algorithm_valid_raid6(mddev->new_layout))) { 6836 pr_warn("md/raid:%s: layout %d not supported\n", 6837 mdname(mddev), mddev->new_layout); 6838 return ERR_PTR(-EIO); 6839 } 6840 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6841 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6842 mdname(mddev), mddev->raid_disks); 6843 return ERR_PTR(-EINVAL); 6844 } 6845 6846 if (!mddev->new_chunk_sectors || 6847 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6848 !is_power_of_2(mddev->new_chunk_sectors)) { 6849 pr_warn("md/raid:%s: invalid chunk size %d\n", 6850 mdname(mddev), mddev->new_chunk_sectors << 9); 6851 return ERR_PTR(-EINVAL); 6852 } 6853 6854 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6855 if (conf == NULL) 6856 goto abort; 6857 INIT_LIST_HEAD(&conf->free_list); 6858 INIT_LIST_HEAD(&conf->pending_list); 6859 conf->pending_data = kzalloc(sizeof(struct r5pending_data) * 6860 PENDING_IO_MAX, GFP_KERNEL); 6861 if (!conf->pending_data) 6862 goto abort; 6863 for (i = 0; i < PENDING_IO_MAX; i++) 6864 list_add(&conf->pending_data[i].sibling, &conf->free_list); 6865 /* Don't enable multi-threading by default*/ 6866 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6867 &new_group)) { 6868 conf->group_cnt = group_cnt; 6869 conf->worker_cnt_per_group = worker_cnt_per_group; 6870 conf->worker_groups = new_group; 6871 } else 6872 goto abort; 6873 spin_lock_init(&conf->device_lock); 6874 seqcount_init(&conf->gen_lock); 6875 mutex_init(&conf->cache_size_mutex); 6876 init_waitqueue_head(&conf->wait_for_quiescent); 6877 init_waitqueue_head(&conf->wait_for_stripe); 6878 init_waitqueue_head(&conf->wait_for_overlap); 6879 INIT_LIST_HEAD(&conf->handle_list); 6880 INIT_LIST_HEAD(&conf->loprio_list); 6881 INIT_LIST_HEAD(&conf->hold_list); 6882 INIT_LIST_HEAD(&conf->delayed_list); 6883 INIT_LIST_HEAD(&conf->bitmap_list); 6884 init_llist_head(&conf->released_stripes); 6885 atomic_set(&conf->active_stripes, 0); 6886 atomic_set(&conf->preread_active_stripes, 0); 6887 atomic_set(&conf->active_aligned_reads, 0); 6888 spin_lock_init(&conf->pending_bios_lock); 6889 conf->batch_bio_dispatch = true; 6890 rdev_for_each(rdev, mddev) { 6891 if (test_bit(Journal, &rdev->flags)) 6892 continue; 6893 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6894 conf->batch_bio_dispatch = false; 6895 break; 6896 } 6897 } 6898 6899 conf->bypass_threshold = BYPASS_THRESHOLD; 6900 conf->recovery_disabled = mddev->recovery_disabled - 1; 6901 6902 conf->raid_disks = mddev->raid_disks; 6903 if (mddev->reshape_position == MaxSector) 6904 conf->previous_raid_disks = mddev->raid_disks; 6905 else 6906 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6907 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6908 6909 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6910 GFP_KERNEL); 6911 6912 if (!conf->disks) 6913 goto abort; 6914 6915 for (i = 0; i < max_disks; i++) { 6916 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6917 if 
(!conf->disks[i].extra_page) 6918 goto abort; 6919 } 6920 6921 conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); 6922 if (!conf->bio_split) 6923 goto abort; 6924 conf->mddev = mddev; 6925 6926 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6927 goto abort; 6928 6929 /* We init hash_locks[0] separately to that it can be used 6930 * as the reference lock in the spin_lock_nest_lock() call 6931 * in lock_all_device_hash_locks_irq in order to convince 6932 * lockdep that we know what we are doing. 6933 */ 6934 spin_lock_init(conf->hash_locks); 6935 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6936 spin_lock_init(conf->hash_locks + i); 6937 6938 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6939 INIT_LIST_HEAD(conf->inactive_list + i); 6940 6941 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6942 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6943 6944 atomic_set(&conf->r5c_cached_full_stripes, 0); 6945 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6946 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6947 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6948 atomic_set(&conf->r5c_flushing_full_stripes, 0); 6949 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 6950 6951 conf->level = mddev->new_level; 6952 conf->chunk_sectors = mddev->new_chunk_sectors; 6953 if (raid5_alloc_percpu(conf) != 0) 6954 goto abort; 6955 6956 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6957 6958 rdev_for_each(rdev, mddev) { 6959 raid_disk = rdev->raid_disk; 6960 if (raid_disk >= max_disks 6961 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 6962 continue; 6963 disk = conf->disks + raid_disk; 6964 6965 if (test_bit(Replacement, &rdev->flags)) { 6966 if (disk->replacement) 6967 goto abort; 6968 disk->replacement = rdev; 6969 } else { 6970 if (disk->rdev) 6971 goto abort; 6972 disk->rdev = rdev; 6973 } 6974 6975 if (test_bit(In_sync, &rdev->flags)) { 6976 char b[BDEVNAME_SIZE]; 6977 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 6978 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 6979 } else if (rdev->saved_raid_disk != raid_disk) 6980 /* Cannot rely on bitmap to complete recovery */ 6981 conf->fullsync = 1; 6982 } 6983 6984 conf->level = mddev->new_level; 6985 if (conf->level == 6) { 6986 conf->max_degraded = 2; 6987 if (raid6_call.xor_syndrome) 6988 conf->rmw_level = PARITY_ENABLE_RMW; 6989 else 6990 conf->rmw_level = PARITY_DISABLE_RMW; 6991 } else { 6992 conf->max_degraded = 1; 6993 conf->rmw_level = PARITY_ENABLE_RMW; 6994 } 6995 conf->algorithm = mddev->new_layout; 6996 conf->reshape_progress = mddev->reshape_position; 6997 if (conf->reshape_progress != MaxSector) { 6998 conf->prev_chunk_sectors = mddev->chunk_sectors; 6999 conf->prev_algo = mddev->layout; 7000 } else { 7001 conf->prev_chunk_sectors = conf->chunk_sectors; 7002 conf->prev_algo = conf->algorithm; 7003 } 7004 7005 conf->min_nr_stripes = NR_STRIPES; 7006 if (mddev->reshape_position != MaxSector) { 7007 int stripes = max_t(int, 7008 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 7009 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 7010 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7011 if (conf->min_nr_stripes != NR_STRIPES) 7012 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7013 mdname(mddev), conf->min_nr_stripes); 7014 } 7015 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7016 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7017 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7018 if (grow_stripes(conf, conf->min_nr_stripes)) { 7019 
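/* could not allocate even the minimum number of stripes; 'memory' is only the rough kB estimate used for this message */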
pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7020 mdname(mddev), memory); 7021 goto abort; 7022 } else 7023 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7024 /* 7025 * Losing a stripe head costs more than the time to refill it, 7026 * it reduces the queue depth and so can hurt throughput. 7027 * So set it rather large, scaled by number of devices. 7028 */ 7029 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7030 conf->shrinker.scan_objects = raid5_cache_scan; 7031 conf->shrinker.count_objects = raid5_cache_count; 7032 conf->shrinker.batch = 128; 7033 conf->shrinker.flags = 0; 7034 if (register_shrinker(&conf->shrinker)) { 7035 pr_warn("md/raid:%s: couldn't register shrinker.\n", 7036 mdname(mddev)); 7037 goto abort; 7038 } 7039 7040 sprintf(pers_name, "raid%d", mddev->new_level); 7041 conf->thread = md_register_thread(raid5d, mddev, pers_name); 7042 if (!conf->thread) { 7043 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7044 mdname(mddev)); 7045 goto abort; 7046 } 7047 7048 return conf; 7049 7050 abort: 7051 if (conf) { 7052 free_conf(conf); 7053 return ERR_PTR(-EIO); 7054 } else 7055 return ERR_PTR(-ENOMEM); 7056 } 7057 7058 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7059 { 7060 switch (algo) { 7061 case ALGORITHM_PARITY_0: 7062 if (raid_disk < max_degraded) 7063 return 1; 7064 break; 7065 case ALGORITHM_PARITY_N: 7066 if (raid_disk >= raid_disks - max_degraded) 7067 return 1; 7068 break; 7069 case ALGORITHM_PARITY_0_6: 7070 if (raid_disk == 0 || 7071 raid_disk == raid_disks - 1) 7072 return 1; 7073 break; 7074 case ALGORITHM_LEFT_ASYMMETRIC_6: 7075 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7076 case ALGORITHM_LEFT_SYMMETRIC_6: 7077 case ALGORITHM_RIGHT_SYMMETRIC_6: 7078 if (raid_disk == raid_disks - 1) 7079 return 1; 7080 } 7081 return 0; 7082 } 7083 7084 static int raid5_run(struct mddev *mddev) 7085 { 7086 struct r5conf *conf; 7087 int working_disks = 0; 7088 int dirty_parity_disks = 0; 7089 struct md_rdev *rdev; 7090 struct md_rdev *journal_dev = NULL; 7091 sector_t reshape_offset = 0; 7092 int i; 7093 long long min_offset_diff = 0; 7094 int first = 1; 7095 7096 if (mddev->recovery_cp != MaxSector) 7097 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7098 mdname(mddev)); 7099 7100 rdev_for_each(rdev, mddev) { 7101 long long diff; 7102 7103 if (test_bit(Journal, &rdev->flags)) { 7104 journal_dev = rdev; 7105 continue; 7106 } 7107 if (rdev->raid_disk < 0) 7108 continue; 7109 diff = (rdev->new_data_offset - rdev->data_offset); 7110 if (first) { 7111 min_offset_diff = diff; 7112 first = 0; 7113 } else if (mddev->reshape_backwards && 7114 diff < min_offset_diff) 7115 min_offset_diff = diff; 7116 else if (!mddev->reshape_backwards && 7117 diff > min_offset_diff) 7118 min_offset_diff = diff; 7119 } 7120 7121 if (mddev->reshape_position != MaxSector) { 7122 /* Check that we can continue the reshape. 7123 * Difficulties arise if the stripe we would write to 7124 * next is at or after the stripe we would read from next. 7125 * For a reshape that changes the number of devices, this 7126 * is only possible for a very short time, and mdadm makes 7127 * sure that time appears to have past before assembling 7128 * the array. So we fail if that time hasn't passed. 7129 * For a reshape that keeps the number of devices the same 7130 * mdadm must be monitoring the reshape can keeping the 7131 * critical areas read-only and backed up. 
It will start 7132 * the array in read-only mode, so we check for that. 7133 */ 7134 sector_t here_new, here_old; 7135 int old_disks; 7136 int max_degraded = (mddev->level == 6 ? 2 : 1); 7137 int chunk_sectors; 7138 int new_data_disks; 7139 7140 if (journal_dev) { 7141 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7142 mdname(mddev)); 7143 return -EINVAL; 7144 } 7145 7146 if (mddev->new_level != mddev->level) { 7147 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7148 mdname(mddev)); 7149 return -EINVAL; 7150 } 7151 old_disks = mddev->raid_disks - mddev->delta_disks; 7152 /* reshape_position must be on a new-stripe boundary, and one 7153 * further up in new geometry must map after here in old 7154 * geometry. 7155 * If the chunk sizes are different, then as we perform reshape 7156 * in units of the largest of the two, reshape_position needs 7157 * be a multiple of the largest chunk size times new data disks. 7158 */ 7159 here_new = mddev->reshape_position; 7160 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7161 new_data_disks = mddev->raid_disks - max_degraded; 7162 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7163 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7164 mdname(mddev)); 7165 return -EINVAL; 7166 } 7167 reshape_offset = here_new * chunk_sectors; 7168 /* here_new is the stripe we will write to */ 7169 here_old = mddev->reshape_position; 7170 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7171 /* here_old is the first stripe that we might need to read 7172 * from */ 7173 if (mddev->delta_disks == 0) { 7174 /* We cannot be sure it is safe to start an in-place 7175 * reshape. It is only safe if user-space is monitoring 7176 * and taking constant backups. 7177 * mdadm always starts a situation like this in 7178 * readonly mode so it can take control before 7179 * allowing any writes. So just check for that. 7180 */ 7181 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7182 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7183 /* not really in-place - so OK */; 7184 else if (mddev->ro == 0) { 7185 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7186 mdname(mddev)); 7187 return -EINVAL; 7188 } 7189 } else if (mddev->reshape_backwards 7190 ? 
(here_new * chunk_sectors + min_offset_diff <= 7191 here_old * chunk_sectors) 7192 : (here_new * chunk_sectors >= 7193 here_old * chunk_sectors + (-min_offset_diff))) { 7194 /* Reading from the same stripe as writing to - bad */ 7195 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7196 mdname(mddev)); 7197 return -EINVAL; 7198 } 7199 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7200 /* OK, we should be able to continue; */ 7201 } else { 7202 BUG_ON(mddev->level != mddev->new_level); 7203 BUG_ON(mddev->layout != mddev->new_layout); 7204 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7205 BUG_ON(mddev->delta_disks != 0); 7206 } 7207 7208 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7209 test_bit(MD_HAS_PPL, &mddev->flags)) { 7210 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7211 mdname(mddev)); 7212 clear_bit(MD_HAS_PPL, &mddev->flags); 7213 } 7214 7215 if (mddev->private == NULL) 7216 conf = setup_conf(mddev); 7217 else 7218 conf = mddev->private; 7219 7220 if (IS_ERR(conf)) 7221 return PTR_ERR(conf); 7222 7223 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7224 if (!journal_dev) { 7225 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7226 mdname(mddev)); 7227 mddev->ro = 1; 7228 set_disk_ro(mddev->gendisk, 1); 7229 } else if (mddev->recovery_cp == MaxSector) 7230 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7231 } 7232 7233 conf->min_offset_diff = min_offset_diff; 7234 mddev->thread = conf->thread; 7235 conf->thread = NULL; 7236 mddev->private = conf; 7237 7238 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7239 i++) { 7240 rdev = conf->disks[i].rdev; 7241 if (!rdev && conf->disks[i].replacement) { 7242 /* The replacement is all we have yet */ 7243 rdev = conf->disks[i].replacement; 7244 conf->disks[i].replacement = NULL; 7245 clear_bit(Replacement, &rdev->flags); 7246 conf->disks[i].rdev = rdev; 7247 } 7248 if (!rdev) 7249 continue; 7250 if (conf->disks[i].replacement && 7251 conf->reshape_progress != MaxSector) { 7252 /* replacements and reshape simply do not mix. */ 7253 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7254 goto abort; 7255 } 7256 if (test_bit(In_sync, &rdev->flags)) { 7257 working_disks++; 7258 continue; 7259 } 7260 /* This disc is not fully in-sync. However if it 7261 * just stored parity (beyond the recovery_offset), 7262 * when we don't need to be concerned about the 7263 * array being dirty. 7264 * When reshape goes 'backwards', we never have 7265 * partially completed devices, so we only need 7266 * to worry about reshape going forwards. 7267 */ 7268 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7269 if (mddev->major_version == 0 && 7270 mddev->minor_version > 90) 7271 rdev->recovery_offset = reshape_offset; 7272 7273 if (rdev->recovery_offset < reshape_offset) { 7274 /* We need to check old and new layout */ 7275 if (!only_parity(rdev->raid_disk, 7276 conf->algorithm, 7277 conf->raid_disks, 7278 conf->max_degraded)) 7279 continue; 7280 } 7281 if (!only_parity(rdev->raid_disk, 7282 conf->prev_algo, 7283 conf->previous_raid_disks, 7284 conf->max_degraded)) 7285 continue; 7286 dirty_parity_disks++; 7287 } 7288 7289 /* 7290 * 0 for a fully functional array, 1 or 2 for a degraded array. 
7291 */ 7292 mddev->degraded = raid5_calc_degraded(conf); 7293 7294 if (has_failed(conf)) { 7295 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7296 mdname(mddev), mddev->degraded, conf->raid_disks); 7297 goto abort; 7298 } 7299 7300 /* device size must be a multiple of chunk size */ 7301 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7302 mddev->resync_max_sectors = mddev->dev_sectors; 7303 7304 if (mddev->degraded > dirty_parity_disks && 7305 mddev->recovery_cp != MaxSector) { 7306 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7307 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7308 mdname(mddev)); 7309 else if (mddev->ok_start_degraded) 7310 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7311 mdname(mddev)); 7312 else { 7313 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7314 mdname(mddev)); 7315 goto abort; 7316 } 7317 } 7318 7319 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7320 mdname(mddev), conf->level, 7321 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7322 mddev->new_layout); 7323 7324 print_raid5_conf(conf); 7325 7326 if (conf->reshape_progress != MaxSector) { 7327 conf->reshape_safe = conf->reshape_progress; 7328 atomic_set(&conf->reshape_stripes, 0); 7329 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7330 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7331 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7332 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7333 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7334 "reshape"); 7335 } 7336 7337 /* Ok, everything is just fine now */ 7338 if (mddev->to_remove == &raid5_attrs_group) 7339 mddev->to_remove = NULL; 7340 else if (mddev->kobj.sd && 7341 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7342 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7343 mdname(mddev)); 7344 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7345 7346 if (mddev->queue) { 7347 int chunk_size; 7348 /* read-ahead size must cover two whole stripes, which 7349 * is 2 * (datadisks) * chunksize where 'n' is the 7350 * number of raid devices 7351 */ 7352 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7353 int stripe = data_disks * 7354 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7355 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7356 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7357 7358 chunk_size = mddev->chunk_sectors << 9; 7359 blk_queue_io_min(mddev->queue, chunk_size); 7360 blk_queue_io_opt(mddev->queue, chunk_size * 7361 (conf->raid_disks - conf->max_degraded)); 7362 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7363 /* 7364 * We can only discard a whole stripe. 
It doesn't make sense to 7365 * discard data disk but write parity disk 7366 */ 7367 stripe = stripe * PAGE_SIZE; 7368 /* Round up to power of 2, as discard handling 7369 * currently assumes that */ 7370 while ((stripe-1) & stripe) 7371 stripe = (stripe | (stripe-1)) + 1; 7372 mddev->queue->limits.discard_alignment = stripe; 7373 mddev->queue->limits.discard_granularity = stripe; 7374 7375 blk_queue_max_write_same_sectors(mddev->queue, 0); 7376 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 7377 7378 rdev_for_each(rdev, mddev) { 7379 disk_stack_limits(mddev->gendisk, rdev->bdev, 7380 rdev->data_offset << 9); 7381 disk_stack_limits(mddev->gendisk, rdev->bdev, 7382 rdev->new_data_offset << 9); 7383 } 7384 7385 /* 7386 * zeroing is required, otherwise data 7387 * could be lost. Consider a scenario: discard a stripe 7388 * (the stripe could be inconsistent if 7389 * discard_zeroes_data is 0); write one disk of the 7390 * stripe (the stripe could be inconsistent again 7391 * depending on which disks are used to calculate 7392 * parity); the disk is broken; The stripe data of this 7393 * disk is lost. 7394 * 7395 * We only allow DISCARD if the sysadmin has confirmed that 7396 * only safe devices are in use by setting a module parameter. 7397 * A better idea might be to turn DISCARD into WRITE_ZEROES 7398 * requests, as that is required to be safe. 7399 */ 7400 if (devices_handle_discard_safely && 7401 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7402 mddev->queue->limits.discard_granularity >= stripe) 7403 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7404 mddev->queue); 7405 else 7406 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7407 mddev->queue); 7408 7409 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7410 } 7411 7412 if (log_init(conf, journal_dev, raid5_has_ppl(conf))) 7413 goto abort; 7414 7415 return 0; 7416 abort: 7417 md_unregister_thread(&mddev->thread); 7418 print_raid5_conf(conf); 7419 free_conf(conf); 7420 mddev->private = NULL; 7421 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7422 return -EIO; 7423 } 7424 7425 static void raid5_free(struct mddev *mddev, void *priv) 7426 { 7427 struct r5conf *conf = priv; 7428 7429 free_conf(conf); 7430 mddev->to_remove = &raid5_attrs_group; 7431 } 7432 7433 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7434 { 7435 struct r5conf *conf = mddev->private; 7436 int i; 7437 7438 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7439 conf->chunk_sectors / 2, mddev->layout); 7440 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7441 rcu_read_lock(); 7442 for (i = 0; i < conf->raid_disks; i++) { 7443 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7444 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); 7445 } 7446 rcu_read_unlock(); 7447 seq_printf (seq, "]"); 7448 } 7449 7450 static void print_raid5_conf (struct r5conf *conf) 7451 { 7452 int i; 7453 struct disk_info *tmp; 7454 7455 pr_debug("RAID conf printout:\n"); 7456 if (!conf) { 7457 pr_debug("(conf==NULL)\n"); 7458 return; 7459 } 7460 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7461 conf->raid_disks, 7462 conf->raid_disks - conf->mddev->degraded); 7463 7464 for (i = 0; i < conf->raid_disks; i++) { 7465 char b[BDEVNAME_SIZE]; 7466 tmp = conf->disks + i; 7467 if (tmp->rdev) 7468 pr_debug(" disk %d, o:%d, dev:%s\n", 7469 i, !test_bit(Faulty, &tmp->rdev->flags), 7470 bdevname(tmp->rdev->bdev, b)); 7471 } 7472 } 7473 7474 static int raid5_spare_active(struct mddev *mddev) 7475 { 7476 int i; 7477 struct r5conf *conf = mddev->private; 7478 struct disk_info *tmp; 7479 int count = 0; 7480 unsigned long flags; 7481 7482 for (i = 0; i < conf->raid_disks; i++) { 7483 tmp = conf->disks + i; 7484 if (tmp->replacement 7485 && tmp->replacement->recovery_offset == MaxSector 7486 && !test_bit(Faulty, &tmp->replacement->flags) 7487 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7488 /* Replacement has just become active. */ 7489 if (!tmp->rdev 7490 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7491 count++; 7492 if (tmp->rdev) { 7493 /* Replaced device not technically faulty, 7494 * but we need to be sure it gets removed 7495 * and never re-added. 7496 */ 7497 set_bit(Faulty, &tmp->rdev->flags); 7498 sysfs_notify_dirent_safe( 7499 tmp->rdev->sysfs_state); 7500 } 7501 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7502 } else if (tmp->rdev 7503 && tmp->rdev->recovery_offset == MaxSector 7504 && !test_bit(Faulty, &tmp->rdev->flags) 7505 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7506 count++; 7507 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7508 } 7509 } 7510 spin_lock_irqsave(&conf->device_lock, flags); 7511 mddev->degraded = raid5_calc_degraded(conf); 7512 spin_unlock_irqrestore(&conf->device_lock, flags); 7513 print_raid5_conf(conf); 7514 return count; 7515 } 7516 7517 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7518 { 7519 struct r5conf *conf = mddev->private; 7520 int err = 0; 7521 int number = rdev->raid_disk; 7522 struct md_rdev **rdevp; 7523 struct disk_info *p = conf->disks + number; 7524 7525 print_raid5_conf(conf); 7526 if (test_bit(Journal, &rdev->flags) && conf->log) { 7527 /* 7528 * we can't wait pending write here, as this is called in 7529 * raid5d, wait will deadlock. 7530 * neilb: there is no locking about new writes here, 7531 * so this cannot be safe. 7532 */ 7533 if (atomic_read(&conf->active_stripes)) { 7534 return -EBUSY; 7535 } 7536 log_exit(conf); 7537 return 0; 7538 } 7539 if (rdev == p->rdev) 7540 rdevp = &p->rdev; 7541 else if (rdev == p->replacement) 7542 rdevp = &p->replacement; 7543 else 7544 return 0; 7545 7546 if (number >= conf->raid_disks && 7547 conf->reshape_progress == MaxSector) 7548 clear_bit(In_sync, &rdev->flags); 7549 7550 if (test_bit(In_sync, &rdev->flags) || 7551 atomic_read(&rdev->nr_pending)) { 7552 err = -EBUSY; 7553 goto abort; 7554 } 7555 /* Only remove non-faulty devices if recovery 7556 * isn't possible. 
7557 */ 7558 if (!test_bit(Faulty, &rdev->flags) && 7559 mddev->recovery_disabled != conf->recovery_disabled && 7560 !has_failed(conf) && 7561 (!p->replacement || p->replacement == rdev) && 7562 number < conf->raid_disks) { 7563 err = -EBUSY; 7564 goto abort; 7565 } 7566 *rdevp = NULL; 7567 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7568 synchronize_rcu(); 7569 if (atomic_read(&rdev->nr_pending)) { 7570 /* lost the race, try later */ 7571 err = -EBUSY; 7572 *rdevp = rdev; 7573 } 7574 } 7575 if (!err) { 7576 err = log_modify(conf, rdev, false); 7577 if (err) 7578 goto abort; 7579 } 7580 if (p->replacement) { 7581 /* We must have just cleared 'rdev' */ 7582 p->rdev = p->replacement; 7583 clear_bit(Replacement, &p->replacement->flags); 7584 smp_mb(); /* Make sure other CPUs may see both as identical 7585 * but will never see neither - if they are careful 7586 */ 7587 p->replacement = NULL; 7588 7589 if (!err) 7590 err = log_modify(conf, p->rdev, true); 7591 } 7592 7593 clear_bit(WantReplacement, &rdev->flags); 7594 abort: 7595 7596 print_raid5_conf(conf); 7597 return err; 7598 } 7599 7600 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7601 { 7602 struct r5conf *conf = mddev->private; 7603 int err = -EEXIST; 7604 int disk; 7605 struct disk_info *p; 7606 int first = 0; 7607 int last = conf->raid_disks - 1; 7608 7609 if (test_bit(Journal, &rdev->flags)) { 7610 if (conf->log) 7611 return -EBUSY; 7612 7613 rdev->raid_disk = 0; 7614 /* 7615 * The array is in readonly mode if journal is missing, so no 7616 * write requests running. We should be safe 7617 */ 7618 log_init(conf, rdev, false); 7619 return 0; 7620 } 7621 if (mddev->recovery_disabled == conf->recovery_disabled) 7622 return -EBUSY; 7623 7624 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7625 /* no point adding a device */ 7626 return -EINVAL; 7627 7628 if (rdev->raid_disk >= 0) 7629 first = last = rdev->raid_disk; 7630 7631 /* 7632 * find the disk ... but prefer rdev->saved_raid_disk 7633 * if possible. 7634 */ 7635 if (rdev->saved_raid_disk >= 0 && 7636 rdev->saved_raid_disk >= first && 7637 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7638 first = rdev->saved_raid_disk; 7639 7640 for (disk = first; disk <= last; disk++) { 7641 p = conf->disks + disk; 7642 if (p->rdev == NULL) { 7643 clear_bit(In_sync, &rdev->flags); 7644 rdev->raid_disk = disk; 7645 if (rdev->saved_raid_disk != disk) 7646 conf->fullsync = 1; 7647 rcu_assign_pointer(p->rdev, rdev); 7648 7649 err = log_modify(conf, rdev, true); 7650 7651 goto out; 7652 } 7653 } 7654 for (disk = first; disk <= last; disk++) { 7655 p = conf->disks + disk; 7656 if (test_bit(WantReplacement, &p->rdev->flags) && 7657 p->replacement == NULL) { 7658 clear_bit(In_sync, &rdev->flags); 7659 set_bit(Replacement, &rdev->flags); 7660 rdev->raid_disk = disk; 7661 err = 0; 7662 conf->fullsync = 1; 7663 rcu_assign_pointer(p->replacement, rdev); 7664 break; 7665 } 7666 } 7667 out: 7668 print_raid5_conf(conf); 7669 return err; 7670 } 7671 7672 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7673 { 7674 /* no resync is happening, and there is enough space 7675 * on all devices, so we can resize. 7676 * We need to make sure resync covers any new space. 7677 * If the array is shrinking we should possibly wait until 7678 * any io in the removed space completes, but it hardly seems 7679 * worth it. 
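 * Note that 'sectors' is first rounded down to a whole number of
 * chunks and the bitmap, if any, is resized before the new size is
 * published with md_set_array_sectors().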
7680 */ 7681 sector_t newsize; 7682 struct r5conf *conf = mddev->private; 7683 7684 if (conf->log || raid5_has_ppl(conf)) 7685 return -EINVAL; 7686 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7687 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7688 if (mddev->external_size && 7689 mddev->array_sectors > newsize) 7690 return -EINVAL; 7691 if (mddev->bitmap) { 7692 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 7693 if (ret) 7694 return ret; 7695 } 7696 md_set_array_sectors(mddev, newsize); 7697 if (sectors > mddev->dev_sectors && 7698 mddev->recovery_cp > mddev->dev_sectors) { 7699 mddev->recovery_cp = mddev->dev_sectors; 7700 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7701 } 7702 mddev->dev_sectors = sectors; 7703 mddev->resync_max_sectors = sectors; 7704 return 0; 7705 } 7706 7707 static int check_stripe_cache(struct mddev *mddev) 7708 { 7709 /* Can only proceed if there are plenty of stripe_heads. 7710 * We need a minimum of one full stripe, and for sensible progress 7711 * it is best to have about 4 times that. 7712 * If we require 4 times, then the default 256 4K stripe_heads will 7713 * allow for chunk sizes up to 256K, which is probably OK. 7714 * If the chunk size is greater, user-space should request more 7715 * stripe_heads first. 7716 */ 7717 struct r5conf *conf = mddev->private; 7718 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7719 > conf->min_nr_stripes || 7720 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7721 > conf->min_nr_stripes) { 7722 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7723 mdname(mddev), 7724 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7725 / STRIPE_SIZE)*4); 7726 return 0; 7727 } 7728 return 1; 7729 } 7730 7731 static int check_reshape(struct mddev *mddev) 7732 { 7733 struct r5conf *conf = mddev->private; 7734 7735 if (conf->log || raid5_has_ppl(conf)) 7736 return -EINVAL; 7737 if (mddev->delta_disks == 0 && 7738 mddev->new_layout == mddev->layout && 7739 mddev->new_chunk_sectors == mddev->chunk_sectors) 7740 return 0; /* nothing to do */ 7741 if (has_failed(conf)) 7742 return -EINVAL; 7743 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7744 /* We might be able to shrink, but the devices must 7745 * be made bigger first. 7746 * For raid6, 4 is the minimum size.
7747 * Otherwise 2 is the minimum 7748 */ 7749 int min = 2; 7750 if (mddev->level == 6) 7751 min = 4; 7752 if (mddev->raid_disks + mddev->delta_disks < min) 7753 return -EINVAL; 7754 } 7755 7756 if (!check_stripe_cache(mddev)) 7757 return -ENOSPC; 7758 7759 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7760 mddev->delta_disks > 0) 7761 if (resize_chunks(conf, 7762 conf->previous_raid_disks 7763 + max(0, mddev->delta_disks), 7764 max(mddev->new_chunk_sectors, 7765 mddev->chunk_sectors) 7766 ) < 0) 7767 return -ENOMEM; 7768 7769 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) 7770 return 0; /* never bother to shrink */ 7771 return resize_stripes(conf, (conf->previous_raid_disks 7772 + mddev->delta_disks)); 7773 } 7774 7775 static int raid5_start_reshape(struct mddev *mddev) 7776 { 7777 struct r5conf *conf = mddev->private; 7778 struct md_rdev *rdev; 7779 int spares = 0; 7780 unsigned long flags; 7781 7782 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7783 return -EBUSY; 7784 7785 if (!check_stripe_cache(mddev)) 7786 return -ENOSPC; 7787 7788 if (has_failed(conf)) 7789 return -EINVAL; 7790 7791 rdev_for_each(rdev, mddev) { 7792 if (!test_bit(In_sync, &rdev->flags) 7793 && !test_bit(Faulty, &rdev->flags)) 7794 spares++; 7795 } 7796 7797 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7798 /* Not enough devices even to make a degraded array 7799 * of that size 7800 */ 7801 return -EINVAL; 7802 7803 /* Refuse to reduce size of the array. Any reductions in 7804 * array size must be through explicit setting of array_size 7805 * attribute. 7806 */ 7807 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7808 < mddev->array_sectors) { 7809 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 7810 mdname(mddev)); 7811 return -EINVAL; 7812 } 7813 7814 atomic_set(&conf->reshape_stripes, 0); 7815 spin_lock_irq(&conf->device_lock); 7816 write_seqcount_begin(&conf->gen_lock); 7817 conf->previous_raid_disks = conf->raid_disks; 7818 conf->raid_disks += mddev->delta_disks; 7819 conf->prev_chunk_sectors = conf->chunk_sectors; 7820 conf->chunk_sectors = mddev->new_chunk_sectors; 7821 conf->prev_algo = conf->algorithm; 7822 conf->algorithm = mddev->new_layout; 7823 conf->generation++; 7824 /* Code that selects data_offset needs to see the generation update 7825 * if reshape_progress has been set - so a memory barrier needed. 7826 */ 7827 smp_mb(); 7828 if (mddev->reshape_backwards) 7829 conf->reshape_progress = raid5_size(mddev, 0, 0); 7830 else 7831 conf->reshape_progress = 0; 7832 conf->reshape_safe = conf->reshape_progress; 7833 write_seqcount_end(&conf->gen_lock); 7834 spin_unlock_irq(&conf->device_lock); 7835 7836 /* Now make sure any requests that proceeded on the assumption 7837 * the reshape wasn't running - like Discard or Read - have 7838 * completed. 7839 */ 7840 mddev_suspend(mddev); 7841 mddev_resume(mddev); 7842 7843 /* Add some new drives, as many as will fit. 7844 * We know there are enough to make the newly sized array work. 7845 * Don't add devices if we are reducing the number of 7846 * devices in the array. This is because it is not possible 7847 * to correctly record the "partially reconstructed" state of 7848 * such devices during the reshape and confusion could result. 
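 * Devices placed in slots beyond the old geometry start out In_sync
 * (the reshape itself writes them), while a device filling a
 * pre-existing slot is recovered from offset 0.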
7849 */ 7850 if (mddev->delta_disks >= 0) { 7851 rdev_for_each(rdev, mddev) 7852 if (rdev->raid_disk < 0 && 7853 !test_bit(Faulty, &rdev->flags)) { 7854 if (raid5_add_disk(mddev, rdev) == 0) { 7855 if (rdev->raid_disk 7856 >= conf->previous_raid_disks) 7857 set_bit(In_sync, &rdev->flags); 7858 else 7859 rdev->recovery_offset = 0; 7860 7861 if (sysfs_link_rdev(mddev, rdev)) 7862 /* Failure here is OK */; 7863 } 7864 } else if (rdev->raid_disk >= conf->previous_raid_disks 7865 && !test_bit(Faulty, &rdev->flags)) { 7866 /* This is a spare that was manually added */ 7867 set_bit(In_sync, &rdev->flags); 7868 } 7869 7870 /* When a reshape changes the number of devices, 7871 * ->degraded is measured against the larger of the 7872 * pre and post number of devices. 7873 */ 7874 spin_lock_irqsave(&conf->device_lock, flags); 7875 mddev->degraded = raid5_calc_degraded(conf); 7876 spin_unlock_irqrestore(&conf->device_lock, flags); 7877 } 7878 mddev->raid_disks = conf->raid_disks; 7879 mddev->reshape_position = conf->reshape_progress; 7880 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7881 7882 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7883 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7884 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7885 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7886 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7887 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7888 "reshape"); 7889 if (!mddev->sync_thread) { 7890 mddev->recovery = 0; 7891 spin_lock_irq(&conf->device_lock); 7892 write_seqcount_begin(&conf->gen_lock); 7893 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 7894 mddev->new_chunk_sectors = 7895 conf->chunk_sectors = conf->prev_chunk_sectors; 7896 mddev->new_layout = conf->algorithm = conf->prev_algo; 7897 rdev_for_each(rdev, mddev) 7898 rdev->new_data_offset = rdev->data_offset; 7899 smp_wmb(); 7900 conf->generation --; 7901 conf->reshape_progress = MaxSector; 7902 mddev->reshape_position = MaxSector; 7903 write_seqcount_end(&conf->gen_lock); 7904 spin_unlock_irq(&conf->device_lock); 7905 return -EAGAIN; 7906 } 7907 conf->reshape_checkpoint = jiffies; 7908 md_wakeup_thread(mddev->sync_thread); 7909 md_new_event(mddev); 7910 return 0; 7911 } 7912 7913 /* This is called from the reshape thread and should make any 7914 * changes needed in 'conf' 7915 */ 7916 static void end_reshape(struct r5conf *conf) 7917 { 7918 7919 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 7920 struct md_rdev *rdev; 7921 7922 spin_lock_irq(&conf->device_lock); 7923 conf->previous_raid_disks = conf->raid_disks; 7924 rdev_for_each(rdev, conf->mddev) 7925 rdev->data_offset = rdev->new_data_offset; 7926 smp_wmb(); 7927 conf->reshape_progress = MaxSector; 7928 conf->mddev->reshape_position = MaxSector; 7929 spin_unlock_irq(&conf->device_lock); 7930 wake_up(&conf->wait_for_overlap); 7931 7932 /* read-ahead size must cover two whole stripes, which is 7933 * 2 * (number of data disks) * chunksize 7934 */ 7935 if (conf->mddev->queue) { 7936 int data_disks = conf->raid_disks - conf->max_degraded; 7937 int stripe = data_disks * ((conf->chunk_sectors << 9) 7938 / PAGE_SIZE); 7939 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7940 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7941 } 7942 } 7943 } 7944 7945 /* This is called from the raid5d thread with mddev_lock held. 7946 * It makes config changes to the device.
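 * If the array grew, the new capacity is published and the gendisk
 * revalidated; if it shrank, the devices that fell off the end are
 * marked out of sync so that they can subsequently be removed.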
7947 */ 7948 static void raid5_finish_reshape(struct mddev *mddev) 7949 { 7950 struct r5conf *conf = mddev->private; 7951 7952 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7953 7954 if (mddev->delta_disks > 0) { 7955 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7956 if (mddev->queue) { 7957 set_capacity(mddev->gendisk, mddev->array_sectors); 7958 revalidate_disk(mddev->gendisk); 7959 } 7960 } else { 7961 int d; 7962 spin_lock_irq(&conf->device_lock); 7963 mddev->degraded = raid5_calc_degraded(conf); 7964 spin_unlock_irq(&conf->device_lock); 7965 for (d = conf->raid_disks ; 7966 d < conf->raid_disks - mddev->delta_disks; 7967 d++) { 7968 struct md_rdev *rdev = conf->disks[d].rdev; 7969 if (rdev) 7970 clear_bit(In_sync, &rdev->flags); 7971 rdev = conf->disks[d].replacement; 7972 if (rdev) 7973 clear_bit(In_sync, &rdev->flags); 7974 } 7975 } 7976 mddev->layout = conf->algorithm; 7977 mddev->chunk_sectors = conf->chunk_sectors; 7978 mddev->reshape_position = MaxSector; 7979 mddev->delta_disks = 0; 7980 mddev->reshape_backwards = 0; 7981 } 7982 } 7983 7984 static void raid5_quiesce(struct mddev *mddev, int state) 7985 { 7986 struct r5conf *conf = mddev->private; 7987 7988 switch(state) { 7989 case 2: /* resume for a suspend */ 7990 wake_up(&conf->wait_for_overlap); 7991 break; 7992 7993 case 1: /* stop all writes */ 7994 lock_all_device_hash_locks_irq(conf); 7995 /* '2' tells resync/reshape to pause so that all 7996 * active stripes can drain 7997 */ 7998 r5c_flush_cache(conf, INT_MAX); 7999 conf->quiesce = 2; 8000 wait_event_cmd(conf->wait_for_quiescent, 8001 atomic_read(&conf->active_stripes) == 0 && 8002 atomic_read(&conf->active_aligned_reads) == 0, 8003 unlock_all_device_hash_locks_irq(conf), 8004 lock_all_device_hash_locks_irq(conf)); 8005 conf->quiesce = 1; 8006 unlock_all_device_hash_locks_irq(conf); 8007 /* allow reshape to continue */ 8008 wake_up(&conf->wait_for_overlap); 8009 break; 8010 8011 case 0: /* re-enable writes */ 8012 lock_all_device_hash_locks_irq(conf); 8013 conf->quiesce = 0; 8014 wake_up(&conf->wait_for_quiescent); 8015 wake_up(&conf->wait_for_overlap); 8016 unlock_all_device_hash_locks_irq(conf); 8017 break; 8018 } 8019 r5l_quiesce(conf->log, state); 8020 } 8021 8022 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8023 { 8024 struct r0conf *raid0_conf = mddev->private; 8025 sector_t sectors; 8026 8027 /* for raid0 takeover only one zone is supported */ 8028 if (raid0_conf->nr_strip_zones > 1) { 8029 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8030 mdname(mddev)); 8031 return ERR_PTR(-EINVAL); 8032 } 8033 8034 sectors = raid0_conf->strip_zone[0].zone_end; 8035 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8036 mddev->dev_sectors = sectors; 8037 mddev->new_level = level; 8038 mddev->new_layout = ALGORITHM_PARITY_N; 8039 mddev->new_chunk_sectors = mddev->chunk_sectors; 8040 mddev->raid_disks += 1; 8041 mddev->delta_disks = 1; 8042 /* make sure it will be not marked as dirty */ 8043 mddev->recovery_cp = MaxSector; 8044 8045 return setup_conf(mddev); 8046 } 8047 8048 static void *raid5_takeover_raid1(struct mddev *mddev) 8049 { 8050 int chunksect; 8051 void *ret; 8052 8053 if (mddev->raid_disks != 2 || 8054 mddev->degraded > 1) 8055 return ERR_PTR(-EINVAL); 8056 8057 /* Should check if there are write-behind devices? 
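 * The chunk size below is chosen by starting at 64K and halving it
 * until it divides the array size evenly; if it falls below
 * STRIPE_SIZE the takeover is rejected.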
*/ 8058 8059 chunksect = 64*2; /* 64K by default */ 8060 8061 /* The array must be an exact multiple of chunksize */ 8062 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8063 chunksect >>= 1; 8064 8065 if ((chunksect<<9) < STRIPE_SIZE) 8066 /* array size does not allow a suitable chunk size */ 8067 return ERR_PTR(-EINVAL); 8068 8069 mddev->new_level = 5; 8070 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8071 mddev->new_chunk_sectors = chunksect; 8072 8073 ret = setup_conf(mddev); 8074 if (!IS_ERR(ret)) 8075 mddev_clear_unsupported_flags(mddev, 8076 UNSUPPORTED_MDDEV_FLAGS); 8077 return ret; 8078 } 8079 8080 static void *raid5_takeover_raid6(struct mddev *mddev) 8081 { 8082 int new_layout; 8083 8084 switch (mddev->layout) { 8085 case ALGORITHM_LEFT_ASYMMETRIC_6: 8086 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8087 break; 8088 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8089 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8090 break; 8091 case ALGORITHM_LEFT_SYMMETRIC_6: 8092 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8093 break; 8094 case ALGORITHM_RIGHT_SYMMETRIC_6: 8095 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8096 break; 8097 case ALGORITHM_PARITY_0_6: 8098 new_layout = ALGORITHM_PARITY_0; 8099 break; 8100 case ALGORITHM_PARITY_N: 8101 new_layout = ALGORITHM_PARITY_N; 8102 break; 8103 default: 8104 return ERR_PTR(-EINVAL); 8105 } 8106 mddev->new_level = 5; 8107 mddev->new_layout = new_layout; 8108 mddev->delta_disks = -1; 8109 mddev->raid_disks -= 1; 8110 return setup_conf(mddev); 8111 } 8112 8113 static int raid5_check_reshape(struct mddev *mddev) 8114 { 8115 /* For a 2-drive array, the layout and chunk size can be changed 8116 * immediately as no restriping is needed. 8117 * For larger arrays we record the new value - after validation 8118 * to be used by a reshape pass. 8119 */ 8120 struct r5conf *conf = mddev->private; 8121 int new_chunk = mddev->new_chunk_sectors; 8122 8123 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8124 return -EINVAL; 8125 if (new_chunk > 0) { 8126 if (!is_power_of_2(new_chunk)) 8127 return -EINVAL; 8128 if (new_chunk < (PAGE_SIZE>>9)) 8129 return -EINVAL; 8130 if (mddev->array_sectors & (new_chunk-1)) 8131 /* not a factor of array size */ 8132 return -EINVAL; 8133 } 8134 8135 /* They look valid */ 8136 8137 if (mddev->raid_disks == 2) { 8138 /* can make the change immediately */ 8139 if (mddev->new_layout >= 0) { 8140 conf->algorithm = mddev->new_layout; 8141 mddev->layout = mddev->new_layout; 8142 } 8143 if (new_chunk > 0) { 8144 conf->chunk_sectors = new_chunk ; 8145 mddev->chunk_sectors = new_chunk; 8146 } 8147 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8148 md_wakeup_thread(mddev->thread); 8149 } 8150 return check_reshape(mddev); 8151 } 8152 8153 static int raid6_check_reshape(struct mddev *mddev) 8154 { 8155 int new_chunk = mddev->new_chunk_sectors; 8156 8157 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8158 return -EINVAL; 8159 if (new_chunk > 0) { 8160 if (!is_power_of_2(new_chunk)) 8161 return -EINVAL; 8162 if (new_chunk < (PAGE_SIZE >> 9)) 8163 return -EINVAL; 8164 if (mddev->array_sectors & (new_chunk-1)) 8165 /* not a factor of array size */ 8166 return -EINVAL; 8167 } 8168 8169 /* They look valid */ 8170 return check_reshape(mddev); 8171 } 8172 8173 static void *raid5_takeover(struct mddev *mddev) 8174 { 8175 /* raid5 can take over: 8176 * raid0 - if there is only one strip zone - make it a raid4 layout 8177 * raid1 - if there are two drives.
We need to know the chunk size 8178 * raid4 - trivial - just use a raid4 layout. 8179 * raid6 - Providing it is a *_6 layout 8180 */ 8181 if (mddev->level == 0) 8182 return raid45_takeover_raid0(mddev, 5); 8183 if (mddev->level == 1) 8184 return raid5_takeover_raid1(mddev); 8185 if (mddev->level == 4) { 8186 mddev->new_layout = ALGORITHM_PARITY_N; 8187 mddev->new_level = 5; 8188 return setup_conf(mddev); 8189 } 8190 if (mddev->level == 6) 8191 return raid5_takeover_raid6(mddev); 8192 8193 return ERR_PTR(-EINVAL); 8194 } 8195 8196 static void *raid4_takeover(struct mddev *mddev) 8197 { 8198 /* raid4 can take over: 8199 * raid0 - if there is only one strip zone 8200 * raid5 - if layout is right 8201 */ 8202 if (mddev->level == 0) 8203 return raid45_takeover_raid0(mddev, 4); 8204 if (mddev->level == 5 && 8205 mddev->layout == ALGORITHM_PARITY_N) { 8206 mddev->new_layout = 0; 8207 mddev->new_level = 4; 8208 return setup_conf(mddev); 8209 } 8210 return ERR_PTR(-EINVAL); 8211 } 8212 8213 static struct md_personality raid5_personality; 8214 8215 static void *raid6_takeover(struct mddev *mddev) 8216 { 8217 /* Currently can only take over a raid5. We map the 8218 * personality to an equivalent raid6 personality 8219 * with the Q block at the end. 8220 */ 8221 int new_layout; 8222 8223 if (mddev->pers != &raid5_personality) 8224 return ERR_PTR(-EINVAL); 8225 if (mddev->degraded > 1) 8226 return ERR_PTR(-EINVAL); 8227 if (mddev->raid_disks > 253) 8228 return ERR_PTR(-EINVAL); 8229 if (mddev->raid_disks < 3) 8230 return ERR_PTR(-EINVAL); 8231 8232 switch (mddev->layout) { 8233 case ALGORITHM_LEFT_ASYMMETRIC: 8234 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8235 break; 8236 case ALGORITHM_RIGHT_ASYMMETRIC: 8237 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8238 break; 8239 case ALGORITHM_LEFT_SYMMETRIC: 8240 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8241 break; 8242 case ALGORITHM_RIGHT_SYMMETRIC: 8243 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8244 break; 8245 case ALGORITHM_PARITY_0: 8246 new_layout = ALGORITHM_PARITY_0_6; 8247 break; 8248 case ALGORITHM_PARITY_N: 8249 new_layout = ALGORITHM_PARITY_N; 8250 break; 8251 default: 8252 return ERR_PTR(-EINVAL); 8253 } 8254 mddev->new_level = 6; 8255 mddev->new_layout = new_layout; 8256 mddev->delta_disks = 1; 8257 mddev->raid_disks += 1; 8258 return setup_conf(mddev); 8259 } 8260 8261 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8262 { 8263 struct r5conf *conf; 8264 int err; 8265 8266 err = mddev_lock(mddev); 8267 if (err) 8268 return err; 8269 conf = mddev->private; 8270 if (!conf) { 8271 mddev_unlock(mddev); 8272 return -ENODEV; 8273 } 8274 8275 if (strncmp(buf, "ppl", 3) == 0) { 8276 /* ppl only works with RAID 5 */ 8277 if (!raid5_has_ppl(conf) && conf->level == 5) { 8278 err = log_init(conf, NULL, true); 8279 if (!err) { 8280 err = resize_stripes(conf, conf->pool_size); 8281 if (err) 8282 log_exit(conf); 8283 } 8284 } else 8285 err = -EINVAL; 8286 } else if (strncmp(buf, "resync", 6) == 0) { 8287 if (raid5_has_ppl(conf)) { 8288 mddev_suspend(mddev); 8289 log_exit(conf); 8290 mddev_resume(mddev); 8291 err = resize_stripes(conf, conf->pool_size); 8292 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 8293 r5l_log_disk_error(conf)) { 8294 bool journal_dev_exists = false; 8295 struct md_rdev *rdev; 8296 8297 rdev_for_each(rdev, mddev) 8298 if (test_bit(Journal, &rdev->flags)) { 8299 journal_dev_exists = true; 8300 break; 8301 } 8302 8303 if (!journal_dev_exists) { 8304 mddev_suspend(mddev); 8305 
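/* no journal rdev is left, so the flag can be dropped while the array is quiesced */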
clear_bit(MD_HAS_JOURNAL, &mddev->flags); 8306 mddev_resume(mddev); 8307 } else /* need remove journal device first */ 8308 err = -EBUSY; 8309 } else 8310 err = -EINVAL; 8311 } else { 8312 err = -EINVAL; 8313 } 8314 8315 if (!err) 8316 md_update_sb(mddev, 1); 8317 8318 mddev_unlock(mddev); 8319 8320 return err; 8321 } 8322 8323 static struct md_personality raid6_personality = 8324 { 8325 .name = "raid6", 8326 .level = 6, 8327 .owner = THIS_MODULE, 8328 .make_request = raid5_make_request, 8329 .run = raid5_run, 8330 .free = raid5_free, 8331 .status = raid5_status, 8332 .error_handler = raid5_error, 8333 .hot_add_disk = raid5_add_disk, 8334 .hot_remove_disk= raid5_remove_disk, 8335 .spare_active = raid5_spare_active, 8336 .sync_request = raid5_sync_request, 8337 .resize = raid5_resize, 8338 .size = raid5_size, 8339 .check_reshape = raid6_check_reshape, 8340 .start_reshape = raid5_start_reshape, 8341 .finish_reshape = raid5_finish_reshape, 8342 .quiesce = raid5_quiesce, 8343 .takeover = raid6_takeover, 8344 .congested = raid5_congested, 8345 .change_consistency_policy = raid5_change_consistency_policy, 8346 }; 8347 static struct md_personality raid5_personality = 8348 { 8349 .name = "raid5", 8350 .level = 5, 8351 .owner = THIS_MODULE, 8352 .make_request = raid5_make_request, 8353 .run = raid5_run, 8354 .free = raid5_free, 8355 .status = raid5_status, 8356 .error_handler = raid5_error, 8357 .hot_add_disk = raid5_add_disk, 8358 .hot_remove_disk= raid5_remove_disk, 8359 .spare_active = raid5_spare_active, 8360 .sync_request = raid5_sync_request, 8361 .resize = raid5_resize, 8362 .size = raid5_size, 8363 .check_reshape = raid5_check_reshape, 8364 .start_reshape = raid5_start_reshape, 8365 .finish_reshape = raid5_finish_reshape, 8366 .quiesce = raid5_quiesce, 8367 .takeover = raid5_takeover, 8368 .congested = raid5_congested, 8369 .change_consistency_policy = raid5_change_consistency_policy, 8370 }; 8371 8372 static struct md_personality raid4_personality = 8373 { 8374 .name = "raid4", 8375 .level = 4, 8376 .owner = THIS_MODULE, 8377 .make_request = raid5_make_request, 8378 .run = raid5_run, 8379 .free = raid5_free, 8380 .status = raid5_status, 8381 .error_handler = raid5_error, 8382 .hot_add_disk = raid5_add_disk, 8383 .hot_remove_disk= raid5_remove_disk, 8384 .spare_active = raid5_spare_active, 8385 .sync_request = raid5_sync_request, 8386 .resize = raid5_resize, 8387 .size = raid5_size, 8388 .check_reshape = raid5_check_reshape, 8389 .start_reshape = raid5_start_reshape, 8390 .finish_reshape = raid5_finish_reshape, 8391 .quiesce = raid5_quiesce, 8392 .takeover = raid4_takeover, 8393 .congested = raid5_congested, 8394 .change_consistency_policy = raid5_change_consistency_policy, 8395 }; 8396 8397 static int __init raid5_init(void) 8398 { 8399 int ret; 8400 8401 raid5_wq = alloc_workqueue("raid5wq", 8402 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 8403 if (!raid5_wq) 8404 return -ENOMEM; 8405 8406 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 8407 "md/raid5:prepare", 8408 raid456_cpu_up_prepare, 8409 raid456_cpu_dead); 8410 if (ret) { 8411 destroy_workqueue(raid5_wq); 8412 return ret; 8413 } 8414 register_md_personality(&raid6_personality); 8415 register_md_personality(&raid5_personality); 8416 register_md_personality(&raid4_personality); 8417 return 0; 8418 } 8419 8420 static void raid5_exit(void) 8421 { 8422 unregister_md_personality(&raid6_personality); 8423 unregister_md_personality(&raid5_personality); 8424 unregister_md_personality(&raid4_personality); 8425 
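/* Tear down in the reverse order of raid5_init(). */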
cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 8426 destroy_workqueue(raid5_wq); 8427 } 8428 8429 module_init(raid5_init); 8430 module_exit(raid5_exit); 8431 MODULE_LICENSE("GPL"); 8432 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 8433 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 8434 MODULE_ALIAS("md-raid5"); 8435 MODULE_ALIAS("md-raid4"); 8436 MODULE_ALIAS("md-level-5"); 8437 MODULE_ALIAS("md-level-4"); 8438 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 8439 MODULE_ALIAS("md-raid6"); 8440 MODULE_ALIAS("md-level-6"); 8441 8442 /* This used to be two separate modules, they were: */ 8443 MODULE_ALIAS("raid5"); 8444 MODULE_ALIAS("raid6"); 8445