1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <linux/nodemask.h> 57 #include <linux/flex_array.h> 58 #include <linux/sched/signal.h> 59 60 #include <trace/events/block.h> 61 #include <linux/list_sort.h> 62 63 #include "md.h" 64 #include "raid5.h" 65 #include "raid0.h" 66 #include "bitmap.h" 67 #include "raid5-log.h" 68 69 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) 70 71 #define cpu_to_group(cpu) cpu_to_node(cpu) 72 #define ANY_GROUP NUMA_NO_NODE 73 74 static bool devices_handle_discard_safely = false; 75 module_param(devices_handle_discard_safely, bool, 0644); 76 MODULE_PARM_DESC(devices_handle_discard_safely, 77 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 78 static struct workqueue_struct *raid5_wq; 79 80 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 81 { 82 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 83 return &conf->stripe_hashtbl[hash]; 84 } 85 86 static inline int stripe_hash_locks_hash(sector_t sect) 87 { 88 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; 89 } 90 91 static inline void lock_device_hash_lock(struct r5conf *conf, int hash) 92 { 93 spin_lock_irq(conf->hash_locks + hash); 94 spin_lock(&conf->device_lock); 95 } 96 97 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) 98 { 99 spin_unlock(&conf->device_lock); 100 spin_unlock_irq(conf->hash_locks + hash); 101 } 102 103 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) 104 { 105 int i; 106 spin_lock_irq(conf->hash_locks); 107 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 108 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); 109 spin_lock(&conf->device_lock); 110 } 111 112 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) 113 { 114 int i; 115 spin_unlock(&conf->device_lock); 116 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) 117 spin_unlock(conf->hash_locks + i); 118 spin_unlock_irq(conf->hash_locks); 119 } 120 121 /* Find first data disk in a raid6 stripe */ 122 static inline int raid6_d0(struct stripe_head *sh) 123 { 124 if (sh->ddf_layout) 125 /* ddf always start from first device */ 126 return 0; 127 /* md starts just after Q block */ 128 if (sh->qd_idx == sh->disks - 1) 129 return 0; 130 else 131 return sh->qd_idx + 1; 132 } 133 static inline int raid6_next_disk(int disk, int raid_disks) 134 { 135 disk++; 136 return (disk < raid_disks) ? disk : 0; 137 } 138 139 /* When walking through the disks in a raid5, starting at raid6_d0, 140 * We need to map each disk to a 'slot', where the data disks are slot 141 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 142 * is raid_disks-1. This help does that mapping. 143 */ 144 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 145 int *count, int syndrome_disks) 146 { 147 int slot = *count; 148 149 if (sh->ddf_layout) 150 (*count)++; 151 if (idx == sh->pd_idx) 152 return syndrome_disks; 153 if (idx == sh->qd_idx) 154 return syndrome_disks + 1; 155 if (!sh->ddf_layout) 156 (*count)++; 157 return slot; 158 } 159 160 static void print_raid5_conf (struct r5conf *conf); 161 162 static int stripe_operations_active(struct stripe_head *sh) 163 { 164 return sh->check_state || sh->reconstruct_state || 165 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 166 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 167 } 168 169 static bool stripe_is_lowprio(struct stripe_head *sh) 170 { 171 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || 172 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && 173 !test_bit(STRIPE_R5C_CACHING, &sh->state); 174 } 175 176 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 177 { 178 struct r5conf *conf = sh->raid_conf; 179 struct r5worker_group *group; 180 int thread_cnt; 181 int i, cpu = sh->cpu; 182 183 if (!cpu_online(cpu)) { 184 cpu = cpumask_any(cpu_online_mask); 185 sh->cpu = cpu; 186 } 187 188 if (list_empty(&sh->lru)) { 189 struct r5worker_group *group; 190 group = conf->worker_groups + cpu_to_group(cpu); 191 if (stripe_is_lowprio(sh)) 192 list_add_tail(&sh->lru, &group->loprio_list); 193 else 194 list_add_tail(&sh->lru, &group->handle_list); 195 group->stripes_cnt++; 196 sh->group = group; 197 } 198 199 if (conf->worker_cnt_per_group == 0) { 200 md_wakeup_thread(conf->mddev->thread); 201 return; 202 } 203 204 group = conf->worker_groups + cpu_to_group(sh->cpu); 205 206 group->workers[0].working = true; 207 /* at least one worker should run to avoid race */ 208 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 209 210 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 211 /* wakeup more workers */ 212 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 213 if (group->workers[i].working == false) { 214 group->workers[i].working = true; 215 queue_work_on(sh->cpu, raid5_wq, 216 &group->workers[i].work); 217 thread_cnt--; 218 } 219 } 220 } 221 222 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 223 struct list_head *temp_inactive_list) 224 { 225 int i; 226 int injournal = 0; /* number of date pages with R5_InJournal */ 227 228 BUG_ON(!list_empty(&sh->lru)); 229 BUG_ON(atomic_read(&conf->active_stripes)==0); 230 231 if (r5c_is_writeback(conf->log)) 232 for (i = sh->disks; i--; ) 233 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 234 injournal++; 235 /* 236 * In the following cases, the stripe cannot be released to cached 237 * lists. Therefore, we make the stripe write out and set 238 * STRIPE_HANDLE: 239 * 1. when quiesce in r5c write back; 240 * 2. when resync is requested fot the stripe. 241 */ 242 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) || 243 (conf->quiesce && r5c_is_writeback(conf->log) && 244 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) { 245 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 246 r5c_make_stripe_write_out(sh); 247 set_bit(STRIPE_HANDLE, &sh->state); 248 } 249 250 if (test_bit(STRIPE_HANDLE, &sh->state)) { 251 if (test_bit(STRIPE_DELAYED, &sh->state) && 252 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 253 list_add_tail(&sh->lru, &conf->delayed_list); 254 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 255 sh->bm_seq - conf->seq_write > 0) 256 list_add_tail(&sh->lru, &conf->bitmap_list); 257 else { 258 clear_bit(STRIPE_DELAYED, &sh->state); 259 clear_bit(STRIPE_BIT_DELAY, &sh->state); 260 if (conf->worker_cnt_per_group == 0) { 261 if (stripe_is_lowprio(sh)) 262 list_add_tail(&sh->lru, 263 &conf->loprio_list); 264 else 265 list_add_tail(&sh->lru, 266 &conf->handle_list); 267 } else { 268 raid5_wakeup_stripe_thread(sh); 269 return; 270 } 271 } 272 md_wakeup_thread(conf->mddev->thread); 273 } else { 274 BUG_ON(stripe_operations_active(sh)); 275 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 276 if (atomic_dec_return(&conf->preread_active_stripes) 277 < IO_THRESHOLD) 278 md_wakeup_thread(conf->mddev->thread); 279 atomic_dec(&conf->active_stripes); 280 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 281 if (!r5c_is_writeback(conf->log)) 282 list_add_tail(&sh->lru, temp_inactive_list); 283 else { 284 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); 285 if (injournal == 0) 286 list_add_tail(&sh->lru, temp_inactive_list); 287 else if (injournal == conf->raid_disks - conf->max_degraded) { 288 /* full stripe */ 289 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) 290 atomic_inc(&conf->r5c_cached_full_stripes); 291 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 292 atomic_dec(&conf->r5c_cached_partial_stripes); 293 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); 294 r5c_check_cached_full_stripe(conf); 295 } else 296 /* 297 * STRIPE_R5C_PARTIAL_STRIPE is set in 298 * r5c_try_caching_write(). No need to 299 * set it again. 300 */ 301 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); 302 } 303 } 304 } 305 } 306 307 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, 308 struct list_head *temp_inactive_list) 309 { 310 if (atomic_dec_and_test(&sh->count)) 311 do_release_stripe(conf, sh, temp_inactive_list); 312 } 313 314 /* 315 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list 316 * 317 * Be careful: Only one task can add/delete stripes from temp_inactive_list at 318 * given time. Adding stripes only takes device lock, while deleting stripes 319 * only takes hash lock. 320 */ 321 static void release_inactive_stripe_list(struct r5conf *conf, 322 struct list_head *temp_inactive_list, 323 int hash) 324 { 325 int size; 326 bool do_wakeup = false; 327 unsigned long flags; 328 329 if (hash == NR_STRIPE_HASH_LOCKS) { 330 size = NR_STRIPE_HASH_LOCKS; 331 hash = NR_STRIPE_HASH_LOCKS - 1; 332 } else 333 size = 1; 334 while (size) { 335 struct list_head *list = &temp_inactive_list[size - 1]; 336 337 /* 338 * We don't hold any lock here yet, raid5_get_active_stripe() might 339 * remove stripes from the list 340 */ 341 if (!list_empty_careful(list)) { 342 spin_lock_irqsave(conf->hash_locks + hash, flags); 343 if (list_empty(conf->inactive_list + hash) && 344 !list_empty(list)) 345 atomic_dec(&conf->empty_inactive_list_nr); 346 list_splice_tail_init(list, conf->inactive_list + hash); 347 do_wakeup = true; 348 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 349 } 350 size--; 351 hash--; 352 } 353 354 if (do_wakeup) { 355 wake_up(&conf->wait_for_stripe); 356 if (atomic_read(&conf->active_stripes) == 0) 357 wake_up(&conf->wait_for_quiescent); 358 if (conf->retry_read_aligned) 359 md_wakeup_thread(conf->mddev->thread); 360 } 361 } 362 363 /* should hold conf->device_lock already */ 364 static int release_stripe_list(struct r5conf *conf, 365 struct list_head *temp_inactive_list) 366 { 367 struct stripe_head *sh, *t; 368 int count = 0; 369 struct llist_node *head; 370 371 head = llist_del_all(&conf->released_stripes); 372 head = llist_reverse_order(head); 373 llist_for_each_entry_safe(sh, t, head, release_list) { 374 int hash; 375 376 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 377 smp_mb(); 378 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 379 /* 380 * Don't worry the bit is set here, because if the bit is set 381 * again, the count is always > 1. This is true for 382 * STRIPE_ON_UNPLUG_LIST bit too. 383 */ 384 hash = sh->hash_lock_index; 385 __release_stripe(conf, sh, &temp_inactive_list[hash]); 386 count++; 387 } 388 389 return count; 390 } 391 392 void raid5_release_stripe(struct stripe_head *sh) 393 { 394 struct r5conf *conf = sh->raid_conf; 395 unsigned long flags; 396 struct list_head list; 397 int hash; 398 bool wakeup; 399 400 /* Avoid release_list until the last reference. 401 */ 402 if (atomic_add_unless(&sh->count, -1, 1)) 403 return; 404 405 if (unlikely(!conf->mddev->thread) || 406 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 407 goto slow_path; 408 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 409 if (wakeup) 410 md_wakeup_thread(conf->mddev->thread); 411 return; 412 slow_path: 413 local_irq_save(flags); 414 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 415 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 416 INIT_LIST_HEAD(&list); 417 hash = sh->hash_lock_index; 418 do_release_stripe(conf, sh, &list); 419 spin_unlock(&conf->device_lock); 420 release_inactive_stripe_list(conf, &list, hash); 421 } 422 local_irq_restore(flags); 423 } 424 425 static inline void remove_hash(struct stripe_head *sh) 426 { 427 pr_debug("remove_hash(), stripe %llu\n", 428 (unsigned long long)sh->sector); 429 430 hlist_del_init(&sh->hash); 431 } 432 433 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 434 { 435 struct hlist_head *hp = stripe_hash(conf, sh->sector); 436 437 pr_debug("insert_hash(), stripe %llu\n", 438 (unsigned long long)sh->sector); 439 440 hlist_add_head(&sh->hash, hp); 441 } 442 443 /* find an idle stripe, make sure it is unhashed, and return it. */ 444 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 445 { 446 struct stripe_head *sh = NULL; 447 struct list_head *first; 448 449 if (list_empty(conf->inactive_list + hash)) 450 goto out; 451 first = (conf->inactive_list + hash)->next; 452 sh = list_entry(first, struct stripe_head, lru); 453 list_del_init(first); 454 remove_hash(sh); 455 atomic_inc(&conf->active_stripes); 456 BUG_ON(hash != sh->hash_lock_index); 457 if (list_empty(conf->inactive_list + hash)) 458 atomic_inc(&conf->empty_inactive_list_nr); 459 out: 460 return sh; 461 } 462 463 static void shrink_buffers(struct stripe_head *sh) 464 { 465 struct page *p; 466 int i; 467 int num = sh->raid_conf->pool_size; 468 469 for (i = 0; i < num ; i++) { 470 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 471 p = sh->dev[i].page; 472 if (!p) 473 continue; 474 sh->dev[i].page = NULL; 475 put_page(p); 476 } 477 } 478 479 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 480 { 481 int i; 482 int num = sh->raid_conf->pool_size; 483 484 for (i = 0; i < num; i++) { 485 struct page *page; 486 487 if (!(page = alloc_page(gfp))) { 488 return 1; 489 } 490 sh->dev[i].page = page; 491 sh->dev[i].orig_page = page; 492 } 493 494 return 0; 495 } 496 497 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 498 struct stripe_head *sh); 499 500 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 501 { 502 struct r5conf *conf = sh->raid_conf; 503 int i, seq; 504 505 BUG_ON(atomic_read(&sh->count) != 0); 506 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 507 BUG_ON(stripe_operations_active(sh)); 508 BUG_ON(sh->batch_head); 509 510 pr_debug("init_stripe called, stripe %llu\n", 511 (unsigned long long)sector); 512 retry: 513 seq = read_seqcount_begin(&conf->gen_lock); 514 sh->generation = conf->generation - previous; 515 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 516 sh->sector = sector; 517 stripe_set_idx(sector, conf, previous, sh); 518 sh->state = 0; 519 520 for (i = sh->disks; i--; ) { 521 struct r5dev *dev = &sh->dev[i]; 522 523 if (dev->toread || dev->read || dev->towrite || dev->written || 524 test_bit(R5_LOCKED, &dev->flags)) { 525 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 526 (unsigned long long)sh->sector, i, dev->toread, 527 dev->read, dev->towrite, dev->written, 528 test_bit(R5_LOCKED, &dev->flags)); 529 WARN_ON(1); 530 } 531 dev->flags = 0; 532 dev->sector = raid5_compute_blocknr(sh, i, previous); 533 } 534 if (read_seqcount_retry(&conf->gen_lock, seq)) 535 goto retry; 536 sh->overwrite_disks = 0; 537 insert_hash(conf, sh); 538 sh->cpu = smp_processor_id(); 539 set_bit(STRIPE_BATCH_READY, &sh->state); 540 } 541 542 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 543 short generation) 544 { 545 struct stripe_head *sh; 546 547 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 548 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 549 if (sh->sector == sector && sh->generation == generation) 550 return sh; 551 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 552 return NULL; 553 } 554 555 /* 556 * Need to check if array has failed when deciding whether to: 557 * - start an array 558 * - remove non-faulty devices 559 * - add a spare 560 * - allow a reshape 561 * This determination is simple when no reshape is happening. 562 * However if there is a reshape, we need to carefully check 563 * both the before and after sections. 564 * This is because some failed devices may only affect one 565 * of the two sections, and some non-in_sync devices may 566 * be insync in the section most affected by failed devices. 567 */ 568 int raid5_calc_degraded(struct r5conf *conf) 569 { 570 int degraded, degraded2; 571 int i; 572 573 rcu_read_lock(); 574 degraded = 0; 575 for (i = 0; i < conf->previous_raid_disks; i++) { 576 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 577 if (rdev && test_bit(Faulty, &rdev->flags)) 578 rdev = rcu_dereference(conf->disks[i].replacement); 579 if (!rdev || test_bit(Faulty, &rdev->flags)) 580 degraded++; 581 else if (test_bit(In_sync, &rdev->flags)) 582 ; 583 else 584 /* not in-sync or faulty. 585 * If the reshape increases the number of devices, 586 * this is being recovered by the reshape, so 587 * this 'previous' section is not in_sync. 588 * If the number of devices is being reduced however, 589 * the device can only be part of the array if 590 * we are reverting a reshape, so this section will 591 * be in-sync. 592 */ 593 if (conf->raid_disks >= conf->previous_raid_disks) 594 degraded++; 595 } 596 rcu_read_unlock(); 597 if (conf->raid_disks == conf->previous_raid_disks) 598 return degraded; 599 rcu_read_lock(); 600 degraded2 = 0; 601 for (i = 0; i < conf->raid_disks; i++) { 602 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 603 if (rdev && test_bit(Faulty, &rdev->flags)) 604 rdev = rcu_dereference(conf->disks[i].replacement); 605 if (!rdev || test_bit(Faulty, &rdev->flags)) 606 degraded2++; 607 else if (test_bit(In_sync, &rdev->flags)) 608 ; 609 else 610 /* not in-sync or faulty. 611 * If reshape increases the number of devices, this 612 * section has already been recovered, else it 613 * almost certainly hasn't. 614 */ 615 if (conf->raid_disks <= conf->previous_raid_disks) 616 degraded2++; 617 } 618 rcu_read_unlock(); 619 if (degraded2 > degraded) 620 return degraded2; 621 return degraded; 622 } 623 624 static int has_failed(struct r5conf *conf) 625 { 626 int degraded; 627 628 if (conf->mddev->reshape_position == MaxSector) 629 return conf->mddev->degraded > conf->max_degraded; 630 631 degraded = raid5_calc_degraded(conf); 632 if (degraded > conf->max_degraded) 633 return 1; 634 return 0; 635 } 636 637 struct stripe_head * 638 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 639 int previous, int noblock, int noquiesce) 640 { 641 struct stripe_head *sh; 642 int hash = stripe_hash_locks_hash(sector); 643 int inc_empty_inactive_list_flag; 644 645 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 646 647 spin_lock_irq(conf->hash_locks + hash); 648 649 do { 650 wait_event_lock_irq(conf->wait_for_quiescent, 651 conf->quiesce == 0 || noquiesce, 652 *(conf->hash_locks + hash)); 653 sh = __find_stripe(conf, sector, conf->generation - previous); 654 if (!sh) { 655 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 656 sh = get_free_stripe(conf, hash); 657 if (!sh && !test_bit(R5_DID_ALLOC, 658 &conf->cache_state)) 659 set_bit(R5_ALLOC_MORE, 660 &conf->cache_state); 661 } 662 if (noblock && sh == NULL) 663 break; 664 665 r5c_check_stripe_cache_usage(conf); 666 if (!sh) { 667 set_bit(R5_INACTIVE_BLOCKED, 668 &conf->cache_state); 669 r5l_wake_reclaim(conf->log, 0); 670 wait_event_lock_irq( 671 conf->wait_for_stripe, 672 !list_empty(conf->inactive_list + hash) && 673 (atomic_read(&conf->active_stripes) 674 < (conf->max_nr_stripes * 3 / 4) 675 || !test_bit(R5_INACTIVE_BLOCKED, 676 &conf->cache_state)), 677 *(conf->hash_locks + hash)); 678 clear_bit(R5_INACTIVE_BLOCKED, 679 &conf->cache_state); 680 } else { 681 init_stripe(sh, sector, previous); 682 atomic_inc(&sh->count); 683 } 684 } else if (!atomic_inc_not_zero(&sh->count)) { 685 spin_lock(&conf->device_lock); 686 if (!atomic_read(&sh->count)) { 687 if (!test_bit(STRIPE_HANDLE, &sh->state)) 688 atomic_inc(&conf->active_stripes); 689 BUG_ON(list_empty(&sh->lru) && 690 !test_bit(STRIPE_EXPANDING, &sh->state)); 691 inc_empty_inactive_list_flag = 0; 692 if (!list_empty(conf->inactive_list + hash)) 693 inc_empty_inactive_list_flag = 1; 694 list_del_init(&sh->lru); 695 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 696 atomic_inc(&conf->empty_inactive_list_nr); 697 if (sh->group) { 698 sh->group->stripes_cnt--; 699 sh->group = NULL; 700 } 701 } 702 atomic_inc(&sh->count); 703 spin_unlock(&conf->device_lock); 704 } 705 } while (sh == NULL); 706 707 spin_unlock_irq(conf->hash_locks + hash); 708 return sh; 709 } 710 711 static bool is_full_stripe_write(struct stripe_head *sh) 712 { 713 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 714 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 715 } 716 717 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 718 { 719 if (sh1 > sh2) { 720 spin_lock_irq(&sh2->stripe_lock); 721 spin_lock_nested(&sh1->stripe_lock, 1); 722 } else { 723 spin_lock_irq(&sh1->stripe_lock); 724 spin_lock_nested(&sh2->stripe_lock, 1); 725 } 726 } 727 728 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 729 { 730 spin_unlock(&sh1->stripe_lock); 731 spin_unlock_irq(&sh2->stripe_lock); 732 } 733 734 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 735 static bool stripe_can_batch(struct stripe_head *sh) 736 { 737 struct r5conf *conf = sh->raid_conf; 738 739 if (conf->log || raid5_has_ppl(conf)) 740 return false; 741 return test_bit(STRIPE_BATCH_READY, &sh->state) && 742 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 743 is_full_stripe_write(sh); 744 } 745 746 /* we only do back search */ 747 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 748 { 749 struct stripe_head *head; 750 sector_t head_sector, tmp_sec; 751 int hash; 752 int dd_idx; 753 int inc_empty_inactive_list_flag; 754 755 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 756 tmp_sec = sh->sector; 757 if (!sector_div(tmp_sec, conf->chunk_sectors)) 758 return; 759 head_sector = sh->sector - STRIPE_SECTORS; 760 761 hash = stripe_hash_locks_hash(head_sector); 762 spin_lock_irq(conf->hash_locks + hash); 763 head = __find_stripe(conf, head_sector, conf->generation); 764 if (head && !atomic_inc_not_zero(&head->count)) { 765 spin_lock(&conf->device_lock); 766 if (!atomic_read(&head->count)) { 767 if (!test_bit(STRIPE_HANDLE, &head->state)) 768 atomic_inc(&conf->active_stripes); 769 BUG_ON(list_empty(&head->lru) && 770 !test_bit(STRIPE_EXPANDING, &head->state)); 771 inc_empty_inactive_list_flag = 0; 772 if (!list_empty(conf->inactive_list + hash)) 773 inc_empty_inactive_list_flag = 1; 774 list_del_init(&head->lru); 775 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 776 atomic_inc(&conf->empty_inactive_list_nr); 777 if (head->group) { 778 head->group->stripes_cnt--; 779 head->group = NULL; 780 } 781 } 782 atomic_inc(&head->count); 783 spin_unlock(&conf->device_lock); 784 } 785 spin_unlock_irq(conf->hash_locks + hash); 786 787 if (!head) 788 return; 789 if (!stripe_can_batch(head)) 790 goto out; 791 792 lock_two_stripes(head, sh); 793 /* clear_batch_ready clear the flag */ 794 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 795 goto unlock_out; 796 797 if (sh->batch_head) 798 goto unlock_out; 799 800 dd_idx = 0; 801 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 802 dd_idx++; 803 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 804 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 805 goto unlock_out; 806 807 if (head->batch_head) { 808 spin_lock(&head->batch_head->batch_lock); 809 /* This batch list is already running */ 810 if (!stripe_can_batch(head)) { 811 spin_unlock(&head->batch_head->batch_lock); 812 goto unlock_out; 813 } 814 /* 815 * We must assign batch_head of this stripe within the 816 * batch_lock, otherwise clear_batch_ready of batch head 817 * stripe could clear BATCH_READY bit of this stripe and 818 * this stripe->batch_head doesn't get assigned, which 819 * could confuse clear_batch_ready for this stripe 820 */ 821 sh->batch_head = head->batch_head; 822 823 /* 824 * at this point, head's BATCH_READY could be cleared, but we 825 * can still add the stripe to batch list 826 */ 827 list_add(&sh->batch_list, &head->batch_list); 828 spin_unlock(&head->batch_head->batch_lock); 829 } else { 830 head->batch_head = head; 831 sh->batch_head = head->batch_head; 832 spin_lock(&head->batch_lock); 833 list_add_tail(&sh->batch_list, &head->batch_list); 834 spin_unlock(&head->batch_lock); 835 } 836 837 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 838 if (atomic_dec_return(&conf->preread_active_stripes) 839 < IO_THRESHOLD) 840 md_wakeup_thread(conf->mddev->thread); 841 842 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 843 int seq = sh->bm_seq; 844 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 845 sh->batch_head->bm_seq > seq) 846 seq = sh->batch_head->bm_seq; 847 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 848 sh->batch_head->bm_seq = seq; 849 } 850 851 atomic_inc(&sh->count); 852 unlock_out: 853 unlock_two_stripes(head, sh); 854 out: 855 raid5_release_stripe(head); 856 } 857 858 /* Determine if 'data_offset' or 'new_data_offset' should be used 859 * in this stripe_head. 860 */ 861 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 862 { 863 sector_t progress = conf->reshape_progress; 864 /* Need a memory barrier to make sure we see the value 865 * of conf->generation, or ->data_offset that was set before 866 * reshape_progress was updated. 867 */ 868 smp_rmb(); 869 if (progress == MaxSector) 870 return 0; 871 if (sh->generation == conf->generation - 1) 872 return 0; 873 /* We are in a reshape, and this is a new-generation stripe, 874 * so use new_data_offset. 875 */ 876 return 1; 877 } 878 879 static void dispatch_bio_list(struct bio_list *tmp) 880 { 881 struct bio *bio; 882 883 while ((bio = bio_list_pop(tmp))) 884 generic_make_request(bio); 885 } 886 887 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) 888 { 889 const struct r5pending_data *da = list_entry(a, 890 struct r5pending_data, sibling); 891 const struct r5pending_data *db = list_entry(b, 892 struct r5pending_data, sibling); 893 if (da->sector > db->sector) 894 return 1; 895 if (da->sector < db->sector) 896 return -1; 897 return 0; 898 } 899 900 static void dispatch_defer_bios(struct r5conf *conf, int target, 901 struct bio_list *list) 902 { 903 struct r5pending_data *data; 904 struct list_head *first, *next = NULL; 905 int cnt = 0; 906 907 if (conf->pending_data_cnt == 0) 908 return; 909 910 list_sort(NULL, &conf->pending_list, cmp_stripe); 911 912 first = conf->pending_list.next; 913 914 /* temporarily move the head */ 915 if (conf->next_pending_data) 916 list_move_tail(&conf->pending_list, 917 &conf->next_pending_data->sibling); 918 919 while (!list_empty(&conf->pending_list)) { 920 data = list_first_entry(&conf->pending_list, 921 struct r5pending_data, sibling); 922 if (&data->sibling == first) 923 first = data->sibling.next; 924 next = data->sibling.next; 925 926 bio_list_merge(list, &data->bios); 927 list_move(&data->sibling, &conf->free_list); 928 cnt++; 929 if (cnt >= target) 930 break; 931 } 932 conf->pending_data_cnt -= cnt; 933 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 934 935 if (next != &conf->pending_list) 936 conf->next_pending_data = list_entry(next, 937 struct r5pending_data, sibling); 938 else 939 conf->next_pending_data = NULL; 940 /* list isn't empty */ 941 if (first != &conf->pending_list) 942 list_move_tail(&conf->pending_list, first); 943 } 944 945 static void flush_deferred_bios(struct r5conf *conf) 946 { 947 struct bio_list tmp = BIO_EMPTY_LIST; 948 949 if (conf->pending_data_cnt == 0) 950 return; 951 952 spin_lock(&conf->pending_bios_lock); 953 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 954 BUG_ON(conf->pending_data_cnt != 0); 955 spin_unlock(&conf->pending_bios_lock); 956 957 dispatch_bio_list(&tmp); 958 } 959 960 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 961 struct bio_list *bios) 962 { 963 struct bio_list tmp = BIO_EMPTY_LIST; 964 struct r5pending_data *ent; 965 966 spin_lock(&conf->pending_bios_lock); 967 ent = list_first_entry(&conf->free_list, struct r5pending_data, 968 sibling); 969 list_move_tail(&ent->sibling, &conf->pending_list); 970 ent->sector = sector; 971 bio_list_init(&ent->bios); 972 bio_list_merge(&ent->bios, bios); 973 conf->pending_data_cnt++; 974 if (conf->pending_data_cnt >= PENDING_IO_MAX) 975 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 976 977 spin_unlock(&conf->pending_bios_lock); 978 979 dispatch_bio_list(&tmp); 980 } 981 982 static void 983 raid5_end_read_request(struct bio *bi); 984 static void 985 raid5_end_write_request(struct bio *bi); 986 987 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 988 { 989 struct r5conf *conf = sh->raid_conf; 990 int i, disks = sh->disks; 991 struct stripe_head *head_sh = sh; 992 struct bio_list pending_bios = BIO_EMPTY_LIST; 993 bool should_defer; 994 995 might_sleep(); 996 997 if (log_stripe(sh, s) == 0) 998 return; 999 1000 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 1001 1002 for (i = disks; i--; ) { 1003 int op, op_flags = 0; 1004 int replace_only = 0; 1005 struct bio *bi, *rbi; 1006 struct md_rdev *rdev, *rrdev = NULL; 1007 1008 sh = head_sh; 1009 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1010 op = REQ_OP_WRITE; 1011 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1012 op_flags = REQ_FUA; 1013 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1014 op = REQ_OP_DISCARD; 1015 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1016 op = REQ_OP_READ; 1017 else if (test_and_clear_bit(R5_WantReplace, 1018 &sh->dev[i].flags)) { 1019 op = REQ_OP_WRITE; 1020 replace_only = 1; 1021 } else 1022 continue; 1023 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1024 op_flags |= REQ_SYNC; 1025 1026 again: 1027 bi = &sh->dev[i].req; 1028 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 1029 1030 rcu_read_lock(); 1031 rrdev = rcu_dereference(conf->disks[i].replacement); 1032 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1033 rdev = rcu_dereference(conf->disks[i].rdev); 1034 if (!rdev) { 1035 rdev = rrdev; 1036 rrdev = NULL; 1037 } 1038 if (op_is_write(op)) { 1039 if (replace_only) 1040 rdev = NULL; 1041 if (rdev == rrdev) 1042 /* We raced and saw duplicates */ 1043 rrdev = NULL; 1044 } else { 1045 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1046 rdev = rrdev; 1047 rrdev = NULL; 1048 } 1049 1050 if (rdev && test_bit(Faulty, &rdev->flags)) 1051 rdev = NULL; 1052 if (rdev) 1053 atomic_inc(&rdev->nr_pending); 1054 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1055 rrdev = NULL; 1056 if (rrdev) 1057 atomic_inc(&rrdev->nr_pending); 1058 rcu_read_unlock(); 1059 1060 /* We have already checked bad blocks for reads. Now 1061 * need to check for writes. We never accept write errors 1062 * on the replacement, so we don't to check rrdev. 1063 */ 1064 while (op_is_write(op) && rdev && 1065 test_bit(WriteErrorSeen, &rdev->flags)) { 1066 sector_t first_bad; 1067 int bad_sectors; 1068 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 1069 &first_bad, &bad_sectors); 1070 if (!bad) 1071 break; 1072 1073 if (bad < 0) { 1074 set_bit(BlockedBadBlocks, &rdev->flags); 1075 if (!conf->mddev->external && 1076 conf->mddev->sb_flags) { 1077 /* It is very unlikely, but we might 1078 * still need to write out the 1079 * bad block log - better give it 1080 * a chance*/ 1081 md_check_recovery(conf->mddev); 1082 } 1083 /* 1084 * Because md_wait_for_blocked_rdev 1085 * will dec nr_pending, we must 1086 * increment it first. 1087 */ 1088 atomic_inc(&rdev->nr_pending); 1089 md_wait_for_blocked_rdev(rdev, conf->mddev); 1090 } else { 1091 /* Acknowledged bad block - skip the write */ 1092 rdev_dec_pending(rdev, conf->mddev); 1093 rdev = NULL; 1094 } 1095 } 1096 1097 if (rdev) { 1098 if (s->syncing || s->expanding || s->expanded 1099 || s->replacing) 1100 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1101 1102 set_bit(STRIPE_IO_STARTED, &sh->state); 1103 1104 bio_set_dev(bi, rdev->bdev); 1105 bio_set_op_attrs(bi, op, op_flags); 1106 bi->bi_end_io = op_is_write(op) 1107 ? raid5_end_write_request 1108 : raid5_end_read_request; 1109 bi->bi_private = sh; 1110 1111 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1112 __func__, (unsigned long long)sh->sector, 1113 bi->bi_opf, i); 1114 atomic_inc(&sh->count); 1115 if (sh != head_sh) 1116 atomic_inc(&head_sh->count); 1117 if (use_new_offset(conf, sh)) 1118 bi->bi_iter.bi_sector = (sh->sector 1119 + rdev->new_data_offset); 1120 else 1121 bi->bi_iter.bi_sector = (sh->sector 1122 + rdev->data_offset); 1123 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1124 bi->bi_opf |= REQ_NOMERGE; 1125 1126 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1127 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1128 1129 if (!op_is_write(op) && 1130 test_bit(R5_InJournal, &sh->dev[i].flags)) 1131 /* 1132 * issuing read for a page in journal, this 1133 * must be preparing for prexor in rmw; read 1134 * the data into orig_page 1135 */ 1136 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1137 else 1138 sh->dev[i].vec.bv_page = sh->dev[i].page; 1139 bi->bi_vcnt = 1; 1140 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1141 bi->bi_io_vec[0].bv_offset = 0; 1142 bi->bi_iter.bi_size = STRIPE_SIZE; 1143 /* 1144 * If this is discard request, set bi_vcnt 0. We don't 1145 * want to confuse SCSI because SCSI will replace payload 1146 */ 1147 if (op == REQ_OP_DISCARD) 1148 bi->bi_vcnt = 0; 1149 if (rrdev) 1150 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1151 1152 if (conf->mddev->gendisk) 1153 trace_block_bio_remap(bi->bi_disk->queue, 1154 bi, disk_devt(conf->mddev->gendisk), 1155 sh->dev[i].sector); 1156 if (should_defer && op_is_write(op)) 1157 bio_list_add(&pending_bios, bi); 1158 else 1159 generic_make_request(bi); 1160 } 1161 if (rrdev) { 1162 if (s->syncing || s->expanding || s->expanded 1163 || s->replacing) 1164 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1165 1166 set_bit(STRIPE_IO_STARTED, &sh->state); 1167 1168 bio_set_dev(rbi, rrdev->bdev); 1169 bio_set_op_attrs(rbi, op, op_flags); 1170 BUG_ON(!op_is_write(op)); 1171 rbi->bi_end_io = raid5_end_write_request; 1172 rbi->bi_private = sh; 1173 1174 pr_debug("%s: for %llu schedule op %d on " 1175 "replacement disc %d\n", 1176 __func__, (unsigned long long)sh->sector, 1177 rbi->bi_opf, i); 1178 atomic_inc(&sh->count); 1179 if (sh != head_sh) 1180 atomic_inc(&head_sh->count); 1181 if (use_new_offset(conf, sh)) 1182 rbi->bi_iter.bi_sector = (sh->sector 1183 + rrdev->new_data_offset); 1184 else 1185 rbi->bi_iter.bi_sector = (sh->sector 1186 + rrdev->data_offset); 1187 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1188 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1189 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1190 rbi->bi_vcnt = 1; 1191 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1192 rbi->bi_io_vec[0].bv_offset = 0; 1193 rbi->bi_iter.bi_size = STRIPE_SIZE; 1194 /* 1195 * If this is discard request, set bi_vcnt 0. We don't 1196 * want to confuse SCSI because SCSI will replace payload 1197 */ 1198 if (op == REQ_OP_DISCARD) 1199 rbi->bi_vcnt = 0; 1200 if (conf->mddev->gendisk) 1201 trace_block_bio_remap(rbi->bi_disk->queue, 1202 rbi, disk_devt(conf->mddev->gendisk), 1203 sh->dev[i].sector); 1204 if (should_defer && op_is_write(op)) 1205 bio_list_add(&pending_bios, rbi); 1206 else 1207 generic_make_request(rbi); 1208 } 1209 if (!rdev && !rrdev) { 1210 if (op_is_write(op)) 1211 set_bit(STRIPE_DEGRADED, &sh->state); 1212 pr_debug("skip op %d on disc %d for sector %llu\n", 1213 bi->bi_opf, i, (unsigned long long)sh->sector); 1214 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1215 set_bit(STRIPE_HANDLE, &sh->state); 1216 } 1217 1218 if (!head_sh->batch_head) 1219 continue; 1220 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1221 batch_list); 1222 if (sh != head_sh) 1223 goto again; 1224 } 1225 1226 if (should_defer && !bio_list_empty(&pending_bios)) 1227 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1228 } 1229 1230 static struct dma_async_tx_descriptor * 1231 async_copy_data(int frombio, struct bio *bio, struct page **page, 1232 sector_t sector, struct dma_async_tx_descriptor *tx, 1233 struct stripe_head *sh, int no_skipcopy) 1234 { 1235 struct bio_vec bvl; 1236 struct bvec_iter iter; 1237 struct page *bio_page; 1238 int page_offset; 1239 struct async_submit_ctl submit; 1240 enum async_tx_flags flags = 0; 1241 1242 if (bio->bi_iter.bi_sector >= sector) 1243 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1244 else 1245 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1246 1247 if (frombio) 1248 flags |= ASYNC_TX_FENCE; 1249 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1250 1251 bio_for_each_segment(bvl, bio, iter) { 1252 int len = bvl.bv_len; 1253 int clen; 1254 int b_offset = 0; 1255 1256 if (page_offset < 0) { 1257 b_offset = -page_offset; 1258 page_offset += b_offset; 1259 len -= b_offset; 1260 } 1261 1262 if (len > 0 && page_offset + len > STRIPE_SIZE) 1263 clen = STRIPE_SIZE - page_offset; 1264 else 1265 clen = len; 1266 1267 if (clen > 0) { 1268 b_offset += bvl.bv_offset; 1269 bio_page = bvl.bv_page; 1270 if (frombio) { 1271 if (sh->raid_conf->skip_copy && 1272 b_offset == 0 && page_offset == 0 && 1273 clen == STRIPE_SIZE && 1274 !no_skipcopy) 1275 *page = bio_page; 1276 else 1277 tx = async_memcpy(*page, bio_page, page_offset, 1278 b_offset, clen, &submit); 1279 } else 1280 tx = async_memcpy(bio_page, *page, b_offset, 1281 page_offset, clen, &submit); 1282 } 1283 /* chain the operations */ 1284 submit.depend_tx = tx; 1285 1286 if (clen < len) /* hit end of page */ 1287 break; 1288 page_offset += len; 1289 } 1290 1291 return tx; 1292 } 1293 1294 static void ops_complete_biofill(void *stripe_head_ref) 1295 { 1296 struct stripe_head *sh = stripe_head_ref; 1297 int i; 1298 1299 pr_debug("%s: stripe %llu\n", __func__, 1300 (unsigned long long)sh->sector); 1301 1302 /* clear completed biofills */ 1303 for (i = sh->disks; i--; ) { 1304 struct r5dev *dev = &sh->dev[i]; 1305 1306 /* acknowledge completion of a biofill operation */ 1307 /* and check if we need to reply to a read request, 1308 * new R5_Wantfill requests are held off until 1309 * !STRIPE_BIOFILL_RUN 1310 */ 1311 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1312 struct bio *rbi, *rbi2; 1313 1314 BUG_ON(!dev->read); 1315 rbi = dev->read; 1316 dev->read = NULL; 1317 while (rbi && rbi->bi_iter.bi_sector < 1318 dev->sector + STRIPE_SECTORS) { 1319 rbi2 = r5_next_bio(rbi, dev->sector); 1320 bio_endio(rbi); 1321 rbi = rbi2; 1322 } 1323 } 1324 } 1325 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1326 1327 set_bit(STRIPE_HANDLE, &sh->state); 1328 raid5_release_stripe(sh); 1329 } 1330 1331 static void ops_run_biofill(struct stripe_head *sh) 1332 { 1333 struct dma_async_tx_descriptor *tx = NULL; 1334 struct async_submit_ctl submit; 1335 int i; 1336 1337 BUG_ON(sh->batch_head); 1338 pr_debug("%s: stripe %llu\n", __func__, 1339 (unsigned long long)sh->sector); 1340 1341 for (i = sh->disks; i--; ) { 1342 struct r5dev *dev = &sh->dev[i]; 1343 if (test_bit(R5_Wantfill, &dev->flags)) { 1344 struct bio *rbi; 1345 spin_lock_irq(&sh->stripe_lock); 1346 dev->read = rbi = dev->toread; 1347 dev->toread = NULL; 1348 spin_unlock_irq(&sh->stripe_lock); 1349 while (rbi && rbi->bi_iter.bi_sector < 1350 dev->sector + STRIPE_SECTORS) { 1351 tx = async_copy_data(0, rbi, &dev->page, 1352 dev->sector, tx, sh, 0); 1353 rbi = r5_next_bio(rbi, dev->sector); 1354 } 1355 } 1356 } 1357 1358 atomic_inc(&sh->count); 1359 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1360 async_trigger_callback(&submit); 1361 } 1362 1363 static void mark_target_uptodate(struct stripe_head *sh, int target) 1364 { 1365 struct r5dev *tgt; 1366 1367 if (target < 0) 1368 return; 1369 1370 tgt = &sh->dev[target]; 1371 set_bit(R5_UPTODATE, &tgt->flags); 1372 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1373 clear_bit(R5_Wantcompute, &tgt->flags); 1374 } 1375 1376 static void ops_complete_compute(void *stripe_head_ref) 1377 { 1378 struct stripe_head *sh = stripe_head_ref; 1379 1380 pr_debug("%s: stripe %llu\n", __func__, 1381 (unsigned long long)sh->sector); 1382 1383 /* mark the computed target(s) as uptodate */ 1384 mark_target_uptodate(sh, sh->ops.target); 1385 mark_target_uptodate(sh, sh->ops.target2); 1386 1387 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1388 if (sh->check_state == check_state_compute_run) 1389 sh->check_state = check_state_compute_result; 1390 set_bit(STRIPE_HANDLE, &sh->state); 1391 raid5_release_stripe(sh); 1392 } 1393 1394 /* return a pointer to the address conversion region of the scribble buffer */ 1395 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1396 struct raid5_percpu *percpu, int i) 1397 { 1398 void *addr; 1399 1400 addr = flex_array_get(percpu->scribble, i); 1401 return addr + sizeof(struct page *) * (sh->disks + 2); 1402 } 1403 1404 /* return a pointer to the address conversion region of the scribble buffer */ 1405 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1406 { 1407 void *addr; 1408 1409 addr = flex_array_get(percpu->scribble, i); 1410 return addr; 1411 } 1412 1413 static struct dma_async_tx_descriptor * 1414 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1415 { 1416 int disks = sh->disks; 1417 struct page **xor_srcs = to_addr_page(percpu, 0); 1418 int target = sh->ops.target; 1419 struct r5dev *tgt = &sh->dev[target]; 1420 struct page *xor_dest = tgt->page; 1421 int count = 0; 1422 struct dma_async_tx_descriptor *tx; 1423 struct async_submit_ctl submit; 1424 int i; 1425 1426 BUG_ON(sh->batch_head); 1427 1428 pr_debug("%s: stripe %llu block: %d\n", 1429 __func__, (unsigned long long)sh->sector, target); 1430 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1431 1432 for (i = disks; i--; ) 1433 if (i != target) 1434 xor_srcs[count++] = sh->dev[i].page; 1435 1436 atomic_inc(&sh->count); 1437 1438 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1439 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1440 if (unlikely(count == 1)) 1441 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1442 else 1443 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1444 1445 return tx; 1446 } 1447 1448 /* set_syndrome_sources - populate source buffers for gen_syndrome 1449 * @srcs - (struct page *) array of size sh->disks 1450 * @sh - stripe_head to parse 1451 * 1452 * Populates srcs in proper layout order for the stripe and returns the 1453 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1454 * destination buffer is recorded in srcs[count] and the Q destination 1455 * is recorded in srcs[count+1]]. 1456 */ 1457 static int set_syndrome_sources(struct page **srcs, 1458 struct stripe_head *sh, 1459 int srctype) 1460 { 1461 int disks = sh->disks; 1462 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1463 int d0_idx = raid6_d0(sh); 1464 int count; 1465 int i; 1466 1467 for (i = 0; i < disks; i++) 1468 srcs[i] = NULL; 1469 1470 count = 0; 1471 i = d0_idx; 1472 do { 1473 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1474 struct r5dev *dev = &sh->dev[i]; 1475 1476 if (i == sh->qd_idx || i == sh->pd_idx || 1477 (srctype == SYNDROME_SRC_ALL) || 1478 (srctype == SYNDROME_SRC_WANT_DRAIN && 1479 (test_bit(R5_Wantdrain, &dev->flags) || 1480 test_bit(R5_InJournal, &dev->flags))) || 1481 (srctype == SYNDROME_SRC_WRITTEN && 1482 (dev->written || 1483 test_bit(R5_InJournal, &dev->flags)))) { 1484 if (test_bit(R5_InJournal, &dev->flags)) 1485 srcs[slot] = sh->dev[i].orig_page; 1486 else 1487 srcs[slot] = sh->dev[i].page; 1488 } 1489 i = raid6_next_disk(i, disks); 1490 } while (i != d0_idx); 1491 1492 return syndrome_disks; 1493 } 1494 1495 static struct dma_async_tx_descriptor * 1496 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1497 { 1498 int disks = sh->disks; 1499 struct page **blocks = to_addr_page(percpu, 0); 1500 int target; 1501 int qd_idx = sh->qd_idx; 1502 struct dma_async_tx_descriptor *tx; 1503 struct async_submit_ctl submit; 1504 struct r5dev *tgt; 1505 struct page *dest; 1506 int i; 1507 int count; 1508 1509 BUG_ON(sh->batch_head); 1510 if (sh->ops.target < 0) 1511 target = sh->ops.target2; 1512 else if (sh->ops.target2 < 0) 1513 target = sh->ops.target; 1514 else 1515 /* we should only have one valid target */ 1516 BUG(); 1517 BUG_ON(target < 0); 1518 pr_debug("%s: stripe %llu block: %d\n", 1519 __func__, (unsigned long long)sh->sector, target); 1520 1521 tgt = &sh->dev[target]; 1522 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1523 dest = tgt->page; 1524 1525 atomic_inc(&sh->count); 1526 1527 if (target == qd_idx) { 1528 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1529 blocks[count] = NULL; /* regenerating p is not necessary */ 1530 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1531 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1532 ops_complete_compute, sh, 1533 to_addr_conv(sh, percpu, 0)); 1534 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1535 } else { 1536 /* Compute any data- or p-drive using XOR */ 1537 count = 0; 1538 for (i = disks; i-- ; ) { 1539 if (i == target || i == qd_idx) 1540 continue; 1541 blocks[count++] = sh->dev[i].page; 1542 } 1543 1544 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1545 NULL, ops_complete_compute, sh, 1546 to_addr_conv(sh, percpu, 0)); 1547 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1548 } 1549 1550 return tx; 1551 } 1552 1553 static struct dma_async_tx_descriptor * 1554 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1555 { 1556 int i, count, disks = sh->disks; 1557 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1558 int d0_idx = raid6_d0(sh); 1559 int faila = -1, failb = -1; 1560 int target = sh->ops.target; 1561 int target2 = sh->ops.target2; 1562 struct r5dev *tgt = &sh->dev[target]; 1563 struct r5dev *tgt2 = &sh->dev[target2]; 1564 struct dma_async_tx_descriptor *tx; 1565 struct page **blocks = to_addr_page(percpu, 0); 1566 struct async_submit_ctl submit; 1567 1568 BUG_ON(sh->batch_head); 1569 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1570 __func__, (unsigned long long)sh->sector, target, target2); 1571 BUG_ON(target < 0 || target2 < 0); 1572 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1573 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1574 1575 /* we need to open-code set_syndrome_sources to handle the 1576 * slot number conversion for 'faila' and 'failb' 1577 */ 1578 for (i = 0; i < disks ; i++) 1579 blocks[i] = NULL; 1580 count = 0; 1581 i = d0_idx; 1582 do { 1583 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1584 1585 blocks[slot] = sh->dev[i].page; 1586 1587 if (i == target) 1588 faila = slot; 1589 if (i == target2) 1590 failb = slot; 1591 i = raid6_next_disk(i, disks); 1592 } while (i != d0_idx); 1593 1594 BUG_ON(faila == failb); 1595 if (failb < faila) 1596 swap(faila, failb); 1597 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1598 __func__, (unsigned long long)sh->sector, faila, failb); 1599 1600 atomic_inc(&sh->count); 1601 1602 if (failb == syndrome_disks+1) { 1603 /* Q disk is one of the missing disks */ 1604 if (faila == syndrome_disks) { 1605 /* Missing P+Q, just recompute */ 1606 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1607 ops_complete_compute, sh, 1608 to_addr_conv(sh, percpu, 0)); 1609 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1610 STRIPE_SIZE, &submit); 1611 } else { 1612 struct page *dest; 1613 int data_target; 1614 int qd_idx = sh->qd_idx; 1615 1616 /* Missing D+Q: recompute D from P, then recompute Q */ 1617 if (target == qd_idx) 1618 data_target = target2; 1619 else 1620 data_target = target; 1621 1622 count = 0; 1623 for (i = disks; i-- ; ) { 1624 if (i == data_target || i == qd_idx) 1625 continue; 1626 blocks[count++] = sh->dev[i].page; 1627 } 1628 dest = sh->dev[data_target].page; 1629 init_async_submit(&submit, 1630 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1631 NULL, NULL, NULL, 1632 to_addr_conv(sh, percpu, 0)); 1633 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1634 &submit); 1635 1636 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1637 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1638 ops_complete_compute, sh, 1639 to_addr_conv(sh, percpu, 0)); 1640 return async_gen_syndrome(blocks, 0, count+2, 1641 STRIPE_SIZE, &submit); 1642 } 1643 } else { 1644 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1645 ops_complete_compute, sh, 1646 to_addr_conv(sh, percpu, 0)); 1647 if (failb == syndrome_disks) { 1648 /* We're missing D+P. */ 1649 return async_raid6_datap_recov(syndrome_disks+2, 1650 STRIPE_SIZE, faila, 1651 blocks, &submit); 1652 } else { 1653 /* We're missing D+D. */ 1654 return async_raid6_2data_recov(syndrome_disks+2, 1655 STRIPE_SIZE, faila, failb, 1656 blocks, &submit); 1657 } 1658 } 1659 } 1660 1661 static void ops_complete_prexor(void *stripe_head_ref) 1662 { 1663 struct stripe_head *sh = stripe_head_ref; 1664 1665 pr_debug("%s: stripe %llu\n", __func__, 1666 (unsigned long long)sh->sector); 1667 1668 if (r5c_is_writeback(sh->raid_conf->log)) 1669 /* 1670 * raid5-cache write back uses orig_page during prexor. 1671 * After prexor, it is time to free orig_page 1672 */ 1673 r5c_release_extra_page(sh); 1674 } 1675 1676 static struct dma_async_tx_descriptor * 1677 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1678 struct dma_async_tx_descriptor *tx) 1679 { 1680 int disks = sh->disks; 1681 struct page **xor_srcs = to_addr_page(percpu, 0); 1682 int count = 0, pd_idx = sh->pd_idx, i; 1683 struct async_submit_ctl submit; 1684 1685 /* existing parity data subtracted */ 1686 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1687 1688 BUG_ON(sh->batch_head); 1689 pr_debug("%s: stripe %llu\n", __func__, 1690 (unsigned long long)sh->sector); 1691 1692 for (i = disks; i--; ) { 1693 struct r5dev *dev = &sh->dev[i]; 1694 /* Only process blocks that are known to be uptodate */ 1695 if (test_bit(R5_InJournal, &dev->flags)) 1696 xor_srcs[count++] = dev->orig_page; 1697 else if (test_bit(R5_Wantdrain, &dev->flags)) 1698 xor_srcs[count++] = dev->page; 1699 } 1700 1701 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1702 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1703 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1704 1705 return tx; 1706 } 1707 1708 static struct dma_async_tx_descriptor * 1709 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1710 struct dma_async_tx_descriptor *tx) 1711 { 1712 struct page **blocks = to_addr_page(percpu, 0); 1713 int count; 1714 struct async_submit_ctl submit; 1715 1716 pr_debug("%s: stripe %llu\n", __func__, 1717 (unsigned long long)sh->sector); 1718 1719 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1720 1721 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1722 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1723 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1724 1725 return tx; 1726 } 1727 1728 static struct dma_async_tx_descriptor * 1729 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1730 { 1731 struct r5conf *conf = sh->raid_conf; 1732 int disks = sh->disks; 1733 int i; 1734 struct stripe_head *head_sh = sh; 1735 1736 pr_debug("%s: stripe %llu\n", __func__, 1737 (unsigned long long)sh->sector); 1738 1739 for (i = disks; i--; ) { 1740 struct r5dev *dev; 1741 struct bio *chosen; 1742 1743 sh = head_sh; 1744 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1745 struct bio *wbi; 1746 1747 again: 1748 dev = &sh->dev[i]; 1749 /* 1750 * clear R5_InJournal, so when rewriting a page in 1751 * journal, it is not skipped by r5l_log_stripe() 1752 */ 1753 clear_bit(R5_InJournal, &dev->flags); 1754 spin_lock_irq(&sh->stripe_lock); 1755 chosen = dev->towrite; 1756 dev->towrite = NULL; 1757 sh->overwrite_disks = 0; 1758 BUG_ON(dev->written); 1759 wbi = dev->written = chosen; 1760 spin_unlock_irq(&sh->stripe_lock); 1761 WARN_ON(dev->page != dev->orig_page); 1762 1763 while (wbi && wbi->bi_iter.bi_sector < 1764 dev->sector + STRIPE_SECTORS) { 1765 if (wbi->bi_opf & REQ_FUA) 1766 set_bit(R5_WantFUA, &dev->flags); 1767 if (wbi->bi_opf & REQ_SYNC) 1768 set_bit(R5_SyncIO, &dev->flags); 1769 if (bio_op(wbi) == REQ_OP_DISCARD) 1770 set_bit(R5_Discard, &dev->flags); 1771 else { 1772 tx = async_copy_data(1, wbi, &dev->page, 1773 dev->sector, tx, sh, 1774 r5c_is_writeback(conf->log)); 1775 if (dev->page != dev->orig_page && 1776 !r5c_is_writeback(conf->log)) { 1777 set_bit(R5_SkipCopy, &dev->flags); 1778 clear_bit(R5_UPTODATE, &dev->flags); 1779 clear_bit(R5_OVERWRITE, &dev->flags); 1780 } 1781 } 1782 wbi = r5_next_bio(wbi, dev->sector); 1783 } 1784 1785 if (head_sh->batch_head) { 1786 sh = list_first_entry(&sh->batch_list, 1787 struct stripe_head, 1788 batch_list); 1789 if (sh == head_sh) 1790 continue; 1791 goto again; 1792 } 1793 } 1794 } 1795 1796 return tx; 1797 } 1798 1799 static void ops_complete_reconstruct(void *stripe_head_ref) 1800 { 1801 struct stripe_head *sh = stripe_head_ref; 1802 int disks = sh->disks; 1803 int pd_idx = sh->pd_idx; 1804 int qd_idx = sh->qd_idx; 1805 int i; 1806 bool fua = false, sync = false, discard = false; 1807 1808 pr_debug("%s: stripe %llu\n", __func__, 1809 (unsigned long long)sh->sector); 1810 1811 for (i = disks; i--; ) { 1812 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1813 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1814 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1815 } 1816 1817 for (i = disks; i--; ) { 1818 struct r5dev *dev = &sh->dev[i]; 1819 1820 if (dev->written || i == pd_idx || i == qd_idx) { 1821 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1822 set_bit(R5_UPTODATE, &dev->flags); 1823 if (fua) 1824 set_bit(R5_WantFUA, &dev->flags); 1825 if (sync) 1826 set_bit(R5_SyncIO, &dev->flags); 1827 } 1828 } 1829 1830 if (sh->reconstruct_state == reconstruct_state_drain_run) 1831 sh->reconstruct_state = reconstruct_state_drain_result; 1832 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1833 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1834 else { 1835 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1836 sh->reconstruct_state = reconstruct_state_result; 1837 } 1838 1839 set_bit(STRIPE_HANDLE, &sh->state); 1840 raid5_release_stripe(sh); 1841 } 1842 1843 static void 1844 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1845 struct dma_async_tx_descriptor *tx) 1846 { 1847 int disks = sh->disks; 1848 struct page **xor_srcs; 1849 struct async_submit_ctl submit; 1850 int count, pd_idx = sh->pd_idx, i; 1851 struct page *xor_dest; 1852 int prexor = 0; 1853 unsigned long flags; 1854 int j = 0; 1855 struct stripe_head *head_sh = sh; 1856 int last_stripe; 1857 1858 pr_debug("%s: stripe %llu\n", __func__, 1859 (unsigned long long)sh->sector); 1860 1861 for (i = 0; i < sh->disks; i++) { 1862 if (pd_idx == i) 1863 continue; 1864 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1865 break; 1866 } 1867 if (i >= sh->disks) { 1868 atomic_inc(&sh->count); 1869 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1870 ops_complete_reconstruct(sh); 1871 return; 1872 } 1873 again: 1874 count = 0; 1875 xor_srcs = to_addr_page(percpu, j); 1876 /* check if prexor is active which means only process blocks 1877 * that are part of a read-modify-write (written) 1878 */ 1879 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1880 prexor = 1; 1881 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1882 for (i = disks; i--; ) { 1883 struct r5dev *dev = &sh->dev[i]; 1884 if (head_sh->dev[i].written || 1885 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1886 xor_srcs[count++] = dev->page; 1887 } 1888 } else { 1889 xor_dest = sh->dev[pd_idx].page; 1890 for (i = disks; i--; ) { 1891 struct r5dev *dev = &sh->dev[i]; 1892 if (i != pd_idx) 1893 xor_srcs[count++] = dev->page; 1894 } 1895 } 1896 1897 /* 1/ if we prexor'd then the dest is reused as a source 1898 * 2/ if we did not prexor then we are redoing the parity 1899 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1900 * for the synchronous xor case 1901 */ 1902 last_stripe = !head_sh->batch_head || 1903 list_first_entry(&sh->batch_list, 1904 struct stripe_head, batch_list) == head_sh; 1905 if (last_stripe) { 1906 flags = ASYNC_TX_ACK | 1907 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1908 1909 atomic_inc(&head_sh->count); 1910 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1911 to_addr_conv(sh, percpu, j)); 1912 } else { 1913 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1914 init_async_submit(&submit, flags, tx, NULL, NULL, 1915 to_addr_conv(sh, percpu, j)); 1916 } 1917 1918 if (unlikely(count == 1)) 1919 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1920 else 1921 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1922 if (!last_stripe) { 1923 j++; 1924 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1925 batch_list); 1926 goto again; 1927 } 1928 } 1929 1930 static void 1931 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1932 struct dma_async_tx_descriptor *tx) 1933 { 1934 struct async_submit_ctl submit; 1935 struct page **blocks; 1936 int count, i, j = 0; 1937 struct stripe_head *head_sh = sh; 1938 int last_stripe; 1939 int synflags; 1940 unsigned long txflags; 1941 1942 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1943 1944 for (i = 0; i < sh->disks; i++) { 1945 if (sh->pd_idx == i || sh->qd_idx == i) 1946 continue; 1947 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1948 break; 1949 } 1950 if (i >= sh->disks) { 1951 atomic_inc(&sh->count); 1952 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1953 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1954 ops_complete_reconstruct(sh); 1955 return; 1956 } 1957 1958 again: 1959 blocks = to_addr_page(percpu, j); 1960 1961 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1962 synflags = SYNDROME_SRC_WRITTEN; 1963 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1964 } else { 1965 synflags = SYNDROME_SRC_ALL; 1966 txflags = ASYNC_TX_ACK; 1967 } 1968 1969 count = set_syndrome_sources(blocks, sh, synflags); 1970 last_stripe = !head_sh->batch_head || 1971 list_first_entry(&sh->batch_list, 1972 struct stripe_head, batch_list) == head_sh; 1973 1974 if (last_stripe) { 1975 atomic_inc(&head_sh->count); 1976 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1977 head_sh, to_addr_conv(sh, percpu, j)); 1978 } else 1979 init_async_submit(&submit, 0, tx, NULL, NULL, 1980 to_addr_conv(sh, percpu, j)); 1981 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1982 if (!last_stripe) { 1983 j++; 1984 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1985 batch_list); 1986 goto again; 1987 } 1988 } 1989 1990 static void ops_complete_check(void *stripe_head_ref) 1991 { 1992 struct stripe_head *sh = stripe_head_ref; 1993 1994 pr_debug("%s: stripe %llu\n", __func__, 1995 (unsigned long long)sh->sector); 1996 1997 sh->check_state = check_state_check_result; 1998 set_bit(STRIPE_HANDLE, &sh->state); 1999 raid5_release_stripe(sh); 2000 } 2001 2002 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2003 { 2004 int disks = sh->disks; 2005 int pd_idx = sh->pd_idx; 2006 int qd_idx = sh->qd_idx; 2007 struct page *xor_dest; 2008 struct page **xor_srcs = to_addr_page(percpu, 0); 2009 struct dma_async_tx_descriptor *tx; 2010 struct async_submit_ctl submit; 2011 int count; 2012 int i; 2013 2014 pr_debug("%s: stripe %llu\n", __func__, 2015 (unsigned long long)sh->sector); 2016 2017 BUG_ON(sh->batch_head); 2018 count = 0; 2019 xor_dest = sh->dev[pd_idx].page; 2020 xor_srcs[count++] = xor_dest; 2021 for (i = disks; i--; ) { 2022 if (i == pd_idx || i == qd_idx) 2023 continue; 2024 xor_srcs[count++] = sh->dev[i].page; 2025 } 2026 2027 init_async_submit(&submit, 0, NULL, NULL, NULL, 2028 to_addr_conv(sh, percpu, 0)); 2029 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 2030 &sh->ops.zero_sum_result, &submit); 2031 2032 atomic_inc(&sh->count); 2033 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2034 tx = async_trigger_callback(&submit); 2035 } 2036 2037 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2038 { 2039 struct page **srcs = to_addr_page(percpu, 0); 2040 struct async_submit_ctl submit; 2041 int count; 2042 2043 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2044 (unsigned long long)sh->sector, checkp); 2045 2046 BUG_ON(sh->batch_head); 2047 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 2048 if (!checkp) 2049 srcs[count] = NULL; 2050 2051 atomic_inc(&sh->count); 2052 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2053 sh, to_addr_conv(sh, percpu, 0)); 2054 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 2055 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 2056 } 2057 2058 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2059 { 2060 int overlap_clear = 0, i, disks = sh->disks; 2061 struct dma_async_tx_descriptor *tx = NULL; 2062 struct r5conf *conf = sh->raid_conf; 2063 int level = conf->level; 2064 struct raid5_percpu *percpu; 2065 unsigned long cpu; 2066 2067 cpu = get_cpu(); 2068 percpu = per_cpu_ptr(conf->percpu, cpu); 2069 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2070 ops_run_biofill(sh); 2071 overlap_clear++; 2072 } 2073 2074 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2075 if (level < 6) 2076 tx = ops_run_compute5(sh, percpu); 2077 else { 2078 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2079 tx = ops_run_compute6_1(sh, percpu); 2080 else 2081 tx = ops_run_compute6_2(sh, percpu); 2082 } 2083 /* terminate the chain if reconstruct is not set to be run */ 2084 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2085 async_tx_ack(tx); 2086 } 2087 2088 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2089 if (level < 6) 2090 tx = ops_run_prexor5(sh, percpu, tx); 2091 else 2092 tx = ops_run_prexor6(sh, percpu, tx); 2093 } 2094 2095 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2096 tx = ops_run_partial_parity(sh, percpu, tx); 2097 2098 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2099 tx = ops_run_biodrain(sh, tx); 2100 overlap_clear++; 2101 } 2102 2103 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2104 if (level < 6) 2105 ops_run_reconstruct5(sh, percpu, tx); 2106 else 2107 ops_run_reconstruct6(sh, percpu, tx); 2108 } 2109 2110 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2111 if (sh->check_state == check_state_run) 2112 ops_run_check_p(sh, percpu); 2113 else if (sh->check_state == check_state_run_q) 2114 ops_run_check_pq(sh, percpu, 0); 2115 else if (sh->check_state == check_state_run_pq) 2116 ops_run_check_pq(sh, percpu, 1); 2117 else 2118 BUG(); 2119 } 2120 2121 if (overlap_clear && !sh->batch_head) 2122 for (i = disks; i--; ) { 2123 struct r5dev *dev = &sh->dev[i]; 2124 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2125 wake_up(&sh->raid_conf->wait_for_overlap); 2126 } 2127 put_cpu(); 2128 } 2129 2130 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2131 { 2132 if (sh->ppl_page) 2133 __free_page(sh->ppl_page); 2134 kmem_cache_free(sc, sh); 2135 } 2136 2137 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2138 int disks, struct r5conf *conf) 2139 { 2140 struct stripe_head *sh; 2141 int i; 2142 2143 sh = kmem_cache_zalloc(sc, gfp); 2144 if (sh) { 2145 spin_lock_init(&sh->stripe_lock); 2146 spin_lock_init(&sh->batch_lock); 2147 INIT_LIST_HEAD(&sh->batch_list); 2148 INIT_LIST_HEAD(&sh->lru); 2149 INIT_LIST_HEAD(&sh->r5c); 2150 INIT_LIST_HEAD(&sh->log_list); 2151 atomic_set(&sh->count, 1); 2152 sh->raid_conf = conf; 2153 sh->log_start = MaxSector; 2154 for (i = 0; i < disks; i++) { 2155 struct r5dev *dev = &sh->dev[i]; 2156 2157 bio_init(&dev->req, &dev->vec, 1); 2158 bio_init(&dev->rreq, &dev->rvec, 1); 2159 } 2160 2161 if (raid5_has_ppl(conf)) { 2162 sh->ppl_page = alloc_page(gfp); 2163 if (!sh->ppl_page) { 2164 free_stripe(sc, sh); 2165 sh = NULL; 2166 } 2167 } 2168 } 2169 return sh; 2170 } 2171 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2172 { 2173 struct stripe_head *sh; 2174 2175 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); 2176 if (!sh) 2177 return 0; 2178 2179 if (grow_buffers(sh, gfp)) { 2180 shrink_buffers(sh); 2181 free_stripe(conf->slab_cache, sh); 2182 return 0; 2183 } 2184 sh->hash_lock_index = 2185 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2186 /* we just created an active stripe so... */ 2187 atomic_inc(&conf->active_stripes); 2188 2189 raid5_release_stripe(sh); 2190 conf->max_nr_stripes++; 2191 return 1; 2192 } 2193 2194 static int grow_stripes(struct r5conf *conf, int num) 2195 { 2196 struct kmem_cache *sc; 2197 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2198 2199 if (conf->mddev->gendisk) 2200 sprintf(conf->cache_name[0], 2201 "raid%d-%s", conf->level, mdname(conf->mddev)); 2202 else 2203 sprintf(conf->cache_name[0], 2204 "raid%d-%p", conf->level, conf->mddev); 2205 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2206 2207 conf->active_name = 0; 2208 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2209 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2210 0, 0, NULL); 2211 if (!sc) 2212 return 1; 2213 conf->slab_cache = sc; 2214 conf->pool_size = devs; 2215 while (num--) 2216 if (!grow_one_stripe(conf, GFP_KERNEL)) 2217 return 1; 2218 2219 return 0; 2220 } 2221 2222 /** 2223 * scribble_len - return the required size of the scribble region 2224 * @num - total number of disks in the array 2225 * 2226 * The size must be enough to contain: 2227 * 1/ a struct page pointer for each device in the array +2 2228 * 2/ room to convert each entry in (1) to its corresponding dma 2229 * (dma_map_page()) or page (page_address()) address. 2230 * 2231 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2232 * calculate over all devices (not just the data blocks), using zeros in place 2233 * of the P and Q blocks. 2234 */ 2235 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2236 { 2237 struct flex_array *ret; 2238 size_t len; 2239 2240 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2241 ret = flex_array_alloc(len, cnt, flags); 2242 if (!ret) 2243 return NULL; 2244 /* always prealloc all elements, so no locking is required */ 2245 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2246 flex_array_free(ret); 2247 return NULL; 2248 } 2249 return ret; 2250 } 2251 2252 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2253 { 2254 unsigned long cpu; 2255 int err = 0; 2256 2257 /* 2258 * Never shrink. And mddev_suspend() could deadlock if this is called 2259 * from raid5d. In that case, scribble_disks and scribble_sectors 2260 * should equal to new_disks and new_sectors 2261 */ 2262 if (conf->scribble_disks >= new_disks && 2263 conf->scribble_sectors >= new_sectors) 2264 return 0; 2265 mddev_suspend(conf->mddev); 2266 get_online_cpus(); 2267 for_each_present_cpu(cpu) { 2268 struct raid5_percpu *percpu; 2269 struct flex_array *scribble; 2270 2271 percpu = per_cpu_ptr(conf->percpu, cpu); 2272 scribble = scribble_alloc(new_disks, 2273 new_sectors / STRIPE_SECTORS, 2274 GFP_NOIO); 2275 2276 if (scribble) { 2277 flex_array_free(percpu->scribble); 2278 percpu->scribble = scribble; 2279 } else { 2280 err = -ENOMEM; 2281 break; 2282 } 2283 } 2284 put_online_cpus(); 2285 mddev_resume(conf->mddev); 2286 if (!err) { 2287 conf->scribble_disks = new_disks; 2288 conf->scribble_sectors = new_sectors; 2289 } 2290 return err; 2291 } 2292 2293 static int resize_stripes(struct r5conf *conf, int newsize) 2294 { 2295 /* Make all the stripes able to hold 'newsize' devices. 2296 * New slots in each stripe get 'page' set to a new page. 2297 * 2298 * This happens in stages: 2299 * 1/ create a new kmem_cache and allocate the required number of 2300 * stripe_heads. 2301 * 2/ gather all the old stripe_heads and transfer the pages across 2302 * to the new stripe_heads. This will have the side effect of 2303 * freezing the array as once all stripe_heads have been collected, 2304 * no IO will be possible. Old stripe heads are freed once their 2305 * pages have been transferred over, and the old kmem_cache is 2306 * freed when all stripes are done. 2307 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 2308 * we simple return a failure status - no need to clean anything up. 2309 * 4/ allocate new pages for the new slots in the new stripe_heads. 2310 * If this fails, we don't bother trying the shrink the 2311 * stripe_heads down again, we just leave them as they are. 2312 * As each stripe_head is processed the new one is released into 2313 * active service. 2314 * 2315 * Once step2 is started, we cannot afford to wait for a write, 2316 * so we use GFP_NOIO allocations. 2317 */ 2318 struct stripe_head *osh, *nsh; 2319 LIST_HEAD(newstripes); 2320 struct disk_info *ndisks; 2321 int err = 0; 2322 struct kmem_cache *sc; 2323 int i; 2324 int hash, cnt; 2325 2326 md_allow_write(conf->mddev); 2327 2328 /* Step 1 */ 2329 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2330 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2331 0, 0, NULL); 2332 if (!sc) 2333 return -ENOMEM; 2334 2335 /* Need to ensure auto-resizing doesn't interfere */ 2336 mutex_lock(&conf->cache_size_mutex); 2337 2338 for (i = conf->max_nr_stripes; i; i--) { 2339 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); 2340 if (!nsh) 2341 break; 2342 2343 list_add(&nsh->lru, &newstripes); 2344 } 2345 if (i) { 2346 /* didn't get enough, give up */ 2347 while (!list_empty(&newstripes)) { 2348 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2349 list_del(&nsh->lru); 2350 free_stripe(sc, nsh); 2351 } 2352 kmem_cache_destroy(sc); 2353 mutex_unlock(&conf->cache_size_mutex); 2354 return -ENOMEM; 2355 } 2356 /* Step 2 - Must use GFP_NOIO now. 2357 * OK, we have enough stripes, start collecting inactive 2358 * stripes and copying them over 2359 */ 2360 hash = 0; 2361 cnt = 0; 2362 list_for_each_entry(nsh, &newstripes, lru) { 2363 lock_device_hash_lock(conf, hash); 2364 wait_event_cmd(conf->wait_for_stripe, 2365 !list_empty(conf->inactive_list + hash), 2366 unlock_device_hash_lock(conf, hash), 2367 lock_device_hash_lock(conf, hash)); 2368 osh = get_free_stripe(conf, hash); 2369 unlock_device_hash_lock(conf, hash); 2370 2371 for(i=0; i<conf->pool_size; i++) { 2372 nsh->dev[i].page = osh->dev[i].page; 2373 nsh->dev[i].orig_page = osh->dev[i].page; 2374 } 2375 nsh->hash_lock_index = hash; 2376 free_stripe(conf->slab_cache, osh); 2377 cnt++; 2378 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2379 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2380 hash++; 2381 cnt = 0; 2382 } 2383 } 2384 kmem_cache_destroy(conf->slab_cache); 2385 2386 /* Step 3. 2387 * At this point, we are holding all the stripes so the array 2388 * is completely stalled, so now is a good time to resize 2389 * conf->disks and the scribble region 2390 */ 2391 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2392 if (ndisks) { 2393 for (i = 0; i < conf->pool_size; i++) 2394 ndisks[i] = conf->disks[i]; 2395 2396 for (i = conf->pool_size; i < newsize; i++) { 2397 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2398 if (!ndisks[i].extra_page) 2399 err = -ENOMEM; 2400 } 2401 2402 if (err) { 2403 for (i = conf->pool_size; i < newsize; i++) 2404 if (ndisks[i].extra_page) 2405 put_page(ndisks[i].extra_page); 2406 kfree(ndisks); 2407 } else { 2408 kfree(conf->disks); 2409 conf->disks = ndisks; 2410 } 2411 } else 2412 err = -ENOMEM; 2413 2414 mutex_unlock(&conf->cache_size_mutex); 2415 2416 conf->slab_cache = sc; 2417 conf->active_name = 1-conf->active_name; 2418 2419 /* Step 4, return new stripes to service */ 2420 while(!list_empty(&newstripes)) { 2421 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2422 list_del_init(&nsh->lru); 2423 2424 for (i=conf->raid_disks; i < newsize; i++) 2425 if (nsh->dev[i].page == NULL) { 2426 struct page *p = alloc_page(GFP_NOIO); 2427 nsh->dev[i].page = p; 2428 nsh->dev[i].orig_page = p; 2429 if (!p) 2430 err = -ENOMEM; 2431 } 2432 raid5_release_stripe(nsh); 2433 } 2434 /* critical section pass, GFP_NOIO no longer needed */ 2435 2436 if (!err) 2437 conf->pool_size = newsize; 2438 return err; 2439 } 2440 2441 static int drop_one_stripe(struct r5conf *conf) 2442 { 2443 struct stripe_head *sh; 2444 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2445 2446 spin_lock_irq(conf->hash_locks + hash); 2447 sh = get_free_stripe(conf, hash); 2448 spin_unlock_irq(conf->hash_locks + hash); 2449 if (!sh) 2450 return 0; 2451 BUG_ON(atomic_read(&sh->count)); 2452 shrink_buffers(sh); 2453 free_stripe(conf->slab_cache, sh); 2454 atomic_dec(&conf->active_stripes); 2455 conf->max_nr_stripes--; 2456 return 1; 2457 } 2458 2459 static void shrink_stripes(struct r5conf *conf) 2460 { 2461 while (conf->max_nr_stripes && 2462 drop_one_stripe(conf)) 2463 ; 2464 2465 kmem_cache_destroy(conf->slab_cache); 2466 conf->slab_cache = NULL; 2467 } 2468 2469 static void raid5_end_read_request(struct bio * bi) 2470 { 2471 struct stripe_head *sh = bi->bi_private; 2472 struct r5conf *conf = sh->raid_conf; 2473 int disks = sh->disks, i; 2474 char b[BDEVNAME_SIZE]; 2475 struct md_rdev *rdev = NULL; 2476 sector_t s; 2477 2478 for (i=0 ; i<disks; i++) 2479 if (bi == &sh->dev[i].req) 2480 break; 2481 2482 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2483 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2484 bi->bi_status); 2485 if (i == disks) { 2486 bio_reset(bi); 2487 BUG(); 2488 return; 2489 } 2490 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2491 /* If replacement finished while this request was outstanding, 2492 * 'replacement' might be NULL already. 2493 * In that case it moved down to 'rdev'. 2494 * rdev is not removed until all requests are finished. 2495 */ 2496 rdev = conf->disks[i].replacement; 2497 if (!rdev) 2498 rdev = conf->disks[i].rdev; 2499 2500 if (use_new_offset(conf, sh)) 2501 s = sh->sector + rdev->new_data_offset; 2502 else 2503 s = sh->sector + rdev->data_offset; 2504 if (!bi->bi_status) { 2505 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2506 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2507 /* Note that this cannot happen on a 2508 * replacement device. We just fail those on 2509 * any error 2510 */ 2511 pr_info_ratelimited( 2512 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2513 mdname(conf->mddev), STRIPE_SECTORS, 2514 (unsigned long long)s, 2515 bdevname(rdev->bdev, b)); 2516 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2517 clear_bit(R5_ReadError, &sh->dev[i].flags); 2518 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2519 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2520 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2521 2522 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2523 /* 2524 * end read for a page in journal, this 2525 * must be preparing for prexor in rmw 2526 */ 2527 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2528 2529 if (atomic_read(&rdev->read_errors)) 2530 atomic_set(&rdev->read_errors, 0); 2531 } else { 2532 const char *bdn = bdevname(rdev->bdev, b); 2533 int retry = 0; 2534 int set_bad = 0; 2535 2536 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2537 atomic_inc(&rdev->read_errors); 2538 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2539 pr_warn_ratelimited( 2540 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2541 mdname(conf->mddev), 2542 (unsigned long long)s, 2543 bdn); 2544 else if (conf->mddev->degraded >= conf->max_degraded) { 2545 set_bad = 1; 2546 pr_warn_ratelimited( 2547 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2548 mdname(conf->mddev), 2549 (unsigned long long)s, 2550 bdn); 2551 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2552 /* Oh, no!!! */ 2553 set_bad = 1; 2554 pr_warn_ratelimited( 2555 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", 2556 mdname(conf->mddev), 2557 (unsigned long long)s, 2558 bdn); 2559 } else if (atomic_read(&rdev->read_errors) 2560 > conf->max_nr_stripes) 2561 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2562 mdname(conf->mddev), bdn); 2563 else 2564 retry = 1; 2565 if (set_bad && test_bit(In_sync, &rdev->flags) 2566 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2567 retry = 1; 2568 if (retry) 2569 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2570 set_bit(R5_ReadError, &sh->dev[i].flags); 2571 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2572 } else 2573 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2574 else { 2575 clear_bit(R5_ReadError, &sh->dev[i].flags); 2576 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2577 if (!(set_bad 2578 && test_bit(In_sync, &rdev->flags) 2579 && rdev_set_badblocks( 2580 rdev, sh->sector, STRIPE_SECTORS, 0))) 2581 md_error(conf->mddev, rdev); 2582 } 2583 } 2584 rdev_dec_pending(rdev, conf->mddev); 2585 bio_reset(bi); 2586 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2587 set_bit(STRIPE_HANDLE, &sh->state); 2588 raid5_release_stripe(sh); 2589 } 2590 2591 static void raid5_end_write_request(struct bio *bi) 2592 { 2593 struct stripe_head *sh = bi->bi_private; 2594 struct r5conf *conf = sh->raid_conf; 2595 int disks = sh->disks, i; 2596 struct md_rdev *uninitialized_var(rdev); 2597 sector_t first_bad; 2598 int bad_sectors; 2599 int replacement = 0; 2600 2601 for (i = 0 ; i < disks; i++) { 2602 if (bi == &sh->dev[i].req) { 2603 rdev = conf->disks[i].rdev; 2604 break; 2605 } 2606 if (bi == &sh->dev[i].rreq) { 2607 rdev = conf->disks[i].replacement; 2608 if (rdev) 2609 replacement = 1; 2610 else 2611 /* rdev was removed and 'replacement' 2612 * replaced it. rdev is not removed 2613 * until all requests are finished. 2614 */ 2615 rdev = conf->disks[i].rdev; 2616 break; 2617 } 2618 } 2619 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2620 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2621 bi->bi_status); 2622 if (i == disks) { 2623 bio_reset(bi); 2624 BUG(); 2625 return; 2626 } 2627 2628 if (replacement) { 2629 if (bi->bi_status) 2630 md_error(conf->mddev, rdev); 2631 else if (is_badblock(rdev, sh->sector, 2632 STRIPE_SECTORS, 2633 &first_bad, &bad_sectors)) 2634 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2635 } else { 2636 if (bi->bi_status) { 2637 set_bit(STRIPE_DEGRADED, &sh->state); 2638 set_bit(WriteErrorSeen, &rdev->flags); 2639 set_bit(R5_WriteError, &sh->dev[i].flags); 2640 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2641 set_bit(MD_RECOVERY_NEEDED, 2642 &rdev->mddev->recovery); 2643 } else if (is_badblock(rdev, sh->sector, 2644 STRIPE_SECTORS, 2645 &first_bad, &bad_sectors)) { 2646 set_bit(R5_MadeGood, &sh->dev[i].flags); 2647 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2648 /* That was a successful write so make 2649 * sure it looks like we already did 2650 * a re-write. 2651 */ 2652 set_bit(R5_ReWrite, &sh->dev[i].flags); 2653 } 2654 } 2655 rdev_dec_pending(rdev, conf->mddev); 2656 2657 if (sh->batch_head && bi->bi_status && !replacement) 2658 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2659 2660 bio_reset(bi); 2661 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2662 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2663 set_bit(STRIPE_HANDLE, &sh->state); 2664 raid5_release_stripe(sh); 2665 2666 if (sh->batch_head && sh != sh->batch_head) 2667 raid5_release_stripe(sh->batch_head); 2668 } 2669 2670 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2671 { 2672 char b[BDEVNAME_SIZE]; 2673 struct r5conf *conf = mddev->private; 2674 unsigned long flags; 2675 pr_debug("raid456: error called\n"); 2676 2677 spin_lock_irqsave(&conf->device_lock, flags); 2678 clear_bit(In_sync, &rdev->flags); 2679 mddev->degraded = raid5_calc_degraded(conf); 2680 spin_unlock_irqrestore(&conf->device_lock, flags); 2681 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2682 2683 set_bit(Blocked, &rdev->flags); 2684 set_bit(Faulty, &rdev->flags); 2685 set_mask_bits(&mddev->sb_flags, 0, 2686 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2687 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2688 "md/raid:%s: Operation continuing on %d devices.\n", 2689 mdname(mddev), 2690 bdevname(rdev->bdev, b), 2691 mdname(mddev), 2692 conf->raid_disks - mddev->degraded); 2693 r5c_update_on_rdev_error(mddev, rdev); 2694 } 2695 2696 /* 2697 * Input: a 'big' sector number, 2698 * Output: index of the data and parity disk, and the sector # in them. 2699 */ 2700 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2701 int previous, int *dd_idx, 2702 struct stripe_head *sh) 2703 { 2704 sector_t stripe, stripe2; 2705 sector_t chunk_number; 2706 unsigned int chunk_offset; 2707 int pd_idx, qd_idx; 2708 int ddf_layout = 0; 2709 sector_t new_sector; 2710 int algorithm = previous ? conf->prev_algo 2711 : conf->algorithm; 2712 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2713 : conf->chunk_sectors; 2714 int raid_disks = previous ? conf->previous_raid_disks 2715 : conf->raid_disks; 2716 int data_disks = raid_disks - conf->max_degraded; 2717 2718 /* First compute the information on this sector */ 2719 2720 /* 2721 * Compute the chunk number and the sector offset inside the chunk 2722 */ 2723 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2724 chunk_number = r_sector; 2725 2726 /* 2727 * Compute the stripe number 2728 */ 2729 stripe = chunk_number; 2730 *dd_idx = sector_div(stripe, data_disks); 2731 stripe2 = stripe; 2732 /* 2733 * Select the parity disk based on the user selected algorithm. 2734 */ 2735 pd_idx = qd_idx = -1; 2736 switch(conf->level) { 2737 case 4: 2738 pd_idx = data_disks; 2739 break; 2740 case 5: 2741 switch (algorithm) { 2742 case ALGORITHM_LEFT_ASYMMETRIC: 2743 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2744 if (*dd_idx >= pd_idx) 2745 (*dd_idx)++; 2746 break; 2747 case ALGORITHM_RIGHT_ASYMMETRIC: 2748 pd_idx = sector_div(stripe2, raid_disks); 2749 if (*dd_idx >= pd_idx) 2750 (*dd_idx)++; 2751 break; 2752 case ALGORITHM_LEFT_SYMMETRIC: 2753 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2754 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2755 break; 2756 case ALGORITHM_RIGHT_SYMMETRIC: 2757 pd_idx = sector_div(stripe2, raid_disks); 2758 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2759 break; 2760 case ALGORITHM_PARITY_0: 2761 pd_idx = 0; 2762 (*dd_idx)++; 2763 break; 2764 case ALGORITHM_PARITY_N: 2765 pd_idx = data_disks; 2766 break; 2767 default: 2768 BUG(); 2769 } 2770 break; 2771 case 6: 2772 2773 switch (algorithm) { 2774 case ALGORITHM_LEFT_ASYMMETRIC: 2775 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2776 qd_idx = pd_idx + 1; 2777 if (pd_idx == raid_disks-1) { 2778 (*dd_idx)++; /* Q D D D P */ 2779 qd_idx = 0; 2780 } else if (*dd_idx >= pd_idx) 2781 (*dd_idx) += 2; /* D D P Q D */ 2782 break; 2783 case ALGORITHM_RIGHT_ASYMMETRIC: 2784 pd_idx = sector_div(stripe2, raid_disks); 2785 qd_idx = pd_idx + 1; 2786 if (pd_idx == raid_disks-1) { 2787 (*dd_idx)++; /* Q D D D P */ 2788 qd_idx = 0; 2789 } else if (*dd_idx >= pd_idx) 2790 (*dd_idx) += 2; /* D D P Q D */ 2791 break; 2792 case ALGORITHM_LEFT_SYMMETRIC: 2793 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2794 qd_idx = (pd_idx + 1) % raid_disks; 2795 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2796 break; 2797 case ALGORITHM_RIGHT_SYMMETRIC: 2798 pd_idx = sector_div(stripe2, raid_disks); 2799 qd_idx = (pd_idx + 1) % raid_disks; 2800 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2801 break; 2802 2803 case ALGORITHM_PARITY_0: 2804 pd_idx = 0; 2805 qd_idx = 1; 2806 (*dd_idx) += 2; 2807 break; 2808 case ALGORITHM_PARITY_N: 2809 pd_idx = data_disks; 2810 qd_idx = data_disks + 1; 2811 break; 2812 2813 case ALGORITHM_ROTATING_ZERO_RESTART: 2814 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2815 * of blocks for computing Q is different. 2816 */ 2817 pd_idx = sector_div(stripe2, raid_disks); 2818 qd_idx = pd_idx + 1; 2819 if (pd_idx == raid_disks-1) { 2820 (*dd_idx)++; /* Q D D D P */ 2821 qd_idx = 0; 2822 } else if (*dd_idx >= pd_idx) 2823 (*dd_idx) += 2; /* D D P Q D */ 2824 ddf_layout = 1; 2825 break; 2826 2827 case ALGORITHM_ROTATING_N_RESTART: 2828 /* Same a left_asymmetric, by first stripe is 2829 * D D D P Q rather than 2830 * Q D D D P 2831 */ 2832 stripe2 += 1; 2833 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2834 qd_idx = pd_idx + 1; 2835 if (pd_idx == raid_disks-1) { 2836 (*dd_idx)++; /* Q D D D P */ 2837 qd_idx = 0; 2838 } else if (*dd_idx >= pd_idx) 2839 (*dd_idx) += 2; /* D D P Q D */ 2840 ddf_layout = 1; 2841 break; 2842 2843 case ALGORITHM_ROTATING_N_CONTINUE: 2844 /* Same as left_symmetric but Q is before P */ 2845 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2846 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2847 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2848 ddf_layout = 1; 2849 break; 2850 2851 case ALGORITHM_LEFT_ASYMMETRIC_6: 2852 /* RAID5 left_asymmetric, with Q on last device */ 2853 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2854 if (*dd_idx >= pd_idx) 2855 (*dd_idx)++; 2856 qd_idx = raid_disks - 1; 2857 break; 2858 2859 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2860 pd_idx = sector_div(stripe2, raid_disks-1); 2861 if (*dd_idx >= pd_idx) 2862 (*dd_idx)++; 2863 qd_idx = raid_disks - 1; 2864 break; 2865 2866 case ALGORITHM_LEFT_SYMMETRIC_6: 2867 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2868 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2869 qd_idx = raid_disks - 1; 2870 break; 2871 2872 case ALGORITHM_RIGHT_SYMMETRIC_6: 2873 pd_idx = sector_div(stripe2, raid_disks-1); 2874 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2875 qd_idx = raid_disks - 1; 2876 break; 2877 2878 case ALGORITHM_PARITY_0_6: 2879 pd_idx = 0; 2880 (*dd_idx)++; 2881 qd_idx = raid_disks - 1; 2882 break; 2883 2884 default: 2885 BUG(); 2886 } 2887 break; 2888 } 2889 2890 if (sh) { 2891 sh->pd_idx = pd_idx; 2892 sh->qd_idx = qd_idx; 2893 sh->ddf_layout = ddf_layout; 2894 } 2895 /* 2896 * Finally, compute the new sector number 2897 */ 2898 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2899 return new_sector; 2900 } 2901 2902 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2903 { 2904 struct r5conf *conf = sh->raid_conf; 2905 int raid_disks = sh->disks; 2906 int data_disks = raid_disks - conf->max_degraded; 2907 sector_t new_sector = sh->sector, check; 2908 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2909 : conf->chunk_sectors; 2910 int algorithm = previous ? conf->prev_algo 2911 : conf->algorithm; 2912 sector_t stripe; 2913 int chunk_offset; 2914 sector_t chunk_number; 2915 int dummy1, dd_idx = i; 2916 sector_t r_sector; 2917 struct stripe_head sh2; 2918 2919 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2920 stripe = new_sector; 2921 2922 if (i == sh->pd_idx) 2923 return 0; 2924 switch(conf->level) { 2925 case 4: break; 2926 case 5: 2927 switch (algorithm) { 2928 case ALGORITHM_LEFT_ASYMMETRIC: 2929 case ALGORITHM_RIGHT_ASYMMETRIC: 2930 if (i > sh->pd_idx) 2931 i--; 2932 break; 2933 case ALGORITHM_LEFT_SYMMETRIC: 2934 case ALGORITHM_RIGHT_SYMMETRIC: 2935 if (i < sh->pd_idx) 2936 i += raid_disks; 2937 i -= (sh->pd_idx + 1); 2938 break; 2939 case ALGORITHM_PARITY_0: 2940 i -= 1; 2941 break; 2942 case ALGORITHM_PARITY_N: 2943 break; 2944 default: 2945 BUG(); 2946 } 2947 break; 2948 case 6: 2949 if (i == sh->qd_idx) 2950 return 0; /* It is the Q disk */ 2951 switch (algorithm) { 2952 case ALGORITHM_LEFT_ASYMMETRIC: 2953 case ALGORITHM_RIGHT_ASYMMETRIC: 2954 case ALGORITHM_ROTATING_ZERO_RESTART: 2955 case ALGORITHM_ROTATING_N_RESTART: 2956 if (sh->pd_idx == raid_disks-1) 2957 i--; /* Q D D D P */ 2958 else if (i > sh->pd_idx) 2959 i -= 2; /* D D P Q D */ 2960 break; 2961 case ALGORITHM_LEFT_SYMMETRIC: 2962 case ALGORITHM_RIGHT_SYMMETRIC: 2963 if (sh->pd_idx == raid_disks-1) 2964 i--; /* Q D D D P */ 2965 else { 2966 /* D D P Q D */ 2967 if (i < sh->pd_idx) 2968 i += raid_disks; 2969 i -= (sh->pd_idx + 2); 2970 } 2971 break; 2972 case ALGORITHM_PARITY_0: 2973 i -= 2; 2974 break; 2975 case ALGORITHM_PARITY_N: 2976 break; 2977 case ALGORITHM_ROTATING_N_CONTINUE: 2978 /* Like left_symmetric, but P is before Q */ 2979 if (sh->pd_idx == 0) 2980 i--; /* P D D D Q */ 2981 else { 2982 /* D D Q P D */ 2983 if (i < sh->pd_idx) 2984 i += raid_disks; 2985 i -= (sh->pd_idx + 1); 2986 } 2987 break; 2988 case ALGORITHM_LEFT_ASYMMETRIC_6: 2989 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2990 if (i > sh->pd_idx) 2991 i--; 2992 break; 2993 case ALGORITHM_LEFT_SYMMETRIC_6: 2994 case ALGORITHM_RIGHT_SYMMETRIC_6: 2995 if (i < sh->pd_idx) 2996 i += data_disks + 1; 2997 i -= (sh->pd_idx + 1); 2998 break; 2999 case ALGORITHM_PARITY_0_6: 3000 i -= 1; 3001 break; 3002 default: 3003 BUG(); 3004 } 3005 break; 3006 } 3007 3008 chunk_number = stripe * data_disks + i; 3009 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3010 3011 check = raid5_compute_sector(conf, r_sector, 3012 previous, &dummy1, &sh2); 3013 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3014 || sh2.qd_idx != sh->qd_idx) { 3015 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3016 mdname(conf->mddev)); 3017 return 0; 3018 } 3019 return r_sector; 3020 } 3021 3022 /* 3023 * There are cases where we want handle_stripe_dirtying() and 3024 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3025 * 3026 * This function checks whether we want to delay the towrite. Specifically, 3027 * we delay the towrite when: 3028 * 3029 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3030 * stripe has data in journal (for other devices). 3031 * 3032 * In this case, when reading data for the non-overwrite dev, it is 3033 * necessary to handle complex rmw of write back cache (prexor with 3034 * orig_page, and xor with page). To keep read path simple, we would 3035 * like to flush data in journal to RAID disks first, so complex rmw 3036 * is handled in the write patch (handle_stripe_dirtying). 3037 * 3038 * 2. when journal space is critical (R5C_LOG_CRITICAL=1) 3039 * 3040 * It is important to be able to flush all stripes in raid5-cache. 3041 * Therefore, we need reserve some space on the journal device for 3042 * these flushes. If flush operation includes pending writes to the 3043 * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe 3044 * for the flush out. If we exclude these pending writes from flush 3045 * operation, we only need (conf->max_degraded + 1) pages per stripe. 3046 * Therefore, excluding pending writes in these cases enables more 3047 * efficient use of the journal device. 3048 * 3049 * Note: To make sure the stripe makes progress, we only delay 3050 * towrite for stripes with data already in journal (injournal > 0). 3051 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3052 * no_space_stripes list. 3053 * 3054 * 3. during journal failure 3055 * In journal failure, we try to flush all cached data to raid disks 3056 * based on data in stripe cache. The array is read-only to upper 3057 * layers, so we would skip all pending writes. 3058 * 3059 */ 3060 static inline bool delay_towrite(struct r5conf *conf, 3061 struct r5dev *dev, 3062 struct stripe_head_state *s) 3063 { 3064 /* case 1 above */ 3065 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3066 !test_bit(R5_Insync, &dev->flags) && s->injournal) 3067 return true; 3068 /* case 2 above */ 3069 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3070 s->injournal > 0) 3071 return true; 3072 /* case 3 above */ 3073 if (s->log_failed && s->injournal) 3074 return true; 3075 return false; 3076 } 3077 3078 static void 3079 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 3080 int rcw, int expand) 3081 { 3082 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 3083 struct r5conf *conf = sh->raid_conf; 3084 int level = conf->level; 3085 3086 if (rcw) { 3087 /* 3088 * In some cases, handle_stripe_dirtying initially decided to 3089 * run rmw and allocates extra page for prexor. However, rcw is 3090 * cheaper later on. We need to free the extra page now, 3091 * because we won't be able to do that in ops_complete_prexor(). 3092 */ 3093 r5c_release_extra_page(sh); 3094 3095 for (i = disks; i--; ) { 3096 struct r5dev *dev = &sh->dev[i]; 3097 3098 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3099 set_bit(R5_LOCKED, &dev->flags); 3100 set_bit(R5_Wantdrain, &dev->flags); 3101 if (!expand) 3102 clear_bit(R5_UPTODATE, &dev->flags); 3103 s->locked++; 3104 } else if (test_bit(R5_InJournal, &dev->flags)) { 3105 set_bit(R5_LOCKED, &dev->flags); 3106 s->locked++; 3107 } 3108 } 3109 /* if we are not expanding this is a proper write request, and 3110 * there will be bios with new data to be drained into the 3111 * stripe cache 3112 */ 3113 if (!expand) { 3114 if (!s->locked) 3115 /* False alarm, nothing to do */ 3116 return; 3117 sh->reconstruct_state = reconstruct_state_drain_run; 3118 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3119 } else 3120 sh->reconstruct_state = reconstruct_state_run; 3121 3122 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3123 3124 if (s->locked + conf->max_degraded == disks) 3125 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3126 atomic_inc(&conf->pending_full_writes); 3127 } else { 3128 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3129 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3130 BUG_ON(level == 6 && 3131 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3132 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3133 3134 for (i = disks; i--; ) { 3135 struct r5dev *dev = &sh->dev[i]; 3136 if (i == pd_idx || i == qd_idx) 3137 continue; 3138 3139 if (dev->towrite && 3140 (test_bit(R5_UPTODATE, &dev->flags) || 3141 test_bit(R5_Wantcompute, &dev->flags))) { 3142 set_bit(R5_Wantdrain, &dev->flags); 3143 set_bit(R5_LOCKED, &dev->flags); 3144 clear_bit(R5_UPTODATE, &dev->flags); 3145 s->locked++; 3146 } else if (test_bit(R5_InJournal, &dev->flags)) { 3147 set_bit(R5_LOCKED, &dev->flags); 3148 s->locked++; 3149 } 3150 } 3151 if (!s->locked) 3152 /* False alarm - nothing to do */ 3153 return; 3154 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3155 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3156 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3157 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3158 } 3159 3160 /* keep the parity disk(s) locked while asynchronous operations 3161 * are in flight 3162 */ 3163 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3164 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3165 s->locked++; 3166 3167 if (level == 6) { 3168 int qd_idx = sh->qd_idx; 3169 struct r5dev *dev = &sh->dev[qd_idx]; 3170 3171 set_bit(R5_LOCKED, &dev->flags); 3172 clear_bit(R5_UPTODATE, &dev->flags); 3173 s->locked++; 3174 } 3175 3176 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && 3177 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3178 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3179 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3180 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3181 3182 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3183 __func__, (unsigned long long)sh->sector, 3184 s->locked, s->ops_request); 3185 } 3186 3187 /* 3188 * Each stripe/dev can have one or more bion attached. 3189 * toread/towrite point to the first in a chain. 3190 * The bi_next chain must be in order. 3191 */ 3192 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3193 int forwrite, int previous) 3194 { 3195 struct bio **bip; 3196 struct r5conf *conf = sh->raid_conf; 3197 int firstwrite=0; 3198 3199 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3200 (unsigned long long)bi->bi_iter.bi_sector, 3201 (unsigned long long)sh->sector); 3202 3203 spin_lock_irq(&sh->stripe_lock); 3204 /* Don't allow new IO added to stripes in batch list */ 3205 if (sh->batch_head) 3206 goto overlap; 3207 if (forwrite) { 3208 bip = &sh->dev[dd_idx].towrite; 3209 if (*bip == NULL) 3210 firstwrite = 1; 3211 } else 3212 bip = &sh->dev[dd_idx].toread; 3213 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3214 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3215 goto overlap; 3216 bip = & (*bip)->bi_next; 3217 } 3218 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3219 goto overlap; 3220 3221 if (forwrite && raid5_has_ppl(conf)) { 3222 /* 3223 * With PPL only writes to consecutive data chunks within a 3224 * stripe are allowed because for a single stripe_head we can 3225 * only have one PPL entry at a time, which describes one data 3226 * range. Not really an overlap, but wait_for_overlap can be 3227 * used to handle this. 3228 */ 3229 sector_t sector; 3230 sector_t first = 0; 3231 sector_t last = 0; 3232 int count = 0; 3233 int i; 3234 3235 for (i = 0; i < sh->disks; i++) { 3236 if (i != sh->pd_idx && 3237 (i == dd_idx || sh->dev[i].towrite)) { 3238 sector = sh->dev[i].sector; 3239 if (count == 0 || sector < first) 3240 first = sector; 3241 if (sector > last) 3242 last = sector; 3243 count++; 3244 } 3245 } 3246 3247 if (first + conf->chunk_sectors * (count - 1) != last) 3248 goto overlap; 3249 } 3250 3251 if (!forwrite || previous) 3252 clear_bit(STRIPE_BATCH_READY, &sh->state); 3253 3254 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3255 if (*bip) 3256 bi->bi_next = *bip; 3257 *bip = bi; 3258 bio_inc_remaining(bi); 3259 md_write_inc(conf->mddev, bi); 3260 3261 if (forwrite) { 3262 /* check if page is covered */ 3263 sector_t sector = sh->dev[dd_idx].sector; 3264 for (bi=sh->dev[dd_idx].towrite; 3265 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3266 bi && bi->bi_iter.bi_sector <= sector; 3267 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3268 if (bio_end_sector(bi) >= sector) 3269 sector = bio_end_sector(bi); 3270 } 3271 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3272 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3273 sh->overwrite_disks++; 3274 } 3275 3276 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3277 (unsigned long long)(*bip)->bi_iter.bi_sector, 3278 (unsigned long long)sh->sector, dd_idx); 3279 3280 if (conf->mddev->bitmap && firstwrite) { 3281 /* Cannot hold spinlock over bitmap_startwrite, 3282 * but must ensure this isn't added to a batch until 3283 * we have added to the bitmap and set bm_seq. 3284 * So set STRIPE_BITMAP_PENDING to prevent 3285 * batching. 3286 * If multiple add_stripe_bio() calls race here they 3287 * much all set STRIPE_BITMAP_PENDING. So only the first one 3288 * to complete "bitmap_startwrite" gets to set 3289 * STRIPE_BIT_DELAY. This is important as once a stripe 3290 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3291 * any more. 3292 */ 3293 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3294 spin_unlock_irq(&sh->stripe_lock); 3295 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3296 STRIPE_SECTORS, 0); 3297 spin_lock_irq(&sh->stripe_lock); 3298 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3299 if (!sh->batch_head) { 3300 sh->bm_seq = conf->seq_flush+1; 3301 set_bit(STRIPE_BIT_DELAY, &sh->state); 3302 } 3303 } 3304 spin_unlock_irq(&sh->stripe_lock); 3305 3306 if (stripe_can_batch(sh)) 3307 stripe_add_to_batch_list(conf, sh); 3308 return 1; 3309 3310 overlap: 3311 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3312 spin_unlock_irq(&sh->stripe_lock); 3313 return 0; 3314 } 3315 3316 static void end_reshape(struct r5conf *conf); 3317 3318 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3319 struct stripe_head *sh) 3320 { 3321 int sectors_per_chunk = 3322 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3323 int dd_idx; 3324 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3325 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3326 3327 raid5_compute_sector(conf, 3328 stripe * (disks - conf->max_degraded) 3329 *sectors_per_chunk + chunk_offset, 3330 previous, 3331 &dd_idx, sh); 3332 } 3333 3334 static void 3335 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3336 struct stripe_head_state *s, int disks) 3337 { 3338 int i; 3339 BUG_ON(sh->batch_head); 3340 for (i = disks; i--; ) { 3341 struct bio *bi; 3342 int bitmap_end = 0; 3343 3344 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3345 struct md_rdev *rdev; 3346 rcu_read_lock(); 3347 rdev = rcu_dereference(conf->disks[i].rdev); 3348 if (rdev && test_bit(In_sync, &rdev->flags) && 3349 !test_bit(Faulty, &rdev->flags)) 3350 atomic_inc(&rdev->nr_pending); 3351 else 3352 rdev = NULL; 3353 rcu_read_unlock(); 3354 if (rdev) { 3355 if (!rdev_set_badblocks( 3356 rdev, 3357 sh->sector, 3358 STRIPE_SECTORS, 0)) 3359 md_error(conf->mddev, rdev); 3360 rdev_dec_pending(rdev, conf->mddev); 3361 } 3362 } 3363 spin_lock_irq(&sh->stripe_lock); 3364 /* fail all writes first */ 3365 bi = sh->dev[i].towrite; 3366 sh->dev[i].towrite = NULL; 3367 sh->overwrite_disks = 0; 3368 spin_unlock_irq(&sh->stripe_lock); 3369 if (bi) 3370 bitmap_end = 1; 3371 3372 log_stripe_write_finished(sh); 3373 3374 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3375 wake_up(&conf->wait_for_overlap); 3376 3377 while (bi && bi->bi_iter.bi_sector < 3378 sh->dev[i].sector + STRIPE_SECTORS) { 3379 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3380 3381 md_write_end(conf->mddev); 3382 bio_io_error(bi); 3383 bi = nextbi; 3384 } 3385 if (bitmap_end) 3386 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3387 STRIPE_SECTORS, 0, 0); 3388 bitmap_end = 0; 3389 /* and fail all 'written' */ 3390 bi = sh->dev[i].written; 3391 sh->dev[i].written = NULL; 3392 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3393 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3394 sh->dev[i].page = sh->dev[i].orig_page; 3395 } 3396 3397 if (bi) bitmap_end = 1; 3398 while (bi && bi->bi_iter.bi_sector < 3399 sh->dev[i].sector + STRIPE_SECTORS) { 3400 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3401 3402 md_write_end(conf->mddev); 3403 bio_io_error(bi); 3404 bi = bi2; 3405 } 3406 3407 /* fail any reads if this device is non-operational and 3408 * the data has not reached the cache yet. 3409 */ 3410 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3411 s->failed > conf->max_degraded && 3412 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3413 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3414 spin_lock_irq(&sh->stripe_lock); 3415 bi = sh->dev[i].toread; 3416 sh->dev[i].toread = NULL; 3417 spin_unlock_irq(&sh->stripe_lock); 3418 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3419 wake_up(&conf->wait_for_overlap); 3420 if (bi) 3421 s->to_read--; 3422 while (bi && bi->bi_iter.bi_sector < 3423 sh->dev[i].sector + STRIPE_SECTORS) { 3424 struct bio *nextbi = 3425 r5_next_bio(bi, sh->dev[i].sector); 3426 3427 bio_io_error(bi); 3428 bi = nextbi; 3429 } 3430 } 3431 if (bitmap_end) 3432 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3433 STRIPE_SECTORS, 0, 0); 3434 /* If we were in the middle of a write the parity block might 3435 * still be locked - so just clear all R5_LOCKED flags 3436 */ 3437 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3438 } 3439 s->to_write = 0; 3440 s->written = 0; 3441 3442 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3443 if (atomic_dec_and_test(&conf->pending_full_writes)) 3444 md_wakeup_thread(conf->mddev->thread); 3445 } 3446 3447 static void 3448 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3449 struct stripe_head_state *s) 3450 { 3451 int abort = 0; 3452 int i; 3453 3454 BUG_ON(sh->batch_head); 3455 clear_bit(STRIPE_SYNCING, &sh->state); 3456 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3457 wake_up(&conf->wait_for_overlap); 3458 s->syncing = 0; 3459 s->replacing = 0; 3460 /* There is nothing more to do for sync/check/repair. 3461 * Don't even need to abort as that is handled elsewhere 3462 * if needed, and not always wanted e.g. if there is a known 3463 * bad block here. 3464 * For recover/replace we need to record a bad block on all 3465 * non-sync devices, or abort the recovery 3466 */ 3467 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3468 /* During recovery devices cannot be removed, so 3469 * locking and refcounting of rdevs is not needed 3470 */ 3471 rcu_read_lock(); 3472 for (i = 0; i < conf->raid_disks; i++) { 3473 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3474 if (rdev 3475 && !test_bit(Faulty, &rdev->flags) 3476 && !test_bit(In_sync, &rdev->flags) 3477 && !rdev_set_badblocks(rdev, sh->sector, 3478 STRIPE_SECTORS, 0)) 3479 abort = 1; 3480 rdev = rcu_dereference(conf->disks[i].replacement); 3481 if (rdev 3482 && !test_bit(Faulty, &rdev->flags) 3483 && !test_bit(In_sync, &rdev->flags) 3484 && !rdev_set_badblocks(rdev, sh->sector, 3485 STRIPE_SECTORS, 0)) 3486 abort = 1; 3487 } 3488 rcu_read_unlock(); 3489 if (abort) 3490 conf->recovery_disabled = 3491 conf->mddev->recovery_disabled; 3492 } 3493 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3494 } 3495 3496 static int want_replace(struct stripe_head *sh, int disk_idx) 3497 { 3498 struct md_rdev *rdev; 3499 int rv = 0; 3500 3501 rcu_read_lock(); 3502 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3503 if (rdev 3504 && !test_bit(Faulty, &rdev->flags) 3505 && !test_bit(In_sync, &rdev->flags) 3506 && (rdev->recovery_offset <= sh->sector 3507 || rdev->mddev->recovery_cp <= sh->sector)) 3508 rv = 1; 3509 rcu_read_unlock(); 3510 return rv; 3511 } 3512 3513 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3514 int disk_idx, int disks) 3515 { 3516 struct r5dev *dev = &sh->dev[disk_idx]; 3517 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3518 &sh->dev[s->failed_num[1]] }; 3519 int i; 3520 3521 3522 if (test_bit(R5_LOCKED, &dev->flags) || 3523 test_bit(R5_UPTODATE, &dev->flags)) 3524 /* No point reading this as we already have it or have 3525 * decided to get it. 3526 */ 3527 return 0; 3528 3529 if (dev->toread || 3530 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3531 /* We need this block to directly satisfy a request */ 3532 return 1; 3533 3534 if (s->syncing || s->expanding || 3535 (s->replacing && want_replace(sh, disk_idx))) 3536 /* When syncing, or expanding we read everything. 3537 * When replacing, we need the replaced block. 3538 */ 3539 return 1; 3540 3541 if ((s->failed >= 1 && fdev[0]->toread) || 3542 (s->failed >= 2 && fdev[1]->toread)) 3543 /* If we want to read from a failed device, then 3544 * we need to actually read every other device. 3545 */ 3546 return 1; 3547 3548 /* Sometimes neither read-modify-write nor reconstruct-write 3549 * cycles can work. In those cases we read every block we 3550 * can. Then the parity-update is certain to have enough to 3551 * work with. 3552 * This can only be a problem when we need to write something, 3553 * and some device has failed. If either of those tests 3554 * fail we need look no further. 3555 */ 3556 if (!s->failed || !s->to_write) 3557 return 0; 3558 3559 if (test_bit(R5_Insync, &dev->flags) && 3560 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3561 /* Pre-reads at not permitted until after short delay 3562 * to gather multiple requests. However if this 3563 * device is no Insync, the block could only be computed 3564 * and there is no need to delay that. 3565 */ 3566 return 0; 3567 3568 for (i = 0; i < s->failed && i < 2; i++) { 3569 if (fdev[i]->towrite && 3570 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3571 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3572 /* If we have a partial write to a failed 3573 * device, then we will need to reconstruct 3574 * the content of that device, so all other 3575 * devices must be read. 3576 */ 3577 return 1; 3578 } 3579 3580 /* If we are forced to do a reconstruct-write, either because 3581 * the current RAID6 implementation only supports that, or 3582 * because parity cannot be trusted and we are currently 3583 * recovering it, there is extra need to be careful. 3584 * If one of the devices that we would need to read, because 3585 * it is not being overwritten (and maybe not written at all) 3586 * is missing/faulty, then we need to read everything we can. 3587 */ 3588 if (sh->raid_conf->level != 6 && 3589 sh->sector < sh->raid_conf->mddev->recovery_cp) 3590 /* reconstruct-write isn't being forced */ 3591 return 0; 3592 for (i = 0; i < s->failed && i < 2; i++) { 3593 if (s->failed_num[i] != sh->pd_idx && 3594 s->failed_num[i] != sh->qd_idx && 3595 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3596 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3597 return 1; 3598 } 3599 3600 return 0; 3601 } 3602 3603 /* fetch_block - checks the given member device to see if its data needs 3604 * to be read or computed to satisfy a request. 3605 * 3606 * Returns 1 when no more member devices need to be checked, otherwise returns 3607 * 0 to tell the loop in handle_stripe_fill to continue 3608 */ 3609 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3610 int disk_idx, int disks) 3611 { 3612 struct r5dev *dev = &sh->dev[disk_idx]; 3613 3614 /* is the data in this block needed, and can we get it? */ 3615 if (need_this_block(sh, s, disk_idx, disks)) { 3616 /* we would like to get this block, possibly by computing it, 3617 * otherwise read it if the backing disk is insync 3618 */ 3619 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3620 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3621 BUG_ON(sh->batch_head); 3622 3623 /* 3624 * In the raid6 case if the only non-uptodate disk is P 3625 * then we already trusted P to compute the other failed 3626 * drives. It is safe to compute rather than re-read P. 3627 * In other cases we only compute blocks from failed 3628 * devices, otherwise check/repair might fail to detect 3629 * a real inconsistency. 3630 */ 3631 3632 if ((s->uptodate == disks - 1) && 3633 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3634 (s->failed && (disk_idx == s->failed_num[0] || 3635 disk_idx == s->failed_num[1])))) { 3636 /* have disk failed, and we're requested to fetch it; 3637 * do compute it 3638 */ 3639 pr_debug("Computing stripe %llu block %d\n", 3640 (unsigned long long)sh->sector, disk_idx); 3641 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3642 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3643 set_bit(R5_Wantcompute, &dev->flags); 3644 sh->ops.target = disk_idx; 3645 sh->ops.target2 = -1; /* no 2nd target */ 3646 s->req_compute = 1; 3647 /* Careful: from this point on 'uptodate' is in the eye 3648 * of raid_run_ops which services 'compute' operations 3649 * before writes. R5_Wantcompute flags a block that will 3650 * be R5_UPTODATE by the time it is needed for a 3651 * subsequent operation. 3652 */ 3653 s->uptodate++; 3654 return 1; 3655 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3656 /* Computing 2-failure is *very* expensive; only 3657 * do it if failed >= 2 3658 */ 3659 int other; 3660 for (other = disks; other--; ) { 3661 if (other == disk_idx) 3662 continue; 3663 if (!test_bit(R5_UPTODATE, 3664 &sh->dev[other].flags)) 3665 break; 3666 } 3667 BUG_ON(other < 0); 3668 pr_debug("Computing stripe %llu blocks %d,%d\n", 3669 (unsigned long long)sh->sector, 3670 disk_idx, other); 3671 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3672 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3673 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3674 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3675 sh->ops.target = disk_idx; 3676 sh->ops.target2 = other; 3677 s->uptodate += 2; 3678 s->req_compute = 1; 3679 return 1; 3680 } else if (test_bit(R5_Insync, &dev->flags)) { 3681 set_bit(R5_LOCKED, &dev->flags); 3682 set_bit(R5_Wantread, &dev->flags); 3683 s->locked++; 3684 pr_debug("Reading block %d (sync=%d)\n", 3685 disk_idx, s->syncing); 3686 } 3687 } 3688 3689 return 0; 3690 } 3691 3692 /** 3693 * handle_stripe_fill - read or compute data to satisfy pending requests. 3694 */ 3695 static void handle_stripe_fill(struct stripe_head *sh, 3696 struct stripe_head_state *s, 3697 int disks) 3698 { 3699 int i; 3700 3701 /* look for blocks to read/compute, skip this if a compute 3702 * is already in flight, or if the stripe contents are in the 3703 * midst of changing due to a write 3704 */ 3705 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3706 !sh->reconstruct_state) { 3707 3708 /* 3709 * For degraded stripe with data in journal, do not handle 3710 * read requests yet, instead, flush the stripe to raid 3711 * disks first, this avoids handling complex rmw of write 3712 * back cache (prexor with orig_page, and then xor with 3713 * page) in the read path 3714 */ 3715 if (s->injournal && s->failed) { 3716 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3717 r5c_make_stripe_write_out(sh); 3718 goto out; 3719 } 3720 3721 for (i = disks; i--; ) 3722 if (fetch_block(sh, s, i, disks)) 3723 break; 3724 } 3725 out: 3726 set_bit(STRIPE_HANDLE, &sh->state); 3727 } 3728 3729 static void break_stripe_batch_list(struct stripe_head *head_sh, 3730 unsigned long handle_flags); 3731 /* handle_stripe_clean_event 3732 * any written block on an uptodate or failed drive can be returned. 3733 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3734 * never LOCKED, so we don't need to test 'failed' directly. 3735 */ 3736 static void handle_stripe_clean_event(struct r5conf *conf, 3737 struct stripe_head *sh, int disks) 3738 { 3739 int i; 3740 struct r5dev *dev; 3741 int discard_pending = 0; 3742 struct stripe_head *head_sh = sh; 3743 bool do_endio = false; 3744 3745 for (i = disks; i--; ) 3746 if (sh->dev[i].written) { 3747 dev = &sh->dev[i]; 3748 if (!test_bit(R5_LOCKED, &dev->flags) && 3749 (test_bit(R5_UPTODATE, &dev->flags) || 3750 test_bit(R5_Discard, &dev->flags) || 3751 test_bit(R5_SkipCopy, &dev->flags))) { 3752 /* We can return any write requests */ 3753 struct bio *wbi, *wbi2; 3754 pr_debug("Return write for disc %d\n", i); 3755 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3756 clear_bit(R5_UPTODATE, &dev->flags); 3757 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3758 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3759 } 3760 do_endio = true; 3761 3762 returnbi: 3763 dev->page = dev->orig_page; 3764 wbi = dev->written; 3765 dev->written = NULL; 3766 while (wbi && wbi->bi_iter.bi_sector < 3767 dev->sector + STRIPE_SECTORS) { 3768 wbi2 = r5_next_bio(wbi, dev->sector); 3769 md_write_end(conf->mddev); 3770 bio_endio(wbi); 3771 wbi = wbi2; 3772 } 3773 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3774 STRIPE_SECTORS, 3775 !test_bit(STRIPE_DEGRADED, &sh->state), 3776 0); 3777 if (head_sh->batch_head) { 3778 sh = list_first_entry(&sh->batch_list, 3779 struct stripe_head, 3780 batch_list); 3781 if (sh != head_sh) { 3782 dev = &sh->dev[i]; 3783 goto returnbi; 3784 } 3785 } 3786 sh = head_sh; 3787 dev = &sh->dev[i]; 3788 } else if (test_bit(R5_Discard, &dev->flags)) 3789 discard_pending = 1; 3790 } 3791 3792 log_stripe_write_finished(sh); 3793 3794 if (!discard_pending && 3795 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3796 int hash; 3797 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3798 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3799 if (sh->qd_idx >= 0) { 3800 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3801 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3802 } 3803 /* now that discard is done we can proceed with any sync */ 3804 clear_bit(STRIPE_DISCARD, &sh->state); 3805 /* 3806 * SCSI discard will change some bio fields and the stripe has 3807 * no updated data, so remove it from hash list and the stripe 3808 * will be reinitialized 3809 */ 3810 unhash: 3811 hash = sh->hash_lock_index; 3812 spin_lock_irq(conf->hash_locks + hash); 3813 remove_hash(sh); 3814 spin_unlock_irq(conf->hash_locks + hash); 3815 if (head_sh->batch_head) { 3816 sh = list_first_entry(&sh->batch_list, 3817 struct stripe_head, batch_list); 3818 if (sh != head_sh) 3819 goto unhash; 3820 } 3821 sh = head_sh; 3822 3823 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3824 set_bit(STRIPE_HANDLE, &sh->state); 3825 3826 } 3827 3828 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3829 if (atomic_dec_and_test(&conf->pending_full_writes)) 3830 md_wakeup_thread(conf->mddev->thread); 3831 3832 if (head_sh->batch_head && do_endio) 3833 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3834 } 3835 3836 /* 3837 * For RMW in write back cache, we need extra page in prexor to store the 3838 * old data. This page is stored in dev->orig_page. 3839 * 3840 * This function checks whether we have data for prexor. The exact logic 3841 * is: 3842 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3843 */ 3844 static inline bool uptodate_for_rmw(struct r5dev *dev) 3845 { 3846 return (test_bit(R5_UPTODATE, &dev->flags)) && 3847 (!test_bit(R5_InJournal, &dev->flags) || 3848 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3849 } 3850 3851 static int handle_stripe_dirtying(struct r5conf *conf, 3852 struct stripe_head *sh, 3853 struct stripe_head_state *s, 3854 int disks) 3855 { 3856 int rmw = 0, rcw = 0, i; 3857 sector_t recovery_cp = conf->mddev->recovery_cp; 3858 3859 /* Check whether resync is now happening or should start. 3860 * If yes, then the array is dirty (after unclean shutdown or 3861 * initial creation), so parity in some stripes might be inconsistent. 3862 * In this case, we need to always do reconstruct-write, to ensure 3863 * that in case of drive failure or read-error correction, we 3864 * generate correct data from the parity. 3865 */ 3866 if (conf->rmw_level == PARITY_DISABLE_RMW || 3867 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3868 s->failed == 0)) { 3869 /* Calculate the real rcw later - for now make it 3870 * look like rcw is cheaper 3871 */ 3872 rcw = 1; rmw = 2; 3873 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3874 conf->rmw_level, (unsigned long long)recovery_cp, 3875 (unsigned long long)sh->sector); 3876 } else for (i = disks; i--; ) { 3877 /* would I have to read this buffer for read_modify_write */ 3878 struct r5dev *dev = &sh->dev[i]; 3879 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3880 i == sh->pd_idx || i == sh->qd_idx || 3881 test_bit(R5_InJournal, &dev->flags)) && 3882 !test_bit(R5_LOCKED, &dev->flags) && 3883 !(uptodate_for_rmw(dev) || 3884 test_bit(R5_Wantcompute, &dev->flags))) { 3885 if (test_bit(R5_Insync, &dev->flags)) 3886 rmw++; 3887 else 3888 rmw += 2*disks; /* cannot read it */ 3889 } 3890 /* Would I have to read this buffer for reconstruct_write */ 3891 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3892 i != sh->pd_idx && i != sh->qd_idx && 3893 !test_bit(R5_LOCKED, &dev->flags) && 3894 !(test_bit(R5_UPTODATE, &dev->flags) || 3895 test_bit(R5_Wantcompute, &dev->flags))) { 3896 if (test_bit(R5_Insync, &dev->flags)) 3897 rcw++; 3898 else 3899 rcw += 2*disks; 3900 } 3901 } 3902 3903 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3904 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3905 set_bit(STRIPE_HANDLE, &sh->state); 3906 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3907 /* prefer read-modify-write, but need to get some data */ 3908 if (conf->mddev->queue) 3909 blk_add_trace_msg(conf->mddev->queue, 3910 "raid5 rmw %llu %d", 3911 (unsigned long long)sh->sector, rmw); 3912 for (i = disks; i--; ) { 3913 struct r5dev *dev = &sh->dev[i]; 3914 if (test_bit(R5_InJournal, &dev->flags) && 3915 dev->page == dev->orig_page && 3916 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3917 /* alloc page for prexor */ 3918 struct page *p = alloc_page(GFP_NOIO); 3919 3920 if (p) { 3921 dev->orig_page = p; 3922 continue; 3923 } 3924 3925 /* 3926 * alloc_page() failed, try use 3927 * disk_info->extra_page 3928 */ 3929 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3930 &conf->cache_state)) { 3931 r5c_use_extra_page(sh); 3932 break; 3933 } 3934 3935 /* extra_page in use, add to delayed_list */ 3936 set_bit(STRIPE_DELAYED, &sh->state); 3937 s->waiting_extra_page = 1; 3938 return -EAGAIN; 3939 } 3940 } 3941 3942 for (i = disks; i--; ) { 3943 struct r5dev *dev = &sh->dev[i]; 3944 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3945 i == sh->pd_idx || i == sh->qd_idx || 3946 test_bit(R5_InJournal, &dev->flags)) && 3947 !test_bit(R5_LOCKED, &dev->flags) && 3948 !(uptodate_for_rmw(dev) || 3949 test_bit(R5_Wantcompute, &dev->flags)) && 3950 test_bit(R5_Insync, &dev->flags)) { 3951 if (test_bit(STRIPE_PREREAD_ACTIVE, 3952 &sh->state)) { 3953 pr_debug("Read_old block %d for r-m-w\n", 3954 i); 3955 set_bit(R5_LOCKED, &dev->flags); 3956 set_bit(R5_Wantread, &dev->flags); 3957 s->locked++; 3958 } else { 3959 set_bit(STRIPE_DELAYED, &sh->state); 3960 set_bit(STRIPE_HANDLE, &sh->state); 3961 } 3962 } 3963 } 3964 } 3965 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3966 /* want reconstruct write, but need to get some data */ 3967 int qread =0; 3968 rcw = 0; 3969 for (i = disks; i--; ) { 3970 struct r5dev *dev = &sh->dev[i]; 3971 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3972 i != sh->pd_idx && i != sh->qd_idx && 3973 !test_bit(R5_LOCKED, &dev->flags) && 3974 !(test_bit(R5_UPTODATE, &dev->flags) || 3975 test_bit(R5_Wantcompute, &dev->flags))) { 3976 rcw++; 3977 if (test_bit(R5_Insync, &dev->flags) && 3978 test_bit(STRIPE_PREREAD_ACTIVE, 3979 &sh->state)) { 3980 pr_debug("Read_old block " 3981 "%d for Reconstruct\n", i); 3982 set_bit(R5_LOCKED, &dev->flags); 3983 set_bit(R5_Wantread, &dev->flags); 3984 s->locked++; 3985 qread++; 3986 } else { 3987 set_bit(STRIPE_DELAYED, &sh->state); 3988 set_bit(STRIPE_HANDLE, &sh->state); 3989 } 3990 } 3991 } 3992 if (rcw && conf->mddev->queue) 3993 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3994 (unsigned long long)sh->sector, 3995 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3996 } 3997 3998 if (rcw > disks && rmw > disks && 3999 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4000 set_bit(STRIPE_DELAYED, &sh->state); 4001 4002 /* now if nothing is locked, and if we have enough data, 4003 * we can start a write request 4004 */ 4005 /* since handle_stripe can be called at any time we need to handle the 4006 * case where a compute block operation has been submitted and then a 4007 * subsequent call wants to start a write request. raid_run_ops only 4008 * handles the case where compute block and reconstruct are requested 4009 * simultaneously. If this is not the case then new writes need to be 4010 * held off until the compute completes. 4011 */ 4012 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4013 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4014 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4015 schedule_reconstruction(sh, s, rcw == 0, 0); 4016 return 0; 4017 } 4018 4019 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4020 struct stripe_head_state *s, int disks) 4021 { 4022 struct r5dev *dev = NULL; 4023 4024 BUG_ON(sh->batch_head); 4025 set_bit(STRIPE_HANDLE, &sh->state); 4026 4027 switch (sh->check_state) { 4028 case check_state_idle: 4029 /* start a new check operation if there are no failures */ 4030 if (s->failed == 0) { 4031 BUG_ON(s->uptodate != disks); 4032 sh->check_state = check_state_run; 4033 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4034 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4035 s->uptodate--; 4036 break; 4037 } 4038 dev = &sh->dev[s->failed_num[0]]; 4039 /* fall through */ 4040 case check_state_compute_result: 4041 sh->check_state = check_state_idle; 4042 if (!dev) 4043 dev = &sh->dev[sh->pd_idx]; 4044 4045 /* check that a write has not made the stripe insync */ 4046 if (test_bit(STRIPE_INSYNC, &sh->state)) 4047 break; 4048 4049 /* either failed parity check, or recovery is happening */ 4050 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4051 BUG_ON(s->uptodate != disks); 4052 4053 set_bit(R5_LOCKED, &dev->flags); 4054 s->locked++; 4055 set_bit(R5_Wantwrite, &dev->flags); 4056 4057 clear_bit(STRIPE_DEGRADED, &sh->state); 4058 set_bit(STRIPE_INSYNC, &sh->state); 4059 break; 4060 case check_state_run: 4061 break; /* we will be called again upon completion */ 4062 case check_state_check_result: 4063 sh->check_state = check_state_idle; 4064 4065 /* if a failure occurred during the check operation, leave 4066 * STRIPE_INSYNC not set and let the stripe be handled again 4067 */ 4068 if (s->failed) 4069 break; 4070 4071 /* handle a successful check operation, if parity is correct 4072 * we are done. Otherwise update the mismatch count and repair 4073 * parity if !MD_RECOVERY_CHECK 4074 */ 4075 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4076 /* parity is correct (on disc, 4077 * not in buffer any more) 4078 */ 4079 set_bit(STRIPE_INSYNC, &sh->state); 4080 else { 4081 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4082 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4083 /* don't try to repair!! */ 4084 set_bit(STRIPE_INSYNC, &sh->state); 4085 pr_warn_ratelimited("%s: mismatch sector in range " 4086 "%llu-%llu\n", mdname(conf->mddev), 4087 (unsigned long long) sh->sector, 4088 (unsigned long long) sh->sector + 4089 STRIPE_SECTORS); 4090 } else { 4091 sh->check_state = check_state_compute_run; 4092 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4093 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4094 set_bit(R5_Wantcompute, 4095 &sh->dev[sh->pd_idx].flags); 4096 sh->ops.target = sh->pd_idx; 4097 sh->ops.target2 = -1; 4098 s->uptodate++; 4099 } 4100 } 4101 break; 4102 case check_state_compute_run: 4103 break; 4104 default: 4105 pr_err("%s: unknown check_state: %d sector: %llu\n", 4106 __func__, sh->check_state, 4107 (unsigned long long) sh->sector); 4108 BUG(); 4109 } 4110 } 4111 4112 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4113 struct stripe_head_state *s, 4114 int disks) 4115 { 4116 int pd_idx = sh->pd_idx; 4117 int qd_idx = sh->qd_idx; 4118 struct r5dev *dev; 4119 4120 BUG_ON(sh->batch_head); 4121 set_bit(STRIPE_HANDLE, &sh->state); 4122 4123 BUG_ON(s->failed > 2); 4124 4125 /* Want to check and possibly repair P and Q. 4126 * However there could be one 'failed' device, in which 4127 * case we can only check one of them, possibly using the 4128 * other to generate missing data 4129 */ 4130 4131 switch (sh->check_state) { 4132 case check_state_idle: 4133 /* start a new check operation if there are < 2 failures */ 4134 if (s->failed == s->q_failed) { 4135 /* The only possible failed device holds Q, so it 4136 * makes sense to check P (If anything else were failed, 4137 * we would have used P to recreate it). 4138 */ 4139 sh->check_state = check_state_run; 4140 } 4141 if (!s->q_failed && s->failed < 2) { 4142 /* Q is not failed, and we didn't use it to generate 4143 * anything, so it makes sense to check it 4144 */ 4145 if (sh->check_state == check_state_run) 4146 sh->check_state = check_state_run_pq; 4147 else 4148 sh->check_state = check_state_run_q; 4149 } 4150 4151 /* discard potentially stale zero_sum_result */ 4152 sh->ops.zero_sum_result = 0; 4153 4154 if (sh->check_state == check_state_run) { 4155 /* async_xor_zero_sum destroys the contents of P */ 4156 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4157 s->uptodate--; 4158 } 4159 if (sh->check_state >= check_state_run && 4160 sh->check_state <= check_state_run_pq) { 4161 /* async_syndrome_zero_sum preserves P and Q, so 4162 * no need to mark them !uptodate here 4163 */ 4164 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4165 break; 4166 } 4167 4168 /* we have 2-disk failure */ 4169 BUG_ON(s->failed != 2); 4170 /* fall through */ 4171 case check_state_compute_result: 4172 sh->check_state = check_state_idle; 4173 4174 /* check that a write has not made the stripe insync */ 4175 if (test_bit(STRIPE_INSYNC, &sh->state)) 4176 break; 4177 4178 /* now write out any block on a failed drive, 4179 * or P or Q if they were recomputed 4180 */ 4181 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4182 if (s->failed == 2) { 4183 dev = &sh->dev[s->failed_num[1]]; 4184 s->locked++; 4185 set_bit(R5_LOCKED, &dev->flags); 4186 set_bit(R5_Wantwrite, &dev->flags); 4187 } 4188 if (s->failed >= 1) { 4189 dev = &sh->dev[s->failed_num[0]]; 4190 s->locked++; 4191 set_bit(R5_LOCKED, &dev->flags); 4192 set_bit(R5_Wantwrite, &dev->flags); 4193 } 4194 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4195 dev = &sh->dev[pd_idx]; 4196 s->locked++; 4197 set_bit(R5_LOCKED, &dev->flags); 4198 set_bit(R5_Wantwrite, &dev->flags); 4199 } 4200 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4201 dev = &sh->dev[qd_idx]; 4202 s->locked++; 4203 set_bit(R5_LOCKED, &dev->flags); 4204 set_bit(R5_Wantwrite, &dev->flags); 4205 } 4206 clear_bit(STRIPE_DEGRADED, &sh->state); 4207 4208 set_bit(STRIPE_INSYNC, &sh->state); 4209 break; 4210 case check_state_run: 4211 case check_state_run_q: 4212 case check_state_run_pq: 4213 break; /* we will be called again upon completion */ 4214 case check_state_check_result: 4215 sh->check_state = check_state_idle; 4216 4217 /* handle a successful check operation, if parity is correct 4218 * we are done. Otherwise update the mismatch count and repair 4219 * parity if !MD_RECOVERY_CHECK 4220 */ 4221 if (sh->ops.zero_sum_result == 0) { 4222 /* both parities are correct */ 4223 if (!s->failed) 4224 set_bit(STRIPE_INSYNC, &sh->state); 4225 else { 4226 /* in contrast to the raid5 case we can validate 4227 * parity, but still have a failure to write 4228 * back 4229 */ 4230 sh->check_state = check_state_compute_result; 4231 /* Returning at this point means that we may go 4232 * off and bring p and/or q uptodate again so 4233 * we make sure to check zero_sum_result again 4234 * to verify if p or q need writeback 4235 */ 4236 } 4237 } else { 4238 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4239 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4240 /* don't try to repair!! */ 4241 set_bit(STRIPE_INSYNC, &sh->state); 4242 pr_warn_ratelimited("%s: mismatch sector in range " 4243 "%llu-%llu\n", mdname(conf->mddev), 4244 (unsigned long long) sh->sector, 4245 (unsigned long long) sh->sector + 4246 STRIPE_SECTORS); 4247 } else { 4248 int *target = &sh->ops.target; 4249 4250 sh->ops.target = -1; 4251 sh->ops.target2 = -1; 4252 sh->check_state = check_state_compute_run; 4253 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4254 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4255 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4256 set_bit(R5_Wantcompute, 4257 &sh->dev[pd_idx].flags); 4258 *target = pd_idx; 4259 target = &sh->ops.target2; 4260 s->uptodate++; 4261 } 4262 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4263 set_bit(R5_Wantcompute, 4264 &sh->dev[qd_idx].flags); 4265 *target = qd_idx; 4266 s->uptodate++; 4267 } 4268 } 4269 } 4270 break; 4271 case check_state_compute_run: 4272 break; 4273 default: 4274 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4275 __func__, sh->check_state, 4276 (unsigned long long) sh->sector); 4277 BUG(); 4278 } 4279 } 4280 4281 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4282 { 4283 int i; 4284 4285 /* We have read all the blocks in this stripe and now we need to 4286 * copy some of them into a target stripe for expand. 4287 */ 4288 struct dma_async_tx_descriptor *tx = NULL; 4289 BUG_ON(sh->batch_head); 4290 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4291 for (i = 0; i < sh->disks; i++) 4292 if (i != sh->pd_idx && i != sh->qd_idx) { 4293 int dd_idx, j; 4294 struct stripe_head *sh2; 4295 struct async_submit_ctl submit; 4296 4297 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4298 sector_t s = raid5_compute_sector(conf, bn, 0, 4299 &dd_idx, NULL); 4300 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4301 if (sh2 == NULL) 4302 /* so far only the early blocks of this stripe 4303 * have been requested. When later blocks 4304 * get requested, we will try again 4305 */ 4306 continue; 4307 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4308 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4309 /* must have already done this block */ 4310 raid5_release_stripe(sh2); 4311 continue; 4312 } 4313 4314 /* place all the copies on one channel */ 4315 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4316 tx = async_memcpy(sh2->dev[dd_idx].page, 4317 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4318 &submit); 4319 4320 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4321 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4322 for (j = 0; j < conf->raid_disks; j++) 4323 if (j != sh2->pd_idx && 4324 j != sh2->qd_idx && 4325 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4326 break; 4327 if (j == conf->raid_disks) { 4328 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4329 set_bit(STRIPE_HANDLE, &sh2->state); 4330 } 4331 raid5_release_stripe(sh2); 4332 4333 } 4334 /* done submitting copies, wait for them to complete */ 4335 async_tx_quiesce(&tx); 4336 } 4337 4338 /* 4339 * handle_stripe - do things to a stripe. 4340 * 4341 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4342 * state of various bits to see what needs to be done. 4343 * Possible results: 4344 * return some read requests which now have data 4345 * return some write requests which are safely on storage 4346 * schedule a read on some buffers 4347 * schedule a write of some buffers 4348 * return confirmation of parity correctness 4349 * 4350 */ 4351 4352 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4353 { 4354 struct r5conf *conf = sh->raid_conf; 4355 int disks = sh->disks; 4356 struct r5dev *dev; 4357 int i; 4358 int do_recovery = 0; 4359 4360 memset(s, 0, sizeof(*s)); 4361 4362 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4363 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4364 s->failed_num[0] = -1; 4365 s->failed_num[1] = -1; 4366 s->log_failed = r5l_log_disk_error(conf); 4367 4368 /* Now to look around and see what can be done */ 4369 rcu_read_lock(); 4370 for (i=disks; i--; ) { 4371 struct md_rdev *rdev; 4372 sector_t first_bad; 4373 int bad_sectors; 4374 int is_bad = 0; 4375 4376 dev = &sh->dev[i]; 4377 4378 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4379 i, dev->flags, 4380 dev->toread, dev->towrite, dev->written); 4381 /* maybe we can reply to a read 4382 * 4383 * new wantfill requests are only permitted while 4384 * ops_complete_biofill is guaranteed to be inactive 4385 */ 4386 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4387 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4388 set_bit(R5_Wantfill, &dev->flags); 4389 4390 /* now count some things */ 4391 if (test_bit(R5_LOCKED, &dev->flags)) 4392 s->locked++; 4393 if (test_bit(R5_UPTODATE, &dev->flags)) 4394 s->uptodate++; 4395 if (test_bit(R5_Wantcompute, &dev->flags)) { 4396 s->compute++; 4397 BUG_ON(s->compute > 2); 4398 } 4399 4400 if (test_bit(R5_Wantfill, &dev->flags)) 4401 s->to_fill++; 4402 else if (dev->toread) 4403 s->to_read++; 4404 if (dev->towrite) { 4405 s->to_write++; 4406 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4407 s->non_overwrite++; 4408 } 4409 if (dev->written) 4410 s->written++; 4411 /* Prefer to use the replacement for reads, but only 4412 * if it is recovered enough and has no bad blocks. 4413 */ 4414 rdev = rcu_dereference(conf->disks[i].replacement); 4415 if (rdev && !test_bit(Faulty, &rdev->flags) && 4416 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4417 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4418 &first_bad, &bad_sectors)) 4419 set_bit(R5_ReadRepl, &dev->flags); 4420 else { 4421 if (rdev && !test_bit(Faulty, &rdev->flags)) 4422 set_bit(R5_NeedReplace, &dev->flags); 4423 else 4424 clear_bit(R5_NeedReplace, &dev->flags); 4425 rdev = rcu_dereference(conf->disks[i].rdev); 4426 clear_bit(R5_ReadRepl, &dev->flags); 4427 } 4428 if (rdev && test_bit(Faulty, &rdev->flags)) 4429 rdev = NULL; 4430 if (rdev) { 4431 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4432 &first_bad, &bad_sectors); 4433 if (s->blocked_rdev == NULL 4434 && (test_bit(Blocked, &rdev->flags) 4435 || is_bad < 0)) { 4436 if (is_bad < 0) 4437 set_bit(BlockedBadBlocks, 4438 &rdev->flags); 4439 s->blocked_rdev = rdev; 4440 atomic_inc(&rdev->nr_pending); 4441 } 4442 } 4443 clear_bit(R5_Insync, &dev->flags); 4444 if (!rdev) 4445 /* Not in-sync */; 4446 else if (is_bad) { 4447 /* also not in-sync */ 4448 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4449 test_bit(R5_UPTODATE, &dev->flags)) { 4450 /* treat as in-sync, but with a read error 4451 * which we can now try to correct 4452 */ 4453 set_bit(R5_Insync, &dev->flags); 4454 set_bit(R5_ReadError, &dev->flags); 4455 } 4456 } else if (test_bit(In_sync, &rdev->flags)) 4457 set_bit(R5_Insync, &dev->flags); 4458 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4459 /* in sync if before recovery_offset */ 4460 set_bit(R5_Insync, &dev->flags); 4461 else if (test_bit(R5_UPTODATE, &dev->flags) && 4462 test_bit(R5_Expanded, &dev->flags)) 4463 /* If we've reshaped into here, we assume it is Insync. 4464 * We will shortly update recovery_offset to make 4465 * it official. 4466 */ 4467 set_bit(R5_Insync, &dev->flags); 4468 4469 if (test_bit(R5_WriteError, &dev->flags)) { 4470 /* This flag does not apply to '.replacement' 4471 * only to .rdev, so make sure to check that*/ 4472 struct md_rdev *rdev2 = rcu_dereference( 4473 conf->disks[i].rdev); 4474 if (rdev2 == rdev) 4475 clear_bit(R5_Insync, &dev->flags); 4476 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4477 s->handle_bad_blocks = 1; 4478 atomic_inc(&rdev2->nr_pending); 4479 } else 4480 clear_bit(R5_WriteError, &dev->flags); 4481 } 4482 if (test_bit(R5_MadeGood, &dev->flags)) { 4483 /* This flag does not apply to '.replacement' 4484 * only to .rdev, so make sure to check that*/ 4485 struct md_rdev *rdev2 = rcu_dereference( 4486 conf->disks[i].rdev); 4487 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4488 s->handle_bad_blocks = 1; 4489 atomic_inc(&rdev2->nr_pending); 4490 } else 4491 clear_bit(R5_MadeGood, &dev->flags); 4492 } 4493 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4494 struct md_rdev *rdev2 = rcu_dereference( 4495 conf->disks[i].replacement); 4496 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4497 s->handle_bad_blocks = 1; 4498 atomic_inc(&rdev2->nr_pending); 4499 } else 4500 clear_bit(R5_MadeGoodRepl, &dev->flags); 4501 } 4502 if (!test_bit(R5_Insync, &dev->flags)) { 4503 /* The ReadError flag will just be confusing now */ 4504 clear_bit(R5_ReadError, &dev->flags); 4505 clear_bit(R5_ReWrite, &dev->flags); 4506 } 4507 if (test_bit(R5_ReadError, &dev->flags)) 4508 clear_bit(R5_Insync, &dev->flags); 4509 if (!test_bit(R5_Insync, &dev->flags)) { 4510 if (s->failed < 2) 4511 s->failed_num[s->failed] = i; 4512 s->failed++; 4513 if (rdev && !test_bit(Faulty, &rdev->flags)) 4514 do_recovery = 1; 4515 } 4516 4517 if (test_bit(R5_InJournal, &dev->flags)) 4518 s->injournal++; 4519 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4520 s->just_cached++; 4521 } 4522 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4523 /* If there is a failed device being replaced, 4524 * we must be recovering. 4525 * else if we are after recovery_cp, we must be syncing 4526 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4527 * else we can only be replacing 4528 * sync and recovery both need to read all devices, and so 4529 * use the same flag. 4530 */ 4531 if (do_recovery || 4532 sh->sector >= conf->mddev->recovery_cp || 4533 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4534 s->syncing = 1; 4535 else 4536 s->replacing = 1; 4537 } 4538 rcu_read_unlock(); 4539 } 4540 4541 static int clear_batch_ready(struct stripe_head *sh) 4542 { 4543 /* Return '1' if this is a member of batch, or 4544 * '0' if it is a lone stripe or a head which can now be 4545 * handled. 4546 */ 4547 struct stripe_head *tmp; 4548 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4549 return (sh->batch_head && sh->batch_head != sh); 4550 spin_lock(&sh->stripe_lock); 4551 if (!sh->batch_head) { 4552 spin_unlock(&sh->stripe_lock); 4553 return 0; 4554 } 4555 4556 /* 4557 * this stripe could be added to a batch list before we check 4558 * BATCH_READY, skips it 4559 */ 4560 if (sh->batch_head != sh) { 4561 spin_unlock(&sh->stripe_lock); 4562 return 1; 4563 } 4564 spin_lock(&sh->batch_lock); 4565 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4566 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4567 spin_unlock(&sh->batch_lock); 4568 spin_unlock(&sh->stripe_lock); 4569 4570 /* 4571 * BATCH_READY is cleared, no new stripes can be added. 4572 * batch_list can be accessed without lock 4573 */ 4574 return 0; 4575 } 4576 4577 static void break_stripe_batch_list(struct stripe_head *head_sh, 4578 unsigned long handle_flags) 4579 { 4580 struct stripe_head *sh, *next; 4581 int i; 4582 int do_wakeup = 0; 4583 4584 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4585 4586 list_del_init(&sh->batch_list); 4587 4588 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4589 (1 << STRIPE_SYNCING) | 4590 (1 << STRIPE_REPLACED) | 4591 (1 << STRIPE_DELAYED) | 4592 (1 << STRIPE_BIT_DELAY) | 4593 (1 << STRIPE_FULL_WRITE) | 4594 (1 << STRIPE_BIOFILL_RUN) | 4595 (1 << STRIPE_COMPUTE_RUN) | 4596 (1 << STRIPE_OPS_REQ_PENDING) | 4597 (1 << STRIPE_DISCARD) | 4598 (1 << STRIPE_BATCH_READY) | 4599 (1 << STRIPE_BATCH_ERR) | 4600 (1 << STRIPE_BITMAP_PENDING)), 4601 "stripe state: %lx\n", sh->state); 4602 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4603 (1 << STRIPE_REPLACED)), 4604 "head stripe state: %lx\n", head_sh->state); 4605 4606 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4607 (1 << STRIPE_PREREAD_ACTIVE) | 4608 (1 << STRIPE_DEGRADED) | 4609 (1 << STRIPE_ON_UNPLUG_LIST)), 4610 head_sh->state & (1 << STRIPE_INSYNC)); 4611 4612 sh->check_state = head_sh->check_state; 4613 sh->reconstruct_state = head_sh->reconstruct_state; 4614 for (i = 0; i < sh->disks; i++) { 4615 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4616 do_wakeup = 1; 4617 sh->dev[i].flags = head_sh->dev[i].flags & 4618 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4619 } 4620 spin_lock_irq(&sh->stripe_lock); 4621 sh->batch_head = NULL; 4622 spin_unlock_irq(&sh->stripe_lock); 4623 if (handle_flags == 0 || 4624 sh->state & handle_flags) 4625 set_bit(STRIPE_HANDLE, &sh->state); 4626 raid5_release_stripe(sh); 4627 } 4628 spin_lock_irq(&head_sh->stripe_lock); 4629 head_sh->batch_head = NULL; 4630 spin_unlock_irq(&head_sh->stripe_lock); 4631 for (i = 0; i < head_sh->disks; i++) 4632 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4633 do_wakeup = 1; 4634 if (head_sh->state & handle_flags) 4635 set_bit(STRIPE_HANDLE, &head_sh->state); 4636 4637 if (do_wakeup) 4638 wake_up(&head_sh->raid_conf->wait_for_overlap); 4639 } 4640 4641 static void handle_stripe(struct stripe_head *sh) 4642 { 4643 struct stripe_head_state s; 4644 struct r5conf *conf = sh->raid_conf; 4645 int i; 4646 int prexor; 4647 int disks = sh->disks; 4648 struct r5dev *pdev, *qdev; 4649 4650 clear_bit(STRIPE_HANDLE, &sh->state); 4651 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4652 /* already being handled, ensure it gets handled 4653 * again when current action finishes */ 4654 set_bit(STRIPE_HANDLE, &sh->state); 4655 return; 4656 } 4657 4658 if (clear_batch_ready(sh) ) { 4659 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4660 return; 4661 } 4662 4663 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4664 break_stripe_batch_list(sh, 0); 4665 4666 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4667 spin_lock(&sh->stripe_lock); 4668 /* 4669 * Cannot process 'sync' concurrently with 'discard'. 4670 * Flush data in r5cache before 'sync'. 4671 */ 4672 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 4673 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && 4674 !test_bit(STRIPE_DISCARD, &sh->state) && 4675 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4676 set_bit(STRIPE_SYNCING, &sh->state); 4677 clear_bit(STRIPE_INSYNC, &sh->state); 4678 clear_bit(STRIPE_REPLACED, &sh->state); 4679 } 4680 spin_unlock(&sh->stripe_lock); 4681 } 4682 clear_bit(STRIPE_DELAYED, &sh->state); 4683 4684 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4685 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 4686 (unsigned long long)sh->sector, sh->state, 4687 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4688 sh->check_state, sh->reconstruct_state); 4689 4690 analyse_stripe(sh, &s); 4691 4692 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4693 goto finish; 4694 4695 if (s.handle_bad_blocks || 4696 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4697 set_bit(STRIPE_HANDLE, &sh->state); 4698 goto finish; 4699 } 4700 4701 if (unlikely(s.blocked_rdev)) { 4702 if (s.syncing || s.expanding || s.expanded || 4703 s.replacing || s.to_write || s.written) { 4704 set_bit(STRIPE_HANDLE, &sh->state); 4705 goto finish; 4706 } 4707 /* There is nothing for the blocked_rdev to block */ 4708 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4709 s.blocked_rdev = NULL; 4710 } 4711 4712 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4713 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4714 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4715 } 4716 4717 pr_debug("locked=%d uptodate=%d to_read=%d" 4718 " to_write=%d failed=%d failed_num=%d,%d\n", 4719 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4720 s.failed_num[0], s.failed_num[1]); 4721 /* 4722 * check if the array has lost more than max_degraded devices and, 4723 * if so, some requests might need to be failed. 4724 * 4725 * When journal device failed (log_failed), we will only process 4726 * the stripe if there is data need write to raid disks 4727 */ 4728 if (s.failed > conf->max_degraded || 4729 (s.log_failed && s.injournal == 0)) { 4730 sh->check_state = 0; 4731 sh->reconstruct_state = 0; 4732 break_stripe_batch_list(sh, 0); 4733 if (s.to_read+s.to_write+s.written) 4734 handle_failed_stripe(conf, sh, &s, disks); 4735 if (s.syncing + s.replacing) 4736 handle_failed_sync(conf, sh, &s); 4737 } 4738 4739 /* Now we check to see if any write operations have recently 4740 * completed 4741 */ 4742 prexor = 0; 4743 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4744 prexor = 1; 4745 if (sh->reconstruct_state == reconstruct_state_drain_result || 4746 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4747 sh->reconstruct_state = reconstruct_state_idle; 4748 4749 /* All the 'written' buffers and the parity block are ready to 4750 * be written back to disk 4751 */ 4752 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4753 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4754 BUG_ON(sh->qd_idx >= 0 && 4755 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4756 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4757 for (i = disks; i--; ) { 4758 struct r5dev *dev = &sh->dev[i]; 4759 if (test_bit(R5_LOCKED, &dev->flags) && 4760 (i == sh->pd_idx || i == sh->qd_idx || 4761 dev->written || test_bit(R5_InJournal, 4762 &dev->flags))) { 4763 pr_debug("Writing block %d\n", i); 4764 set_bit(R5_Wantwrite, &dev->flags); 4765 if (prexor) 4766 continue; 4767 if (s.failed > 1) 4768 continue; 4769 if (!test_bit(R5_Insync, &dev->flags) || 4770 ((i == sh->pd_idx || i == sh->qd_idx) && 4771 s.failed == 0)) 4772 set_bit(STRIPE_INSYNC, &sh->state); 4773 } 4774 } 4775 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4776 s.dec_preread_active = 1; 4777 } 4778 4779 /* 4780 * might be able to return some write requests if the parity blocks 4781 * are safe, or on a failed drive 4782 */ 4783 pdev = &sh->dev[sh->pd_idx]; 4784 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4785 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4786 qdev = &sh->dev[sh->qd_idx]; 4787 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4788 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4789 || conf->level < 6; 4790 4791 if (s.written && 4792 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4793 && !test_bit(R5_LOCKED, &pdev->flags) 4794 && (test_bit(R5_UPTODATE, &pdev->flags) || 4795 test_bit(R5_Discard, &pdev->flags))))) && 4796 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4797 && !test_bit(R5_LOCKED, &qdev->flags) 4798 && (test_bit(R5_UPTODATE, &qdev->flags) || 4799 test_bit(R5_Discard, &qdev->flags)))))) 4800 handle_stripe_clean_event(conf, sh, disks); 4801 4802 if (s.just_cached) 4803 r5c_handle_cached_data_endio(conf, sh, disks); 4804 log_stripe_write_finished(sh); 4805 4806 /* Now we might consider reading some blocks, either to check/generate 4807 * parity, or to satisfy requests 4808 * or to load a block that is being partially written. 4809 */ 4810 if (s.to_read || s.non_overwrite 4811 || (conf->level == 6 && s.to_write && s.failed) 4812 || (s.syncing && (s.uptodate + s.compute < disks)) 4813 || s.replacing 4814 || s.expanding) 4815 handle_stripe_fill(sh, &s, disks); 4816 4817 /* 4818 * When the stripe finishes full journal write cycle (write to journal 4819 * and raid disk), this is the clean up procedure so it is ready for 4820 * next operation. 4821 */ 4822 r5c_finish_stripe_write_out(conf, sh, &s); 4823 4824 /* 4825 * Now to consider new write requests, cache write back and what else, 4826 * if anything should be read. We do not handle new writes when: 4827 * 1/ A 'write' operation (copy+xor) is already in flight. 4828 * 2/ A 'check' operation is in flight, as it may clobber the parity 4829 * block. 4830 * 3/ A r5c cache log write is in flight. 4831 */ 4832 4833 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4834 if (!r5c_is_writeback(conf->log)) { 4835 if (s.to_write) 4836 handle_stripe_dirtying(conf, sh, &s, disks); 4837 } else { /* write back cache */ 4838 int ret = 0; 4839 4840 /* First, try handle writes in caching phase */ 4841 if (s.to_write) 4842 ret = r5c_try_caching_write(conf, sh, &s, 4843 disks); 4844 /* 4845 * If caching phase failed: ret == -EAGAIN 4846 * OR 4847 * stripe under reclaim: !caching && injournal 4848 * 4849 * fall back to handle_stripe_dirtying() 4850 */ 4851 if (ret == -EAGAIN || 4852 /* stripe under reclaim: !caching && injournal */ 4853 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4854 s.injournal > 0)) { 4855 ret = handle_stripe_dirtying(conf, sh, &s, 4856 disks); 4857 if (ret == -EAGAIN) 4858 goto finish; 4859 } 4860 } 4861 } 4862 4863 /* maybe we need to check and possibly fix the parity for this stripe 4864 * Any reads will already have been scheduled, so we just see if enough 4865 * data is available. The parity check is held off while parity 4866 * dependent operations are in flight. 4867 */ 4868 if (sh->check_state || 4869 (s.syncing && s.locked == 0 && 4870 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4871 !test_bit(STRIPE_INSYNC, &sh->state))) { 4872 if (conf->level == 6) 4873 handle_parity_checks6(conf, sh, &s, disks); 4874 else 4875 handle_parity_checks5(conf, sh, &s, disks); 4876 } 4877 4878 if ((s.replacing || s.syncing) && s.locked == 0 4879 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4880 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4881 /* Write out to replacement devices where possible */ 4882 for (i = 0; i < conf->raid_disks; i++) 4883 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4884 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4885 set_bit(R5_WantReplace, &sh->dev[i].flags); 4886 set_bit(R5_LOCKED, &sh->dev[i].flags); 4887 s.locked++; 4888 } 4889 if (s.replacing) 4890 set_bit(STRIPE_INSYNC, &sh->state); 4891 set_bit(STRIPE_REPLACED, &sh->state); 4892 } 4893 if ((s.syncing || s.replacing) && s.locked == 0 && 4894 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4895 test_bit(STRIPE_INSYNC, &sh->state)) { 4896 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4897 clear_bit(STRIPE_SYNCING, &sh->state); 4898 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4899 wake_up(&conf->wait_for_overlap); 4900 } 4901 4902 /* If the failed drives are just a ReadError, then we might need 4903 * to progress the repair/check process 4904 */ 4905 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4906 for (i = 0; i < s.failed; i++) { 4907 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4908 if (test_bit(R5_ReadError, &dev->flags) 4909 && !test_bit(R5_LOCKED, &dev->flags) 4910 && test_bit(R5_UPTODATE, &dev->flags) 4911 ) { 4912 if (!test_bit(R5_ReWrite, &dev->flags)) { 4913 set_bit(R5_Wantwrite, &dev->flags); 4914 set_bit(R5_ReWrite, &dev->flags); 4915 set_bit(R5_LOCKED, &dev->flags); 4916 s.locked++; 4917 } else { 4918 /* let's read it back */ 4919 set_bit(R5_Wantread, &dev->flags); 4920 set_bit(R5_LOCKED, &dev->flags); 4921 s.locked++; 4922 } 4923 } 4924 } 4925 4926 /* Finish reconstruct operations initiated by the expansion process */ 4927 if (sh->reconstruct_state == reconstruct_state_result) { 4928 struct stripe_head *sh_src 4929 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4930 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4931 /* sh cannot be written until sh_src has been read. 4932 * so arrange for sh to be delayed a little 4933 */ 4934 set_bit(STRIPE_DELAYED, &sh->state); 4935 set_bit(STRIPE_HANDLE, &sh->state); 4936 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4937 &sh_src->state)) 4938 atomic_inc(&conf->preread_active_stripes); 4939 raid5_release_stripe(sh_src); 4940 goto finish; 4941 } 4942 if (sh_src) 4943 raid5_release_stripe(sh_src); 4944 4945 sh->reconstruct_state = reconstruct_state_idle; 4946 clear_bit(STRIPE_EXPANDING, &sh->state); 4947 for (i = conf->raid_disks; i--; ) { 4948 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4949 set_bit(R5_LOCKED, &sh->dev[i].flags); 4950 s.locked++; 4951 } 4952 } 4953 4954 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4955 !sh->reconstruct_state) { 4956 /* Need to write out all blocks after computing parity */ 4957 sh->disks = conf->raid_disks; 4958 stripe_set_idx(sh->sector, conf, 0, sh); 4959 schedule_reconstruction(sh, &s, 1, 1); 4960 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4961 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4962 atomic_dec(&conf->reshape_stripes); 4963 wake_up(&conf->wait_for_overlap); 4964 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4965 } 4966 4967 if (s.expanding && s.locked == 0 && 4968 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4969 handle_stripe_expansion(conf, sh); 4970 4971 finish: 4972 /* wait for this device to become unblocked */ 4973 if (unlikely(s.blocked_rdev)) { 4974 if (conf->mddev->external) 4975 md_wait_for_blocked_rdev(s.blocked_rdev, 4976 conf->mddev); 4977 else 4978 /* Internal metadata will immediately 4979 * be written by raid5d, so we don't 4980 * need to wait here. 4981 */ 4982 rdev_dec_pending(s.blocked_rdev, 4983 conf->mddev); 4984 } 4985 4986 if (s.handle_bad_blocks) 4987 for (i = disks; i--; ) { 4988 struct md_rdev *rdev; 4989 struct r5dev *dev = &sh->dev[i]; 4990 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4991 /* We own a safe reference to the rdev */ 4992 rdev = conf->disks[i].rdev; 4993 if (!rdev_set_badblocks(rdev, sh->sector, 4994 STRIPE_SECTORS, 0)) 4995 md_error(conf->mddev, rdev); 4996 rdev_dec_pending(rdev, conf->mddev); 4997 } 4998 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4999 rdev = conf->disks[i].rdev; 5000 rdev_clear_badblocks(rdev, sh->sector, 5001 STRIPE_SECTORS, 0); 5002 rdev_dec_pending(rdev, conf->mddev); 5003 } 5004 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 5005 rdev = conf->disks[i].replacement; 5006 if (!rdev) 5007 /* rdev have been moved down */ 5008 rdev = conf->disks[i].rdev; 5009 rdev_clear_badblocks(rdev, sh->sector, 5010 STRIPE_SECTORS, 0); 5011 rdev_dec_pending(rdev, conf->mddev); 5012 } 5013 } 5014 5015 if (s.ops_request) 5016 raid_run_ops(sh, s.ops_request); 5017 5018 ops_run_io(sh, &s); 5019 5020 if (s.dec_preread_active) { 5021 /* We delay this until after ops_run_io so that if make_request 5022 * is waiting on a flush, it won't continue until the writes 5023 * have actually been submitted. 5024 */ 5025 atomic_dec(&conf->preread_active_stripes); 5026 if (atomic_read(&conf->preread_active_stripes) < 5027 IO_THRESHOLD) 5028 md_wakeup_thread(conf->mddev->thread); 5029 } 5030 5031 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5032 } 5033 5034 static void raid5_activate_delayed(struct r5conf *conf) 5035 { 5036 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5037 while (!list_empty(&conf->delayed_list)) { 5038 struct list_head *l = conf->delayed_list.next; 5039 struct stripe_head *sh; 5040 sh = list_entry(l, struct stripe_head, lru); 5041 list_del_init(l); 5042 clear_bit(STRIPE_DELAYED, &sh->state); 5043 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5044 atomic_inc(&conf->preread_active_stripes); 5045 list_add_tail(&sh->lru, &conf->hold_list); 5046 raid5_wakeup_stripe_thread(sh); 5047 } 5048 } 5049 } 5050 5051 static void activate_bit_delay(struct r5conf *conf, 5052 struct list_head *temp_inactive_list) 5053 { 5054 /* device_lock is held */ 5055 struct list_head head; 5056 list_add(&head, &conf->bitmap_list); 5057 list_del_init(&conf->bitmap_list); 5058 while (!list_empty(&head)) { 5059 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5060 int hash; 5061 list_del_init(&sh->lru); 5062 atomic_inc(&sh->count); 5063 hash = sh->hash_lock_index; 5064 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5065 } 5066 } 5067 5068 static int raid5_congested(struct mddev *mddev, int bits) 5069 { 5070 struct r5conf *conf = mddev->private; 5071 5072 /* No difference between reads and writes. Just check 5073 * how busy the stripe_cache is 5074 */ 5075 5076 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 5077 return 1; 5078 5079 /* Also checks whether there is pressure on r5cache log space */ 5080 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 5081 return 1; 5082 if (conf->quiesce) 5083 return 1; 5084 if (atomic_read(&conf->empty_inactive_list_nr)) 5085 return 1; 5086 5087 return 0; 5088 } 5089 5090 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5091 { 5092 struct r5conf *conf = mddev->private; 5093 sector_t sector = bio->bi_iter.bi_sector; 5094 unsigned int chunk_sectors; 5095 unsigned int bio_sectors = bio_sectors(bio); 5096 5097 WARN_ON_ONCE(bio->bi_partno); 5098 5099 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5100 return chunk_sectors >= 5101 ((sector & (chunk_sectors - 1)) + bio_sectors); 5102 } 5103 5104 /* 5105 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5106 * later sampled by raid5d. 5107 */ 5108 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5109 { 5110 unsigned long flags; 5111 5112 spin_lock_irqsave(&conf->device_lock, flags); 5113 5114 bi->bi_next = conf->retry_read_aligned_list; 5115 conf->retry_read_aligned_list = bi; 5116 5117 spin_unlock_irqrestore(&conf->device_lock, flags); 5118 md_wakeup_thread(conf->mddev->thread); 5119 } 5120 5121 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5122 unsigned int *offset) 5123 { 5124 struct bio *bi; 5125 5126 bi = conf->retry_read_aligned; 5127 if (bi) { 5128 *offset = conf->retry_read_offset; 5129 conf->retry_read_aligned = NULL; 5130 return bi; 5131 } 5132 bi = conf->retry_read_aligned_list; 5133 if(bi) { 5134 conf->retry_read_aligned_list = bi->bi_next; 5135 bi->bi_next = NULL; 5136 *offset = 0; 5137 } 5138 5139 return bi; 5140 } 5141 5142 /* 5143 * The "raid5_align_endio" should check if the read succeeded and if it 5144 * did, call bio_endio on the original bio (having bio_put the new bio 5145 * first). 5146 * If the read failed.. 5147 */ 5148 static void raid5_align_endio(struct bio *bi) 5149 { 5150 struct bio* raid_bi = bi->bi_private; 5151 struct mddev *mddev; 5152 struct r5conf *conf; 5153 struct md_rdev *rdev; 5154 blk_status_t error = bi->bi_status; 5155 5156 bio_put(bi); 5157 5158 rdev = (void*)raid_bi->bi_next; 5159 raid_bi->bi_next = NULL; 5160 mddev = rdev->mddev; 5161 conf = mddev->private; 5162 5163 rdev_dec_pending(rdev, conf->mddev); 5164 5165 if (!error) { 5166 bio_endio(raid_bi); 5167 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5168 wake_up(&conf->wait_for_quiescent); 5169 return; 5170 } 5171 5172 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5173 5174 add_bio_to_retry(raid_bi, conf); 5175 } 5176 5177 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5178 { 5179 struct r5conf *conf = mddev->private; 5180 int dd_idx; 5181 struct bio* align_bi; 5182 struct md_rdev *rdev; 5183 sector_t end_sector; 5184 5185 if (!in_chunk_boundary(mddev, raid_bio)) { 5186 pr_debug("%s: non aligned\n", __func__); 5187 return 0; 5188 } 5189 /* 5190 * use bio_clone_fast to make a copy of the bio 5191 */ 5192 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); 5193 if (!align_bi) 5194 return 0; 5195 /* 5196 * set bi_end_io to a new function, and set bi_private to the 5197 * original bio. 5198 */ 5199 align_bi->bi_end_io = raid5_align_endio; 5200 align_bi->bi_private = raid_bio; 5201 /* 5202 * compute position 5203 */ 5204 align_bi->bi_iter.bi_sector = 5205 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5206 0, &dd_idx, NULL); 5207 5208 end_sector = bio_end_sector(align_bi); 5209 rcu_read_lock(); 5210 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5211 if (!rdev || test_bit(Faulty, &rdev->flags) || 5212 rdev->recovery_offset < end_sector) { 5213 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5214 if (rdev && 5215 (test_bit(Faulty, &rdev->flags) || 5216 !(test_bit(In_sync, &rdev->flags) || 5217 rdev->recovery_offset >= end_sector))) 5218 rdev = NULL; 5219 } 5220 5221 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5222 rcu_read_unlock(); 5223 bio_put(align_bi); 5224 return 0; 5225 } 5226 5227 if (rdev) { 5228 sector_t first_bad; 5229 int bad_sectors; 5230 5231 atomic_inc(&rdev->nr_pending); 5232 rcu_read_unlock(); 5233 raid_bio->bi_next = (void*)rdev; 5234 bio_set_dev(align_bi, rdev->bdev); 5235 bio_clear_flag(align_bi, BIO_SEG_VALID); 5236 5237 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5238 bio_sectors(align_bi), 5239 &first_bad, &bad_sectors)) { 5240 bio_put(align_bi); 5241 rdev_dec_pending(rdev, mddev); 5242 return 0; 5243 } 5244 5245 /* No reshape active, so we can trust rdev->data_offset */ 5246 align_bi->bi_iter.bi_sector += rdev->data_offset; 5247 5248 spin_lock_irq(&conf->device_lock); 5249 wait_event_lock_irq(conf->wait_for_quiescent, 5250 conf->quiesce == 0, 5251 conf->device_lock); 5252 atomic_inc(&conf->active_aligned_reads); 5253 spin_unlock_irq(&conf->device_lock); 5254 5255 if (mddev->gendisk) 5256 trace_block_bio_remap(align_bi->bi_disk->queue, 5257 align_bi, disk_devt(mddev->gendisk), 5258 raid_bio->bi_iter.bi_sector); 5259 generic_make_request(align_bi); 5260 return 1; 5261 } else { 5262 rcu_read_unlock(); 5263 bio_put(align_bi); 5264 return 0; 5265 } 5266 } 5267 5268 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5269 { 5270 struct bio *split; 5271 sector_t sector = raid_bio->bi_iter.bi_sector; 5272 unsigned chunk_sects = mddev->chunk_sectors; 5273 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5274 5275 if (sectors < bio_sectors(raid_bio)) { 5276 struct r5conf *conf = mddev->private; 5277 split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); 5278 bio_chain(split, raid_bio); 5279 generic_make_request(raid_bio); 5280 raid_bio = split; 5281 } 5282 5283 if (!raid5_read_one_chunk(mddev, raid_bio)) 5284 return raid_bio; 5285 5286 return NULL; 5287 } 5288 5289 /* __get_priority_stripe - get the next stripe to process 5290 * 5291 * Full stripe writes are allowed to pass preread active stripes up until 5292 * the bypass_threshold is exceeded. In general the bypass_count 5293 * increments when the handle_list is handled before the hold_list; however, it 5294 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5295 * stripe with in flight i/o. The bypass_count will be reset when the 5296 * head of the hold_list has changed, i.e. the head was promoted to the 5297 * handle_list. 5298 */ 5299 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5300 { 5301 struct stripe_head *sh, *tmp; 5302 struct list_head *handle_list = NULL; 5303 struct r5worker_group *wg; 5304 bool second_try = !r5c_is_writeback(conf->log) && 5305 !r5l_log_disk_error(conf); 5306 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || 5307 r5l_log_disk_error(conf); 5308 5309 again: 5310 wg = NULL; 5311 sh = NULL; 5312 if (conf->worker_cnt_per_group == 0) { 5313 handle_list = try_loprio ? &conf->loprio_list : 5314 &conf->handle_list; 5315 } else if (group != ANY_GROUP) { 5316 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5317 &conf->worker_groups[group].handle_list; 5318 wg = &conf->worker_groups[group]; 5319 } else { 5320 int i; 5321 for (i = 0; i < conf->group_cnt; i++) { 5322 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5323 &conf->worker_groups[i].handle_list; 5324 wg = &conf->worker_groups[i]; 5325 if (!list_empty(handle_list)) 5326 break; 5327 } 5328 } 5329 5330 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5331 __func__, 5332 list_empty(handle_list) ? "empty" : "busy", 5333 list_empty(&conf->hold_list) ? "empty" : "busy", 5334 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5335 5336 if (!list_empty(handle_list)) { 5337 sh = list_entry(handle_list->next, typeof(*sh), lru); 5338 5339 if (list_empty(&conf->hold_list)) 5340 conf->bypass_count = 0; 5341 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5342 if (conf->hold_list.next == conf->last_hold) 5343 conf->bypass_count++; 5344 else { 5345 conf->last_hold = conf->hold_list.next; 5346 conf->bypass_count -= conf->bypass_threshold; 5347 if (conf->bypass_count < 0) 5348 conf->bypass_count = 0; 5349 } 5350 } 5351 } else if (!list_empty(&conf->hold_list) && 5352 ((conf->bypass_threshold && 5353 conf->bypass_count > conf->bypass_threshold) || 5354 atomic_read(&conf->pending_full_writes) == 0)) { 5355 5356 list_for_each_entry(tmp, &conf->hold_list, lru) { 5357 if (conf->worker_cnt_per_group == 0 || 5358 group == ANY_GROUP || 5359 !cpu_online(tmp->cpu) || 5360 cpu_to_group(tmp->cpu) == group) { 5361 sh = tmp; 5362 break; 5363 } 5364 } 5365 5366 if (sh) { 5367 conf->bypass_count -= conf->bypass_threshold; 5368 if (conf->bypass_count < 0) 5369 conf->bypass_count = 0; 5370 } 5371 wg = NULL; 5372 } 5373 5374 if (!sh) { 5375 if (second_try) 5376 return NULL; 5377 second_try = true; 5378 try_loprio = !try_loprio; 5379 goto again; 5380 } 5381 5382 if (wg) { 5383 wg->stripes_cnt--; 5384 sh->group = NULL; 5385 } 5386 list_del_init(&sh->lru); 5387 BUG_ON(atomic_inc_return(&sh->count) != 1); 5388 return sh; 5389 } 5390 5391 struct raid5_plug_cb { 5392 struct blk_plug_cb cb; 5393 struct list_head list; 5394 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5395 }; 5396 5397 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5398 { 5399 struct raid5_plug_cb *cb = container_of( 5400 blk_cb, struct raid5_plug_cb, cb); 5401 struct stripe_head *sh; 5402 struct mddev *mddev = cb->cb.data; 5403 struct r5conf *conf = mddev->private; 5404 int cnt = 0; 5405 int hash; 5406 5407 if (cb->list.next && !list_empty(&cb->list)) { 5408 spin_lock_irq(&conf->device_lock); 5409 while (!list_empty(&cb->list)) { 5410 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5411 list_del_init(&sh->lru); 5412 /* 5413 * avoid race release_stripe_plug() sees 5414 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5415 * is still in our list 5416 */ 5417 smp_mb__before_atomic(); 5418 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5419 /* 5420 * STRIPE_ON_RELEASE_LIST could be set here. In that 5421 * case, the count is always > 1 here 5422 */ 5423 hash = sh->hash_lock_index; 5424 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5425 cnt++; 5426 } 5427 spin_unlock_irq(&conf->device_lock); 5428 } 5429 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5430 NR_STRIPE_HASH_LOCKS); 5431 if (mddev->queue) 5432 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5433 kfree(cb); 5434 } 5435 5436 static void release_stripe_plug(struct mddev *mddev, 5437 struct stripe_head *sh) 5438 { 5439 struct blk_plug_cb *blk_cb = blk_check_plugged( 5440 raid5_unplug, mddev, 5441 sizeof(struct raid5_plug_cb)); 5442 struct raid5_plug_cb *cb; 5443 5444 if (!blk_cb) { 5445 raid5_release_stripe(sh); 5446 return; 5447 } 5448 5449 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5450 5451 if (cb->list.next == NULL) { 5452 int i; 5453 INIT_LIST_HEAD(&cb->list); 5454 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5455 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5456 } 5457 5458 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5459 list_add_tail(&sh->lru, &cb->list); 5460 else 5461 raid5_release_stripe(sh); 5462 } 5463 5464 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5465 { 5466 struct r5conf *conf = mddev->private; 5467 sector_t logical_sector, last_sector; 5468 struct stripe_head *sh; 5469 int stripe_sectors; 5470 5471 if (mddev->reshape_position != MaxSector) 5472 /* Skip discard while reshape is happening */ 5473 return; 5474 5475 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5476 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5477 5478 bi->bi_next = NULL; 5479 5480 stripe_sectors = conf->chunk_sectors * 5481 (conf->raid_disks - conf->max_degraded); 5482 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5483 stripe_sectors); 5484 sector_div(last_sector, stripe_sectors); 5485 5486 logical_sector *= conf->chunk_sectors; 5487 last_sector *= conf->chunk_sectors; 5488 5489 for (; logical_sector < last_sector; 5490 logical_sector += STRIPE_SECTORS) { 5491 DEFINE_WAIT(w); 5492 int d; 5493 again: 5494 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5495 prepare_to_wait(&conf->wait_for_overlap, &w, 5496 TASK_UNINTERRUPTIBLE); 5497 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5498 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5499 raid5_release_stripe(sh); 5500 schedule(); 5501 goto again; 5502 } 5503 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5504 spin_lock_irq(&sh->stripe_lock); 5505 for (d = 0; d < conf->raid_disks; d++) { 5506 if (d == sh->pd_idx || d == sh->qd_idx) 5507 continue; 5508 if (sh->dev[d].towrite || sh->dev[d].toread) { 5509 set_bit(R5_Overlap, &sh->dev[d].flags); 5510 spin_unlock_irq(&sh->stripe_lock); 5511 raid5_release_stripe(sh); 5512 schedule(); 5513 goto again; 5514 } 5515 } 5516 set_bit(STRIPE_DISCARD, &sh->state); 5517 finish_wait(&conf->wait_for_overlap, &w); 5518 sh->overwrite_disks = 0; 5519 for (d = 0; d < conf->raid_disks; d++) { 5520 if (d == sh->pd_idx || d == sh->qd_idx) 5521 continue; 5522 sh->dev[d].towrite = bi; 5523 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5524 bio_inc_remaining(bi); 5525 md_write_inc(mddev, bi); 5526 sh->overwrite_disks++; 5527 } 5528 spin_unlock_irq(&sh->stripe_lock); 5529 if (conf->mddev->bitmap) { 5530 for (d = 0; 5531 d < conf->raid_disks - conf->max_degraded; 5532 d++) 5533 bitmap_startwrite(mddev->bitmap, 5534 sh->sector, 5535 STRIPE_SECTORS, 5536 0); 5537 sh->bm_seq = conf->seq_flush + 1; 5538 set_bit(STRIPE_BIT_DELAY, &sh->state); 5539 } 5540 5541 set_bit(STRIPE_HANDLE, &sh->state); 5542 clear_bit(STRIPE_DELAYED, &sh->state); 5543 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5544 atomic_inc(&conf->preread_active_stripes); 5545 release_stripe_plug(mddev, sh); 5546 } 5547 5548 bio_endio(bi); 5549 } 5550 5551 static bool raid5_make_request(struct mddev *mddev, struct bio * bi) 5552 { 5553 struct r5conf *conf = mddev->private; 5554 int dd_idx; 5555 sector_t new_sector; 5556 sector_t logical_sector, last_sector; 5557 struct stripe_head *sh; 5558 const int rw = bio_data_dir(bi); 5559 DEFINE_WAIT(w); 5560 bool do_prepare; 5561 bool do_flush = false; 5562 5563 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5564 int ret = r5l_handle_flush_request(conf->log, bi); 5565 5566 if (ret == 0) 5567 return true; 5568 if (ret == -ENODEV) { 5569 md_flush_request(mddev, bi); 5570 return true; 5571 } 5572 /* ret == -EAGAIN, fallback */ 5573 /* 5574 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5575 * we need to flush journal device 5576 */ 5577 do_flush = bi->bi_opf & REQ_PREFLUSH; 5578 } 5579 5580 if (!md_write_start(mddev, bi)) 5581 return false; 5582 /* 5583 * If array is degraded, better not do chunk aligned read because 5584 * later we might have to read it again in order to reconstruct 5585 * data on failed drives. 5586 */ 5587 if (rw == READ && mddev->degraded == 0 && 5588 mddev->reshape_position == MaxSector) { 5589 bi = chunk_aligned_read(mddev, bi); 5590 if (!bi) 5591 return true; 5592 } 5593 5594 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5595 make_discard_request(mddev, bi); 5596 md_write_end(mddev); 5597 return true; 5598 } 5599 5600 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5601 last_sector = bio_end_sector(bi); 5602 bi->bi_next = NULL; 5603 5604 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5605 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5606 int previous; 5607 int seq; 5608 5609 do_prepare = false; 5610 retry: 5611 seq = read_seqcount_begin(&conf->gen_lock); 5612 previous = 0; 5613 if (do_prepare) 5614 prepare_to_wait(&conf->wait_for_overlap, &w, 5615 TASK_UNINTERRUPTIBLE); 5616 if (unlikely(conf->reshape_progress != MaxSector)) { 5617 /* spinlock is needed as reshape_progress may be 5618 * 64bit on a 32bit platform, and so it might be 5619 * possible to see a half-updated value 5620 * Of course reshape_progress could change after 5621 * the lock is dropped, so once we get a reference 5622 * to the stripe that we think it is, we will have 5623 * to check again. 5624 */ 5625 spin_lock_irq(&conf->device_lock); 5626 if (mddev->reshape_backwards 5627 ? logical_sector < conf->reshape_progress 5628 : logical_sector >= conf->reshape_progress) { 5629 previous = 1; 5630 } else { 5631 if (mddev->reshape_backwards 5632 ? logical_sector < conf->reshape_safe 5633 : logical_sector >= conf->reshape_safe) { 5634 spin_unlock_irq(&conf->device_lock); 5635 schedule(); 5636 do_prepare = true; 5637 goto retry; 5638 } 5639 } 5640 spin_unlock_irq(&conf->device_lock); 5641 } 5642 5643 new_sector = raid5_compute_sector(conf, logical_sector, 5644 previous, 5645 &dd_idx, NULL); 5646 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5647 (unsigned long long)new_sector, 5648 (unsigned long long)logical_sector); 5649 5650 sh = raid5_get_active_stripe(conf, new_sector, previous, 5651 (bi->bi_opf & REQ_RAHEAD), 0); 5652 if (sh) { 5653 if (unlikely(previous)) { 5654 /* expansion might have moved on while waiting for a 5655 * stripe, so we must do the range check again. 5656 * Expansion could still move past after this 5657 * test, but as we are holding a reference to 5658 * 'sh', we know that if that happens, 5659 * STRIPE_EXPANDING will get set and the expansion 5660 * won't proceed until we finish with the stripe. 5661 */ 5662 int must_retry = 0; 5663 spin_lock_irq(&conf->device_lock); 5664 if (mddev->reshape_backwards 5665 ? logical_sector >= conf->reshape_progress 5666 : logical_sector < conf->reshape_progress) 5667 /* mismatch, need to try again */ 5668 must_retry = 1; 5669 spin_unlock_irq(&conf->device_lock); 5670 if (must_retry) { 5671 raid5_release_stripe(sh); 5672 schedule(); 5673 do_prepare = true; 5674 goto retry; 5675 } 5676 } 5677 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5678 /* Might have got the wrong stripe_head 5679 * by accident 5680 */ 5681 raid5_release_stripe(sh); 5682 goto retry; 5683 } 5684 5685 if (rw == WRITE && 5686 logical_sector >= mddev->suspend_lo && 5687 logical_sector < mddev->suspend_hi) { 5688 raid5_release_stripe(sh); 5689 /* As the suspend_* range is controlled by 5690 * userspace, we want an interruptible 5691 * wait. 5692 */ 5693 prepare_to_wait(&conf->wait_for_overlap, 5694 &w, TASK_INTERRUPTIBLE); 5695 if (logical_sector >= mddev->suspend_lo && 5696 logical_sector < mddev->suspend_hi) { 5697 sigset_t full, old; 5698 sigfillset(&full); 5699 sigprocmask(SIG_BLOCK, &full, &old); 5700 schedule(); 5701 sigprocmask(SIG_SETMASK, &old, NULL); 5702 do_prepare = true; 5703 } 5704 goto retry; 5705 } 5706 5707 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5708 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5709 /* Stripe is busy expanding or 5710 * add failed due to overlap. Flush everything 5711 * and wait a while 5712 */ 5713 md_wakeup_thread(mddev->thread); 5714 raid5_release_stripe(sh); 5715 schedule(); 5716 do_prepare = true; 5717 goto retry; 5718 } 5719 if (do_flush) { 5720 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5721 /* we only need flush for one stripe */ 5722 do_flush = false; 5723 } 5724 5725 set_bit(STRIPE_HANDLE, &sh->state); 5726 clear_bit(STRIPE_DELAYED, &sh->state); 5727 if ((!sh->batch_head || sh == sh->batch_head) && 5728 (bi->bi_opf & REQ_SYNC) && 5729 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5730 atomic_inc(&conf->preread_active_stripes); 5731 release_stripe_plug(mddev, sh); 5732 } else { 5733 /* cannot get stripe for read-ahead, just give-up */ 5734 bi->bi_status = BLK_STS_IOERR; 5735 break; 5736 } 5737 } 5738 finish_wait(&conf->wait_for_overlap, &w); 5739 5740 if (rw == WRITE) 5741 md_write_end(mddev); 5742 bio_endio(bi); 5743 return true; 5744 } 5745 5746 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5747 5748 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5749 { 5750 /* reshaping is quite different to recovery/resync so it is 5751 * handled quite separately ... here. 5752 * 5753 * On each call to sync_request, we gather one chunk worth of 5754 * destination stripes and flag them as expanding. 5755 * Then we find all the source stripes and request reads. 5756 * As the reads complete, handle_stripe will copy the data 5757 * into the destination stripe and release that stripe. 5758 */ 5759 struct r5conf *conf = mddev->private; 5760 struct stripe_head *sh; 5761 sector_t first_sector, last_sector; 5762 int raid_disks = conf->previous_raid_disks; 5763 int data_disks = raid_disks - conf->max_degraded; 5764 int new_data_disks = conf->raid_disks - conf->max_degraded; 5765 int i; 5766 int dd_idx; 5767 sector_t writepos, readpos, safepos; 5768 sector_t stripe_addr; 5769 int reshape_sectors; 5770 struct list_head stripes; 5771 sector_t retn; 5772 5773 if (sector_nr == 0) { 5774 /* If restarting in the middle, skip the initial sectors */ 5775 if (mddev->reshape_backwards && 5776 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5777 sector_nr = raid5_size(mddev, 0, 0) 5778 - conf->reshape_progress; 5779 } else if (mddev->reshape_backwards && 5780 conf->reshape_progress == MaxSector) { 5781 /* shouldn't happen, but just in case, finish up.*/ 5782 sector_nr = MaxSector; 5783 } else if (!mddev->reshape_backwards && 5784 conf->reshape_progress > 0) 5785 sector_nr = conf->reshape_progress; 5786 sector_div(sector_nr, new_data_disks); 5787 if (sector_nr) { 5788 mddev->curr_resync_completed = sector_nr; 5789 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5790 *skipped = 1; 5791 retn = sector_nr; 5792 goto finish; 5793 } 5794 } 5795 5796 /* We need to process a full chunk at a time. 5797 * If old and new chunk sizes differ, we need to process the 5798 * largest of these 5799 */ 5800 5801 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5802 5803 /* We update the metadata at least every 10 seconds, or when 5804 * the data about to be copied would over-write the source of 5805 * the data at the front of the range. i.e. one new_stripe 5806 * along from reshape_progress new_maps to after where 5807 * reshape_safe old_maps to 5808 */ 5809 writepos = conf->reshape_progress; 5810 sector_div(writepos, new_data_disks); 5811 readpos = conf->reshape_progress; 5812 sector_div(readpos, data_disks); 5813 safepos = conf->reshape_safe; 5814 sector_div(safepos, data_disks); 5815 if (mddev->reshape_backwards) { 5816 BUG_ON(writepos < reshape_sectors); 5817 writepos -= reshape_sectors; 5818 readpos += reshape_sectors; 5819 safepos += reshape_sectors; 5820 } else { 5821 writepos += reshape_sectors; 5822 /* readpos and safepos are worst-case calculations. 5823 * A negative number is overly pessimistic, and causes 5824 * obvious problems for unsigned storage. So clip to 0. 5825 */ 5826 readpos -= min_t(sector_t, reshape_sectors, readpos); 5827 safepos -= min_t(sector_t, reshape_sectors, safepos); 5828 } 5829 5830 /* Having calculated the 'writepos' possibly use it 5831 * to set 'stripe_addr' which is where we will write to. 5832 */ 5833 if (mddev->reshape_backwards) { 5834 BUG_ON(conf->reshape_progress == 0); 5835 stripe_addr = writepos; 5836 BUG_ON((mddev->dev_sectors & 5837 ~((sector_t)reshape_sectors - 1)) 5838 - reshape_sectors - stripe_addr 5839 != sector_nr); 5840 } else { 5841 BUG_ON(writepos != sector_nr + reshape_sectors); 5842 stripe_addr = sector_nr; 5843 } 5844 5845 /* 'writepos' is the most advanced device address we might write. 5846 * 'readpos' is the least advanced device address we might read. 5847 * 'safepos' is the least address recorded in the metadata as having 5848 * been reshaped. 5849 * If there is a min_offset_diff, these are adjusted either by 5850 * increasing the safepos/readpos if diff is negative, or 5851 * increasing writepos if diff is positive. 5852 * If 'readpos' is then behind 'writepos', there is no way that we can 5853 * ensure safety in the face of a crash - that must be done by userspace 5854 * making a backup of the data. So in that case there is no particular 5855 * rush to update metadata. 5856 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5857 * update the metadata to advance 'safepos' to match 'readpos' so that 5858 * we can be safe in the event of a crash. 5859 * So we insist on updating metadata if safepos is behind writepos and 5860 * readpos is beyond writepos. 5861 * In any case, update the metadata every 10 seconds. 5862 * Maybe that number should be configurable, but I'm not sure it is 5863 * worth it.... maybe it could be a multiple of safemode_delay??? 5864 */ 5865 if (conf->min_offset_diff < 0) { 5866 safepos += -conf->min_offset_diff; 5867 readpos += -conf->min_offset_diff; 5868 } else 5869 writepos += conf->min_offset_diff; 5870 5871 if ((mddev->reshape_backwards 5872 ? (safepos > writepos && readpos < writepos) 5873 : (safepos < writepos && readpos > writepos)) || 5874 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5875 /* Cannot proceed until we've updated the superblock... */ 5876 wait_event(conf->wait_for_overlap, 5877 atomic_read(&conf->reshape_stripes)==0 5878 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5879 if (atomic_read(&conf->reshape_stripes) != 0) 5880 return 0; 5881 mddev->reshape_position = conf->reshape_progress; 5882 mddev->curr_resync_completed = sector_nr; 5883 conf->reshape_checkpoint = jiffies; 5884 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5885 md_wakeup_thread(mddev->thread); 5886 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5887 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5888 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5889 return 0; 5890 spin_lock_irq(&conf->device_lock); 5891 conf->reshape_safe = mddev->reshape_position; 5892 spin_unlock_irq(&conf->device_lock); 5893 wake_up(&conf->wait_for_overlap); 5894 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5895 } 5896 5897 INIT_LIST_HEAD(&stripes); 5898 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5899 int j; 5900 int skipped_disk = 0; 5901 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5902 set_bit(STRIPE_EXPANDING, &sh->state); 5903 atomic_inc(&conf->reshape_stripes); 5904 /* If any of this stripe is beyond the end of the old 5905 * array, then we need to zero those blocks 5906 */ 5907 for (j=sh->disks; j--;) { 5908 sector_t s; 5909 if (j == sh->pd_idx) 5910 continue; 5911 if (conf->level == 6 && 5912 j == sh->qd_idx) 5913 continue; 5914 s = raid5_compute_blocknr(sh, j, 0); 5915 if (s < raid5_size(mddev, 0, 0)) { 5916 skipped_disk = 1; 5917 continue; 5918 } 5919 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5920 set_bit(R5_Expanded, &sh->dev[j].flags); 5921 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5922 } 5923 if (!skipped_disk) { 5924 set_bit(STRIPE_EXPAND_READY, &sh->state); 5925 set_bit(STRIPE_HANDLE, &sh->state); 5926 } 5927 list_add(&sh->lru, &stripes); 5928 } 5929 spin_lock_irq(&conf->device_lock); 5930 if (mddev->reshape_backwards) 5931 conf->reshape_progress -= reshape_sectors * new_data_disks; 5932 else 5933 conf->reshape_progress += reshape_sectors * new_data_disks; 5934 spin_unlock_irq(&conf->device_lock); 5935 /* Ok, those stripe are ready. We can start scheduling 5936 * reads on the source stripes. 5937 * The source stripes are determined by mapping the first and last 5938 * block on the destination stripes. 5939 */ 5940 first_sector = 5941 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5942 1, &dd_idx, NULL); 5943 last_sector = 5944 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5945 * new_data_disks - 1), 5946 1, &dd_idx, NULL); 5947 if (last_sector >= mddev->dev_sectors) 5948 last_sector = mddev->dev_sectors - 1; 5949 while (first_sector <= last_sector) { 5950 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5951 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5952 set_bit(STRIPE_HANDLE, &sh->state); 5953 raid5_release_stripe(sh); 5954 first_sector += STRIPE_SECTORS; 5955 } 5956 /* Now that the sources are clearly marked, we can release 5957 * the destination stripes 5958 */ 5959 while (!list_empty(&stripes)) { 5960 sh = list_entry(stripes.next, struct stripe_head, lru); 5961 list_del_init(&sh->lru); 5962 raid5_release_stripe(sh); 5963 } 5964 /* If this takes us to the resync_max point where we have to pause, 5965 * then we need to write out the superblock. 5966 */ 5967 sector_nr += reshape_sectors; 5968 retn = reshape_sectors; 5969 finish: 5970 if (mddev->curr_resync_completed > mddev->resync_max || 5971 (sector_nr - mddev->curr_resync_completed) * 2 5972 >= mddev->resync_max - mddev->curr_resync_completed) { 5973 /* Cannot proceed until we've updated the superblock... */ 5974 wait_event(conf->wait_for_overlap, 5975 atomic_read(&conf->reshape_stripes) == 0 5976 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5977 if (atomic_read(&conf->reshape_stripes) != 0) 5978 goto ret; 5979 mddev->reshape_position = conf->reshape_progress; 5980 mddev->curr_resync_completed = sector_nr; 5981 conf->reshape_checkpoint = jiffies; 5982 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5983 md_wakeup_thread(mddev->thread); 5984 wait_event(mddev->sb_wait, 5985 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 5986 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5987 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5988 goto ret; 5989 spin_lock_irq(&conf->device_lock); 5990 conf->reshape_safe = mddev->reshape_position; 5991 spin_unlock_irq(&conf->device_lock); 5992 wake_up(&conf->wait_for_overlap); 5993 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5994 } 5995 ret: 5996 return retn; 5997 } 5998 5999 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 6000 int *skipped) 6001 { 6002 struct r5conf *conf = mddev->private; 6003 struct stripe_head *sh; 6004 sector_t max_sector = mddev->dev_sectors; 6005 sector_t sync_blocks; 6006 int still_degraded = 0; 6007 int i; 6008 6009 if (sector_nr >= max_sector) { 6010 /* just being told to finish up .. nothing much to do */ 6011 6012 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 6013 end_reshape(conf); 6014 return 0; 6015 } 6016 6017 if (mddev->curr_resync < max_sector) /* aborted */ 6018 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 6019 &sync_blocks, 1); 6020 else /* completed sync */ 6021 conf->fullsync = 0; 6022 bitmap_close_sync(mddev->bitmap); 6023 6024 return 0; 6025 } 6026 6027 /* Allow raid5_quiesce to complete */ 6028 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 6029 6030 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6031 return reshape_request(mddev, sector_nr, skipped); 6032 6033 /* No need to check resync_max as we never do more than one 6034 * stripe, and as resync_max will always be on a chunk boundary, 6035 * if the check in md_do_sync didn't fire, there is no chance 6036 * of overstepping resync_max here 6037 */ 6038 6039 /* if there is too many failed drives and we are trying 6040 * to resync, then assert that we are finished, because there is 6041 * nothing we can do. 6042 */ 6043 if (mddev->degraded >= conf->max_degraded && 6044 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6045 sector_t rv = mddev->dev_sectors - sector_nr; 6046 *skipped = 1; 6047 return rv; 6048 } 6049 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6050 !conf->fullsync && 6051 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6052 sync_blocks >= STRIPE_SECTORS) { 6053 /* we can skip this block, and probably more */ 6054 sync_blocks /= STRIPE_SECTORS; 6055 *skipped = 1; 6056 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 6057 } 6058 6059 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6060 6061 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 6062 if (sh == NULL) { 6063 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 6064 /* make sure we don't swamp the stripe cache if someone else 6065 * is trying to get access 6066 */ 6067 schedule_timeout_uninterruptible(1); 6068 } 6069 /* Need to check if array will still be degraded after recovery/resync 6070 * Note in case of > 1 drive failures it's possible we're rebuilding 6071 * one drive while leaving another faulty drive in array. 6072 */ 6073 rcu_read_lock(); 6074 for (i = 0; i < conf->raid_disks; i++) { 6075 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); 6076 6077 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6078 still_degraded = 1; 6079 } 6080 rcu_read_unlock(); 6081 6082 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6083 6084 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6085 set_bit(STRIPE_HANDLE, &sh->state); 6086 6087 raid5_release_stripe(sh); 6088 6089 return STRIPE_SECTORS; 6090 } 6091 6092 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, 6093 unsigned int offset) 6094 { 6095 /* We may not be able to submit a whole bio at once as there 6096 * may not be enough stripe_heads available. 6097 * We cannot pre-allocate enough stripe_heads as we may need 6098 * more than exist in the cache (if we allow ever large chunks). 6099 * So we do one stripe head at a time and record in 6100 * ->bi_hw_segments how many have been done. 6101 * 6102 * We *know* that this entire raid_bio is in one chunk, so 6103 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 6104 */ 6105 struct stripe_head *sh; 6106 int dd_idx; 6107 sector_t sector, logical_sector, last_sector; 6108 int scnt = 0; 6109 int handled = 0; 6110 6111 logical_sector = raid_bio->bi_iter.bi_sector & 6112 ~((sector_t)STRIPE_SECTORS-1); 6113 sector = raid5_compute_sector(conf, logical_sector, 6114 0, &dd_idx, NULL); 6115 last_sector = bio_end_sector(raid_bio); 6116 6117 for (; logical_sector < last_sector; 6118 logical_sector += STRIPE_SECTORS, 6119 sector += STRIPE_SECTORS, 6120 scnt++) { 6121 6122 if (scnt < offset) 6123 /* already done this stripe */ 6124 continue; 6125 6126 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6127 6128 if (!sh) { 6129 /* failed to get a stripe - must wait */ 6130 conf->retry_read_aligned = raid_bio; 6131 conf->retry_read_offset = scnt; 6132 return handled; 6133 } 6134 6135 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6136 raid5_release_stripe(sh); 6137 conf->retry_read_aligned = raid_bio; 6138 conf->retry_read_offset = scnt; 6139 return handled; 6140 } 6141 6142 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6143 handle_stripe(sh); 6144 raid5_release_stripe(sh); 6145 handled++; 6146 } 6147 6148 bio_endio(raid_bio); 6149 6150 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6151 wake_up(&conf->wait_for_quiescent); 6152 return handled; 6153 } 6154 6155 static int handle_active_stripes(struct r5conf *conf, int group, 6156 struct r5worker *worker, 6157 struct list_head *temp_inactive_list) 6158 { 6159 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6160 int i, batch_size = 0, hash; 6161 bool release_inactive = false; 6162 6163 while (batch_size < MAX_STRIPE_BATCH && 6164 (sh = __get_priority_stripe(conf, group)) != NULL) 6165 batch[batch_size++] = sh; 6166 6167 if (batch_size == 0) { 6168 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6169 if (!list_empty(temp_inactive_list + i)) 6170 break; 6171 if (i == NR_STRIPE_HASH_LOCKS) { 6172 spin_unlock_irq(&conf->device_lock); 6173 r5l_flush_stripe_to_raid(conf->log); 6174 spin_lock_irq(&conf->device_lock); 6175 return batch_size; 6176 } 6177 release_inactive = true; 6178 } 6179 spin_unlock_irq(&conf->device_lock); 6180 6181 release_inactive_stripe_list(conf, temp_inactive_list, 6182 NR_STRIPE_HASH_LOCKS); 6183 6184 r5l_flush_stripe_to_raid(conf->log); 6185 if (release_inactive) { 6186 spin_lock_irq(&conf->device_lock); 6187 return 0; 6188 } 6189 6190 for (i = 0; i < batch_size; i++) 6191 handle_stripe(batch[i]); 6192 log_write_stripe_run(conf); 6193 6194 cond_resched(); 6195 6196 spin_lock_irq(&conf->device_lock); 6197 for (i = 0; i < batch_size; i++) { 6198 hash = batch[i]->hash_lock_index; 6199 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6200 } 6201 return batch_size; 6202 } 6203 6204 static void raid5_do_work(struct work_struct *work) 6205 { 6206 struct r5worker *worker = container_of(work, struct r5worker, work); 6207 struct r5worker_group *group = worker->group; 6208 struct r5conf *conf = group->conf; 6209 struct mddev *mddev = conf->mddev; 6210 int group_id = group - conf->worker_groups; 6211 int handled; 6212 struct blk_plug plug; 6213 6214 pr_debug("+++ raid5worker active\n"); 6215 6216 blk_start_plug(&plug); 6217 handled = 0; 6218 spin_lock_irq(&conf->device_lock); 6219 while (1) { 6220 int batch_size, released; 6221 6222 released = release_stripe_list(conf, worker->temp_inactive_list); 6223 6224 batch_size = handle_active_stripes(conf, group_id, worker, 6225 worker->temp_inactive_list); 6226 worker->working = false; 6227 if (!batch_size && !released) 6228 break; 6229 handled += batch_size; 6230 wait_event_lock_irq(mddev->sb_wait, 6231 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6232 conf->device_lock); 6233 } 6234 pr_debug("%d stripes handled\n", handled); 6235 6236 spin_unlock_irq(&conf->device_lock); 6237 6238 flush_deferred_bios(conf); 6239 6240 r5l_flush_stripe_to_raid(conf->log); 6241 6242 async_tx_issue_pending_all(); 6243 blk_finish_plug(&plug); 6244 6245 pr_debug("--- raid5worker inactive\n"); 6246 } 6247 6248 /* 6249 * This is our raid5 kernel thread. 6250 * 6251 * We scan the hash table for stripes which can be handled now. 6252 * During the scan, completed stripes are saved for us by the interrupt 6253 * handler, so that they will not have to wait for our next wakeup. 6254 */ 6255 static void raid5d(struct md_thread *thread) 6256 { 6257 struct mddev *mddev = thread->mddev; 6258 struct r5conf *conf = mddev->private; 6259 int handled; 6260 struct blk_plug plug; 6261 6262 pr_debug("+++ raid5d active\n"); 6263 6264 md_check_recovery(mddev); 6265 6266 blk_start_plug(&plug); 6267 handled = 0; 6268 spin_lock_irq(&conf->device_lock); 6269 while (1) { 6270 struct bio *bio; 6271 int batch_size, released; 6272 unsigned int offset; 6273 6274 released = release_stripe_list(conf, conf->temp_inactive_list); 6275 if (released) 6276 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6277 6278 if ( 6279 !list_empty(&conf->bitmap_list)) { 6280 /* Now is a good time to flush some bitmap updates */ 6281 conf->seq_flush++; 6282 spin_unlock_irq(&conf->device_lock); 6283 bitmap_unplug(mddev->bitmap); 6284 spin_lock_irq(&conf->device_lock); 6285 conf->seq_write = conf->seq_flush; 6286 activate_bit_delay(conf, conf->temp_inactive_list); 6287 } 6288 raid5_activate_delayed(conf); 6289 6290 while ((bio = remove_bio_from_retry(conf, &offset))) { 6291 int ok; 6292 spin_unlock_irq(&conf->device_lock); 6293 ok = retry_aligned_read(conf, bio, offset); 6294 spin_lock_irq(&conf->device_lock); 6295 if (!ok) 6296 break; 6297 handled++; 6298 } 6299 6300 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6301 conf->temp_inactive_list); 6302 if (!batch_size && !released) 6303 break; 6304 handled += batch_size; 6305 6306 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6307 spin_unlock_irq(&conf->device_lock); 6308 md_check_recovery(mddev); 6309 spin_lock_irq(&conf->device_lock); 6310 } 6311 } 6312 pr_debug("%d stripes handled\n", handled); 6313 6314 spin_unlock_irq(&conf->device_lock); 6315 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6316 mutex_trylock(&conf->cache_size_mutex)) { 6317 grow_one_stripe(conf, __GFP_NOWARN); 6318 /* Set flag even if allocation failed. This helps 6319 * slow down allocation requests when mem is short 6320 */ 6321 set_bit(R5_DID_ALLOC, &conf->cache_state); 6322 mutex_unlock(&conf->cache_size_mutex); 6323 } 6324 6325 flush_deferred_bios(conf); 6326 6327 r5l_flush_stripe_to_raid(conf->log); 6328 6329 async_tx_issue_pending_all(); 6330 blk_finish_plug(&plug); 6331 6332 pr_debug("--- raid5d inactive\n"); 6333 } 6334 6335 static ssize_t 6336 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6337 { 6338 struct r5conf *conf; 6339 int ret = 0; 6340 spin_lock(&mddev->lock); 6341 conf = mddev->private; 6342 if (conf) 6343 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6344 spin_unlock(&mddev->lock); 6345 return ret; 6346 } 6347 6348 int 6349 raid5_set_cache_size(struct mddev *mddev, int size) 6350 { 6351 struct r5conf *conf = mddev->private; 6352 6353 if (size <= 16 || size > 32768) 6354 return -EINVAL; 6355 6356 conf->min_nr_stripes = size; 6357 mutex_lock(&conf->cache_size_mutex); 6358 while (size < conf->max_nr_stripes && 6359 drop_one_stripe(conf)) 6360 ; 6361 mutex_unlock(&conf->cache_size_mutex); 6362 6363 md_allow_write(mddev); 6364 6365 mutex_lock(&conf->cache_size_mutex); 6366 while (size > conf->max_nr_stripes) 6367 if (!grow_one_stripe(conf, GFP_KERNEL)) 6368 break; 6369 mutex_unlock(&conf->cache_size_mutex); 6370 6371 return 0; 6372 } 6373 EXPORT_SYMBOL(raid5_set_cache_size); 6374 6375 static ssize_t 6376 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6377 { 6378 struct r5conf *conf; 6379 unsigned long new; 6380 int err; 6381 6382 if (len >= PAGE_SIZE) 6383 return -EINVAL; 6384 if (kstrtoul(page, 10, &new)) 6385 return -EINVAL; 6386 err = mddev_lock(mddev); 6387 if (err) 6388 return err; 6389 conf = mddev->private; 6390 if (!conf) 6391 err = -ENODEV; 6392 else 6393 err = raid5_set_cache_size(mddev, new); 6394 mddev_unlock(mddev); 6395 6396 return err ?: len; 6397 } 6398 6399 static struct md_sysfs_entry 6400 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6401 raid5_show_stripe_cache_size, 6402 raid5_store_stripe_cache_size); 6403 6404 static ssize_t 6405 raid5_show_rmw_level(struct mddev *mddev, char *page) 6406 { 6407 struct r5conf *conf = mddev->private; 6408 if (conf) 6409 return sprintf(page, "%d\n", conf->rmw_level); 6410 else 6411 return 0; 6412 } 6413 6414 static ssize_t 6415 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6416 { 6417 struct r5conf *conf = mddev->private; 6418 unsigned long new; 6419 6420 if (!conf) 6421 return -ENODEV; 6422 6423 if (len >= PAGE_SIZE) 6424 return -EINVAL; 6425 6426 if (kstrtoul(page, 10, &new)) 6427 return -EINVAL; 6428 6429 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6430 return -EINVAL; 6431 6432 if (new != PARITY_DISABLE_RMW && 6433 new != PARITY_ENABLE_RMW && 6434 new != PARITY_PREFER_RMW) 6435 return -EINVAL; 6436 6437 conf->rmw_level = new; 6438 return len; 6439 } 6440 6441 static struct md_sysfs_entry 6442 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6443 raid5_show_rmw_level, 6444 raid5_store_rmw_level); 6445 6446 6447 static ssize_t 6448 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6449 { 6450 struct r5conf *conf; 6451 int ret = 0; 6452 spin_lock(&mddev->lock); 6453 conf = mddev->private; 6454 if (conf) 6455 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6456 spin_unlock(&mddev->lock); 6457 return ret; 6458 } 6459 6460 static ssize_t 6461 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6462 { 6463 struct r5conf *conf; 6464 unsigned long new; 6465 int err; 6466 6467 if (len >= PAGE_SIZE) 6468 return -EINVAL; 6469 if (kstrtoul(page, 10, &new)) 6470 return -EINVAL; 6471 6472 err = mddev_lock(mddev); 6473 if (err) 6474 return err; 6475 conf = mddev->private; 6476 if (!conf) 6477 err = -ENODEV; 6478 else if (new > conf->min_nr_stripes) 6479 err = -EINVAL; 6480 else 6481 conf->bypass_threshold = new; 6482 mddev_unlock(mddev); 6483 return err ?: len; 6484 } 6485 6486 static struct md_sysfs_entry 6487 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6488 S_IRUGO | S_IWUSR, 6489 raid5_show_preread_threshold, 6490 raid5_store_preread_threshold); 6491 6492 static ssize_t 6493 raid5_show_skip_copy(struct mddev *mddev, char *page) 6494 { 6495 struct r5conf *conf; 6496 int ret = 0; 6497 spin_lock(&mddev->lock); 6498 conf = mddev->private; 6499 if (conf) 6500 ret = sprintf(page, "%d\n", conf->skip_copy); 6501 spin_unlock(&mddev->lock); 6502 return ret; 6503 } 6504 6505 static ssize_t 6506 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6507 { 6508 struct r5conf *conf; 6509 unsigned long new; 6510 int err; 6511 6512 if (len >= PAGE_SIZE) 6513 return -EINVAL; 6514 if (kstrtoul(page, 10, &new)) 6515 return -EINVAL; 6516 new = !!new; 6517 6518 err = mddev_lock(mddev); 6519 if (err) 6520 return err; 6521 conf = mddev->private; 6522 if (!conf) 6523 err = -ENODEV; 6524 else if (new != conf->skip_copy) { 6525 mddev_suspend(mddev); 6526 conf->skip_copy = new; 6527 if (new) 6528 mddev->queue->backing_dev_info->capabilities |= 6529 BDI_CAP_STABLE_WRITES; 6530 else 6531 mddev->queue->backing_dev_info->capabilities &= 6532 ~BDI_CAP_STABLE_WRITES; 6533 mddev_resume(mddev); 6534 } 6535 mddev_unlock(mddev); 6536 return err ?: len; 6537 } 6538 6539 static struct md_sysfs_entry 6540 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6541 raid5_show_skip_copy, 6542 raid5_store_skip_copy); 6543 6544 static ssize_t 6545 stripe_cache_active_show(struct mddev *mddev, char *page) 6546 { 6547 struct r5conf *conf = mddev->private; 6548 if (conf) 6549 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6550 else 6551 return 0; 6552 } 6553 6554 static struct md_sysfs_entry 6555 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6556 6557 static ssize_t 6558 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6559 { 6560 struct r5conf *conf; 6561 int ret = 0; 6562 spin_lock(&mddev->lock); 6563 conf = mddev->private; 6564 if (conf) 6565 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6566 spin_unlock(&mddev->lock); 6567 return ret; 6568 } 6569 6570 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6571 int *group_cnt, 6572 int *worker_cnt_per_group, 6573 struct r5worker_group **worker_groups); 6574 static ssize_t 6575 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6576 { 6577 struct r5conf *conf; 6578 unsigned int new; 6579 int err; 6580 struct r5worker_group *new_groups, *old_groups; 6581 int group_cnt, worker_cnt_per_group; 6582 6583 if (len >= PAGE_SIZE) 6584 return -EINVAL; 6585 if (kstrtouint(page, 10, &new)) 6586 return -EINVAL; 6587 /* 8192 should be big enough */ 6588 if (new > 8192) 6589 return -EINVAL; 6590 6591 err = mddev_lock(mddev); 6592 if (err) 6593 return err; 6594 conf = mddev->private; 6595 if (!conf) 6596 err = -ENODEV; 6597 else if (new != conf->worker_cnt_per_group) { 6598 mddev_suspend(mddev); 6599 6600 old_groups = conf->worker_groups; 6601 if (old_groups) 6602 flush_workqueue(raid5_wq); 6603 6604 err = alloc_thread_groups(conf, new, 6605 &group_cnt, &worker_cnt_per_group, 6606 &new_groups); 6607 if (!err) { 6608 spin_lock_irq(&conf->device_lock); 6609 conf->group_cnt = group_cnt; 6610 conf->worker_cnt_per_group = worker_cnt_per_group; 6611 conf->worker_groups = new_groups; 6612 spin_unlock_irq(&conf->device_lock); 6613 6614 if (old_groups) 6615 kfree(old_groups[0].workers); 6616 kfree(old_groups); 6617 } 6618 mddev_resume(mddev); 6619 } 6620 mddev_unlock(mddev); 6621 6622 return err ?: len; 6623 } 6624 6625 static struct md_sysfs_entry 6626 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6627 raid5_show_group_thread_cnt, 6628 raid5_store_group_thread_cnt); 6629 6630 static struct attribute *raid5_attrs[] = { 6631 &raid5_stripecache_size.attr, 6632 &raid5_stripecache_active.attr, 6633 &raid5_preread_bypass_threshold.attr, 6634 &raid5_group_thread_cnt.attr, 6635 &raid5_skip_copy.attr, 6636 &raid5_rmw_level.attr, 6637 &r5c_journal_mode.attr, 6638 NULL, 6639 }; 6640 static struct attribute_group raid5_attrs_group = { 6641 .name = NULL, 6642 .attrs = raid5_attrs, 6643 }; 6644 6645 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6646 int *group_cnt, 6647 int *worker_cnt_per_group, 6648 struct r5worker_group **worker_groups) 6649 { 6650 int i, j, k; 6651 ssize_t size; 6652 struct r5worker *workers; 6653 6654 *worker_cnt_per_group = cnt; 6655 if (cnt == 0) { 6656 *group_cnt = 0; 6657 *worker_groups = NULL; 6658 return 0; 6659 } 6660 *group_cnt = num_possible_nodes(); 6661 size = sizeof(struct r5worker) * cnt; 6662 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6663 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6664 *group_cnt, GFP_NOIO); 6665 if (!*worker_groups || !workers) { 6666 kfree(workers); 6667 kfree(*worker_groups); 6668 return -ENOMEM; 6669 } 6670 6671 for (i = 0; i < *group_cnt; i++) { 6672 struct r5worker_group *group; 6673 6674 group = &(*worker_groups)[i]; 6675 INIT_LIST_HEAD(&group->handle_list); 6676 INIT_LIST_HEAD(&group->loprio_list); 6677 group->conf = conf; 6678 group->workers = workers + i * cnt; 6679 6680 for (j = 0; j < cnt; j++) { 6681 struct r5worker *worker = group->workers + j; 6682 worker->group = group; 6683 INIT_WORK(&worker->work, raid5_do_work); 6684 6685 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6686 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6687 } 6688 } 6689 6690 return 0; 6691 } 6692 6693 static void free_thread_groups(struct r5conf *conf) 6694 { 6695 if (conf->worker_groups) 6696 kfree(conf->worker_groups[0].workers); 6697 kfree(conf->worker_groups); 6698 conf->worker_groups = NULL; 6699 } 6700 6701 static sector_t 6702 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6703 { 6704 struct r5conf *conf = mddev->private; 6705 6706 if (!sectors) 6707 sectors = mddev->dev_sectors; 6708 if (!raid_disks) 6709 /* size is defined by the smallest of previous and new size */ 6710 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6711 6712 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6713 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6714 return sectors * (raid_disks - conf->max_degraded); 6715 } 6716 6717 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6718 { 6719 safe_put_page(percpu->spare_page); 6720 if (percpu->scribble) 6721 flex_array_free(percpu->scribble); 6722 percpu->spare_page = NULL; 6723 percpu->scribble = NULL; 6724 } 6725 6726 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6727 { 6728 if (conf->level == 6 && !percpu->spare_page) 6729 percpu->spare_page = alloc_page(GFP_KERNEL); 6730 if (!percpu->scribble) 6731 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6732 conf->previous_raid_disks), 6733 max(conf->chunk_sectors, 6734 conf->prev_chunk_sectors) 6735 / STRIPE_SECTORS, 6736 GFP_KERNEL); 6737 6738 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6739 free_scratch_buffer(conf, percpu); 6740 return -ENOMEM; 6741 } 6742 6743 return 0; 6744 } 6745 6746 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6747 { 6748 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6749 6750 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6751 return 0; 6752 } 6753 6754 static void raid5_free_percpu(struct r5conf *conf) 6755 { 6756 if (!conf->percpu) 6757 return; 6758 6759 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6760 free_percpu(conf->percpu); 6761 } 6762 6763 static void free_conf(struct r5conf *conf) 6764 { 6765 int i; 6766 6767 log_exit(conf); 6768 6769 if (conf->shrinker.nr_deferred) 6770 unregister_shrinker(&conf->shrinker); 6771 6772 free_thread_groups(conf); 6773 shrink_stripes(conf); 6774 raid5_free_percpu(conf); 6775 for (i = 0; i < conf->pool_size; i++) 6776 if (conf->disks[i].extra_page) 6777 put_page(conf->disks[i].extra_page); 6778 kfree(conf->disks); 6779 if (conf->bio_split) 6780 bioset_free(conf->bio_split); 6781 kfree(conf->stripe_hashtbl); 6782 kfree(conf->pending_data); 6783 kfree(conf); 6784 } 6785 6786 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6787 { 6788 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6789 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6790 6791 if (alloc_scratch_buffer(conf, percpu)) { 6792 pr_warn("%s: failed memory allocation for cpu%u\n", 6793 __func__, cpu); 6794 return -ENOMEM; 6795 } 6796 return 0; 6797 } 6798 6799 static int raid5_alloc_percpu(struct r5conf *conf) 6800 { 6801 int err = 0; 6802 6803 conf->percpu = alloc_percpu(struct raid5_percpu); 6804 if (!conf->percpu) 6805 return -ENOMEM; 6806 6807 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6808 if (!err) { 6809 conf->scribble_disks = max(conf->raid_disks, 6810 conf->previous_raid_disks); 6811 conf->scribble_sectors = max(conf->chunk_sectors, 6812 conf->prev_chunk_sectors); 6813 } 6814 return err; 6815 } 6816 6817 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6818 struct shrink_control *sc) 6819 { 6820 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6821 unsigned long ret = SHRINK_STOP; 6822 6823 if (mutex_trylock(&conf->cache_size_mutex)) { 6824 ret= 0; 6825 while (ret < sc->nr_to_scan && 6826 conf->max_nr_stripes > conf->min_nr_stripes) { 6827 if (drop_one_stripe(conf) == 0) { 6828 ret = SHRINK_STOP; 6829 break; 6830 } 6831 ret++; 6832 } 6833 mutex_unlock(&conf->cache_size_mutex); 6834 } 6835 return ret; 6836 } 6837 6838 static unsigned long raid5_cache_count(struct shrinker *shrink, 6839 struct shrink_control *sc) 6840 { 6841 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6842 6843 if (conf->max_nr_stripes < conf->min_nr_stripes) 6844 /* unlikely, but not impossible */ 6845 return 0; 6846 return conf->max_nr_stripes - conf->min_nr_stripes; 6847 } 6848 6849 static struct r5conf *setup_conf(struct mddev *mddev) 6850 { 6851 struct r5conf *conf; 6852 int raid_disk, memory, max_disks; 6853 struct md_rdev *rdev; 6854 struct disk_info *disk; 6855 char pers_name[6]; 6856 int i; 6857 int group_cnt, worker_cnt_per_group; 6858 struct r5worker_group *new_group; 6859 6860 if (mddev->new_level != 5 6861 && mddev->new_level != 4 6862 && mddev->new_level != 6) { 6863 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6864 mdname(mddev), mddev->new_level); 6865 return ERR_PTR(-EIO); 6866 } 6867 if ((mddev->new_level == 5 6868 && !algorithm_valid_raid5(mddev->new_layout)) || 6869 (mddev->new_level == 6 6870 && !algorithm_valid_raid6(mddev->new_layout))) { 6871 pr_warn("md/raid:%s: layout %d not supported\n", 6872 mdname(mddev), mddev->new_layout); 6873 return ERR_PTR(-EIO); 6874 } 6875 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6876 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6877 mdname(mddev), mddev->raid_disks); 6878 return ERR_PTR(-EINVAL); 6879 } 6880 6881 if (!mddev->new_chunk_sectors || 6882 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6883 !is_power_of_2(mddev->new_chunk_sectors)) { 6884 pr_warn("md/raid:%s: invalid chunk size %d\n", 6885 mdname(mddev), mddev->new_chunk_sectors << 9); 6886 return ERR_PTR(-EINVAL); 6887 } 6888 6889 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6890 if (conf == NULL) 6891 goto abort; 6892 INIT_LIST_HEAD(&conf->free_list); 6893 INIT_LIST_HEAD(&conf->pending_list); 6894 conf->pending_data = kzalloc(sizeof(struct r5pending_data) * 6895 PENDING_IO_MAX, GFP_KERNEL); 6896 if (!conf->pending_data) 6897 goto abort; 6898 for (i = 0; i < PENDING_IO_MAX; i++) 6899 list_add(&conf->pending_data[i].sibling, &conf->free_list); 6900 /* Don't enable multi-threading by default*/ 6901 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6902 &new_group)) { 6903 conf->group_cnt = group_cnt; 6904 conf->worker_cnt_per_group = worker_cnt_per_group; 6905 conf->worker_groups = new_group; 6906 } else 6907 goto abort; 6908 spin_lock_init(&conf->device_lock); 6909 seqcount_init(&conf->gen_lock); 6910 mutex_init(&conf->cache_size_mutex); 6911 init_waitqueue_head(&conf->wait_for_quiescent); 6912 init_waitqueue_head(&conf->wait_for_stripe); 6913 init_waitqueue_head(&conf->wait_for_overlap); 6914 INIT_LIST_HEAD(&conf->handle_list); 6915 INIT_LIST_HEAD(&conf->loprio_list); 6916 INIT_LIST_HEAD(&conf->hold_list); 6917 INIT_LIST_HEAD(&conf->delayed_list); 6918 INIT_LIST_HEAD(&conf->bitmap_list); 6919 init_llist_head(&conf->released_stripes); 6920 atomic_set(&conf->active_stripes, 0); 6921 atomic_set(&conf->preread_active_stripes, 0); 6922 atomic_set(&conf->active_aligned_reads, 0); 6923 spin_lock_init(&conf->pending_bios_lock); 6924 conf->batch_bio_dispatch = true; 6925 rdev_for_each(rdev, mddev) { 6926 if (test_bit(Journal, &rdev->flags)) 6927 continue; 6928 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6929 conf->batch_bio_dispatch = false; 6930 break; 6931 } 6932 } 6933 6934 conf->bypass_threshold = BYPASS_THRESHOLD; 6935 conf->recovery_disabled = mddev->recovery_disabled - 1; 6936 6937 conf->raid_disks = mddev->raid_disks; 6938 if (mddev->reshape_position == MaxSector) 6939 conf->previous_raid_disks = mddev->raid_disks; 6940 else 6941 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6942 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6943 6944 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6945 GFP_KERNEL); 6946 6947 if (!conf->disks) 6948 goto abort; 6949 6950 for (i = 0; i < max_disks; i++) { 6951 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6952 if (!conf->disks[i].extra_page) 6953 goto abort; 6954 } 6955 6956 conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); 6957 if (!conf->bio_split) 6958 goto abort; 6959 conf->mddev = mddev; 6960 6961 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6962 goto abort; 6963 6964 /* We init hash_locks[0] separately to that it can be used 6965 * as the reference lock in the spin_lock_nest_lock() call 6966 * in lock_all_device_hash_locks_irq in order to convince 6967 * lockdep that we know what we are doing. 6968 */ 6969 spin_lock_init(conf->hash_locks); 6970 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6971 spin_lock_init(conf->hash_locks + i); 6972 6973 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6974 INIT_LIST_HEAD(conf->inactive_list + i); 6975 6976 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6977 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6978 6979 atomic_set(&conf->r5c_cached_full_stripes, 0); 6980 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6981 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6982 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6983 atomic_set(&conf->r5c_flushing_full_stripes, 0); 6984 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 6985 6986 conf->level = mddev->new_level; 6987 conf->chunk_sectors = mddev->new_chunk_sectors; 6988 if (raid5_alloc_percpu(conf) != 0) 6989 goto abort; 6990 6991 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6992 6993 rdev_for_each(rdev, mddev) { 6994 raid_disk = rdev->raid_disk; 6995 if (raid_disk >= max_disks 6996 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 6997 continue; 6998 disk = conf->disks + raid_disk; 6999 7000 if (test_bit(Replacement, &rdev->flags)) { 7001 if (disk->replacement) 7002 goto abort; 7003 disk->replacement = rdev; 7004 } else { 7005 if (disk->rdev) 7006 goto abort; 7007 disk->rdev = rdev; 7008 } 7009 7010 if (test_bit(In_sync, &rdev->flags)) { 7011 char b[BDEVNAME_SIZE]; 7012 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 7013 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 7014 } else if (rdev->saved_raid_disk != raid_disk) 7015 /* Cannot rely on bitmap to complete recovery */ 7016 conf->fullsync = 1; 7017 } 7018 7019 conf->level = mddev->new_level; 7020 if (conf->level == 6) { 7021 conf->max_degraded = 2; 7022 if (raid6_call.xor_syndrome) 7023 conf->rmw_level = PARITY_ENABLE_RMW; 7024 else 7025 conf->rmw_level = PARITY_DISABLE_RMW; 7026 } else { 7027 conf->max_degraded = 1; 7028 conf->rmw_level = PARITY_ENABLE_RMW; 7029 } 7030 conf->algorithm = mddev->new_layout; 7031 conf->reshape_progress = mddev->reshape_position; 7032 if (conf->reshape_progress != MaxSector) { 7033 conf->prev_chunk_sectors = mddev->chunk_sectors; 7034 conf->prev_algo = mddev->layout; 7035 } else { 7036 conf->prev_chunk_sectors = conf->chunk_sectors; 7037 conf->prev_algo = conf->algorithm; 7038 } 7039 7040 conf->min_nr_stripes = NR_STRIPES; 7041 if (mddev->reshape_position != MaxSector) { 7042 int stripes = max_t(int, 7043 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 7044 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 7045 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7046 if (conf->min_nr_stripes != NR_STRIPES) 7047 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7048 mdname(mddev), conf->min_nr_stripes); 7049 } 7050 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7051 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7052 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7053 if (grow_stripes(conf, conf->min_nr_stripes)) { 7054 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7055 mdname(mddev), memory); 7056 goto abort; 7057 } else 7058 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7059 /* 7060 * Losing a stripe head costs more than the time to refill it, 7061 * it reduces the queue depth and so can hurt throughput. 7062 * So set it rather large, scaled by number of devices. 7063 */ 7064 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7065 conf->shrinker.scan_objects = raid5_cache_scan; 7066 conf->shrinker.count_objects = raid5_cache_count; 7067 conf->shrinker.batch = 128; 7068 conf->shrinker.flags = 0; 7069 if (register_shrinker(&conf->shrinker)) { 7070 pr_warn("md/raid:%s: couldn't register shrinker.\n", 7071 mdname(mddev)); 7072 goto abort; 7073 } 7074 7075 sprintf(pers_name, "raid%d", mddev->new_level); 7076 conf->thread = md_register_thread(raid5d, mddev, pers_name); 7077 if (!conf->thread) { 7078 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7079 mdname(mddev)); 7080 goto abort; 7081 } 7082 7083 return conf; 7084 7085 abort: 7086 if (conf) { 7087 free_conf(conf); 7088 return ERR_PTR(-EIO); 7089 } else 7090 return ERR_PTR(-ENOMEM); 7091 } 7092 7093 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7094 { 7095 switch (algo) { 7096 case ALGORITHM_PARITY_0: 7097 if (raid_disk < max_degraded) 7098 return 1; 7099 break; 7100 case ALGORITHM_PARITY_N: 7101 if (raid_disk >= raid_disks - max_degraded) 7102 return 1; 7103 break; 7104 case ALGORITHM_PARITY_0_6: 7105 if (raid_disk == 0 || 7106 raid_disk == raid_disks - 1) 7107 return 1; 7108 break; 7109 case ALGORITHM_LEFT_ASYMMETRIC_6: 7110 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7111 case ALGORITHM_LEFT_SYMMETRIC_6: 7112 case ALGORITHM_RIGHT_SYMMETRIC_6: 7113 if (raid_disk == raid_disks - 1) 7114 return 1; 7115 } 7116 return 0; 7117 } 7118 7119 static int raid5_run(struct mddev *mddev) 7120 { 7121 struct r5conf *conf; 7122 int working_disks = 0; 7123 int dirty_parity_disks = 0; 7124 struct md_rdev *rdev; 7125 struct md_rdev *journal_dev = NULL; 7126 sector_t reshape_offset = 0; 7127 int i; 7128 long long min_offset_diff = 0; 7129 int first = 1; 7130 7131 if (mddev_init_writes_pending(mddev) < 0) 7132 return -ENOMEM; 7133 7134 if (mddev->recovery_cp != MaxSector) 7135 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7136 mdname(mddev)); 7137 7138 rdev_for_each(rdev, mddev) { 7139 long long diff; 7140 7141 if (test_bit(Journal, &rdev->flags)) { 7142 journal_dev = rdev; 7143 continue; 7144 } 7145 if (rdev->raid_disk < 0) 7146 continue; 7147 diff = (rdev->new_data_offset - rdev->data_offset); 7148 if (first) { 7149 min_offset_diff = diff; 7150 first = 0; 7151 } else if (mddev->reshape_backwards && 7152 diff < min_offset_diff) 7153 min_offset_diff = diff; 7154 else if (!mddev->reshape_backwards && 7155 diff > min_offset_diff) 7156 min_offset_diff = diff; 7157 } 7158 7159 if (mddev->reshape_position != MaxSector) { 7160 /* Check that we can continue the reshape. 7161 * Difficulties arise if the stripe we would write to 7162 * next is at or after the stripe we would read from next. 7163 * For a reshape that changes the number of devices, this 7164 * is only possible for a very short time, and mdadm makes 7165 * sure that time appears to have past before assembling 7166 * the array. So we fail if that time hasn't passed. 7167 * For a reshape that keeps the number of devices the same 7168 * mdadm must be monitoring the reshape can keeping the 7169 * critical areas read-only and backed up. It will start 7170 * the array in read-only mode, so we check for that. 7171 */ 7172 sector_t here_new, here_old; 7173 int old_disks; 7174 int max_degraded = (mddev->level == 6 ? 2 : 1); 7175 int chunk_sectors; 7176 int new_data_disks; 7177 7178 if (journal_dev) { 7179 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 7180 mdname(mddev)); 7181 return -EINVAL; 7182 } 7183 7184 if (mddev->new_level != mddev->level) { 7185 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 7186 mdname(mddev)); 7187 return -EINVAL; 7188 } 7189 old_disks = mddev->raid_disks - mddev->delta_disks; 7190 /* reshape_position must be on a new-stripe boundary, and one 7191 * further up in new geometry must map after here in old 7192 * geometry. 7193 * If the chunk sizes are different, then as we perform reshape 7194 * in units of the largest of the two, reshape_position needs 7195 * be a multiple of the largest chunk size times new data disks. 7196 */ 7197 here_new = mddev->reshape_position; 7198 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 7199 new_data_disks = mddev->raid_disks - max_degraded; 7200 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 7201 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 7202 mdname(mddev)); 7203 return -EINVAL; 7204 } 7205 reshape_offset = here_new * chunk_sectors; 7206 /* here_new is the stripe we will write to */ 7207 here_old = mddev->reshape_position; 7208 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 7209 /* here_old is the first stripe that we might need to read 7210 * from */ 7211 if (mddev->delta_disks == 0) { 7212 /* We cannot be sure it is safe to start an in-place 7213 * reshape. It is only safe if user-space is monitoring 7214 * and taking constant backups. 7215 * mdadm always starts a situation like this in 7216 * readonly mode so it can take control before 7217 * allowing any writes. So just check for that. 7218 */ 7219 if (abs(min_offset_diff) >= mddev->chunk_sectors && 7220 abs(min_offset_diff) >= mddev->new_chunk_sectors) 7221 /* not really in-place - so OK */; 7222 else if (mddev->ro == 0) { 7223 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 7224 mdname(mddev)); 7225 return -EINVAL; 7226 } 7227 } else if (mddev->reshape_backwards 7228 ? (here_new * chunk_sectors + min_offset_diff <= 7229 here_old * chunk_sectors) 7230 : (here_new * chunk_sectors >= 7231 here_old * chunk_sectors + (-min_offset_diff))) { 7232 /* Reading from the same stripe as writing to - bad */ 7233 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7234 mdname(mddev)); 7235 return -EINVAL; 7236 } 7237 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7238 /* OK, we should be able to continue; */ 7239 } else { 7240 BUG_ON(mddev->level != mddev->new_level); 7241 BUG_ON(mddev->layout != mddev->new_layout); 7242 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7243 BUG_ON(mddev->delta_disks != 0); 7244 } 7245 7246 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7247 test_bit(MD_HAS_PPL, &mddev->flags)) { 7248 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7249 mdname(mddev)); 7250 clear_bit(MD_HAS_PPL, &mddev->flags); 7251 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); 7252 } 7253 7254 if (mddev->private == NULL) 7255 conf = setup_conf(mddev); 7256 else 7257 conf = mddev->private; 7258 7259 if (IS_ERR(conf)) 7260 return PTR_ERR(conf); 7261 7262 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7263 if (!journal_dev) { 7264 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7265 mdname(mddev)); 7266 mddev->ro = 1; 7267 set_disk_ro(mddev->gendisk, 1); 7268 } else if (mddev->recovery_cp == MaxSector) 7269 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7270 } 7271 7272 conf->min_offset_diff = min_offset_diff; 7273 mddev->thread = conf->thread; 7274 conf->thread = NULL; 7275 mddev->private = conf; 7276 7277 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7278 i++) { 7279 rdev = conf->disks[i].rdev; 7280 if (!rdev && conf->disks[i].replacement) { 7281 /* The replacement is all we have yet */ 7282 rdev = conf->disks[i].replacement; 7283 conf->disks[i].replacement = NULL; 7284 clear_bit(Replacement, &rdev->flags); 7285 conf->disks[i].rdev = rdev; 7286 } 7287 if (!rdev) 7288 continue; 7289 if (conf->disks[i].replacement && 7290 conf->reshape_progress != MaxSector) { 7291 /* replacements and reshape simply do not mix. */ 7292 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7293 goto abort; 7294 } 7295 if (test_bit(In_sync, &rdev->flags)) { 7296 working_disks++; 7297 continue; 7298 } 7299 /* This disc is not fully in-sync. However if it 7300 * just stored parity (beyond the recovery_offset), 7301 * when we don't need to be concerned about the 7302 * array being dirty. 7303 * When reshape goes 'backwards', we never have 7304 * partially completed devices, so we only need 7305 * to worry about reshape going forwards. 7306 */ 7307 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7308 if (mddev->major_version == 0 && 7309 mddev->minor_version > 90) 7310 rdev->recovery_offset = reshape_offset; 7311 7312 if (rdev->recovery_offset < reshape_offset) { 7313 /* We need to check old and new layout */ 7314 if (!only_parity(rdev->raid_disk, 7315 conf->algorithm, 7316 conf->raid_disks, 7317 conf->max_degraded)) 7318 continue; 7319 } 7320 if (!only_parity(rdev->raid_disk, 7321 conf->prev_algo, 7322 conf->previous_raid_disks, 7323 conf->max_degraded)) 7324 continue; 7325 dirty_parity_disks++; 7326 } 7327 7328 /* 7329 * 0 for a fully functional array, 1 or 2 for a degraded array. 7330 */ 7331 mddev->degraded = raid5_calc_degraded(conf); 7332 7333 if (has_failed(conf)) { 7334 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7335 mdname(mddev), mddev->degraded, conf->raid_disks); 7336 goto abort; 7337 } 7338 7339 /* device size must be a multiple of chunk size */ 7340 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7341 mddev->resync_max_sectors = mddev->dev_sectors; 7342 7343 if (mddev->degraded > dirty_parity_disks && 7344 mddev->recovery_cp != MaxSector) { 7345 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7346 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7347 mdname(mddev)); 7348 else if (mddev->ok_start_degraded) 7349 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7350 mdname(mddev)); 7351 else { 7352 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7353 mdname(mddev)); 7354 goto abort; 7355 } 7356 } 7357 7358 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7359 mdname(mddev), conf->level, 7360 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7361 mddev->new_layout); 7362 7363 print_raid5_conf(conf); 7364 7365 if (conf->reshape_progress != MaxSector) { 7366 conf->reshape_safe = conf->reshape_progress; 7367 atomic_set(&conf->reshape_stripes, 0); 7368 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7369 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7370 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7371 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7372 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7373 "reshape"); 7374 } 7375 7376 /* Ok, everything is just fine now */ 7377 if (mddev->to_remove == &raid5_attrs_group) 7378 mddev->to_remove = NULL; 7379 else if (mddev->kobj.sd && 7380 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7381 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7382 mdname(mddev)); 7383 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7384 7385 if (mddev->queue) { 7386 int chunk_size; 7387 /* read-ahead size must cover two whole stripes, which 7388 * is 2 * (datadisks) * chunksize where 'n' is the 7389 * number of raid devices 7390 */ 7391 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7392 int stripe = data_disks * 7393 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7394 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7395 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7396 7397 chunk_size = mddev->chunk_sectors << 9; 7398 blk_queue_io_min(mddev->queue, chunk_size); 7399 blk_queue_io_opt(mddev->queue, chunk_size * 7400 (conf->raid_disks - conf->max_degraded)); 7401 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7402 /* 7403 * We can only discard a whole stripe. It doesn't make sense to 7404 * discard data disk but write parity disk 7405 */ 7406 stripe = stripe * PAGE_SIZE; 7407 /* Round up to power of 2, as discard handling 7408 * currently assumes that */ 7409 while ((stripe-1) & stripe) 7410 stripe = (stripe | (stripe-1)) + 1; 7411 mddev->queue->limits.discard_alignment = stripe; 7412 mddev->queue->limits.discard_granularity = stripe; 7413 7414 blk_queue_max_write_same_sectors(mddev->queue, 0); 7415 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 7416 7417 rdev_for_each(rdev, mddev) { 7418 disk_stack_limits(mddev->gendisk, rdev->bdev, 7419 rdev->data_offset << 9); 7420 disk_stack_limits(mddev->gendisk, rdev->bdev, 7421 rdev->new_data_offset << 9); 7422 } 7423 7424 /* 7425 * zeroing is required, otherwise data 7426 * could be lost. Consider a scenario: discard a stripe 7427 * (the stripe could be inconsistent if 7428 * discard_zeroes_data is 0); write one disk of the 7429 * stripe (the stripe could be inconsistent again 7430 * depending on which disks are used to calculate 7431 * parity); the disk is broken; The stripe data of this 7432 * disk is lost. 7433 * 7434 * We only allow DISCARD if the sysadmin has confirmed that 7435 * only safe devices are in use by setting a module parameter. 7436 * A better idea might be to turn DISCARD into WRITE_ZEROES 7437 * requests, as that is required to be safe. 7438 */ 7439 if (devices_handle_discard_safely && 7440 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7441 mddev->queue->limits.discard_granularity >= stripe) 7442 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7443 mddev->queue); 7444 else 7445 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7446 mddev->queue); 7447 7448 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7449 } 7450 7451 if (log_init(conf, journal_dev, raid5_has_ppl(conf))) 7452 goto abort; 7453 7454 return 0; 7455 abort: 7456 md_unregister_thread(&mddev->thread); 7457 print_raid5_conf(conf); 7458 free_conf(conf); 7459 mddev->private = NULL; 7460 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7461 return -EIO; 7462 } 7463 7464 static void raid5_free(struct mddev *mddev, void *priv) 7465 { 7466 struct r5conf *conf = priv; 7467 7468 free_conf(conf); 7469 mddev->to_remove = &raid5_attrs_group; 7470 } 7471 7472 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7473 { 7474 struct r5conf *conf = mddev->private; 7475 int i; 7476 7477 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7478 conf->chunk_sectors / 2, mddev->layout); 7479 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7480 rcu_read_lock(); 7481 for (i = 0; i < conf->raid_disks; i++) { 7482 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7483 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 7484 } 7485 rcu_read_unlock(); 7486 seq_printf (seq, "]"); 7487 } 7488 7489 static void print_raid5_conf (struct r5conf *conf) 7490 { 7491 int i; 7492 struct disk_info *tmp; 7493 7494 pr_debug("RAID conf printout:\n"); 7495 if (!conf) { 7496 pr_debug("(conf==NULL)\n"); 7497 return; 7498 } 7499 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7500 conf->raid_disks, 7501 conf->raid_disks - conf->mddev->degraded); 7502 7503 for (i = 0; i < conf->raid_disks; i++) { 7504 char b[BDEVNAME_SIZE]; 7505 tmp = conf->disks + i; 7506 if (tmp->rdev) 7507 pr_debug(" disk %d, o:%d, dev:%s\n", 7508 i, !test_bit(Faulty, &tmp->rdev->flags), 7509 bdevname(tmp->rdev->bdev, b)); 7510 } 7511 } 7512 7513 static int raid5_spare_active(struct mddev *mddev) 7514 { 7515 int i; 7516 struct r5conf *conf = mddev->private; 7517 struct disk_info *tmp; 7518 int count = 0; 7519 unsigned long flags; 7520 7521 for (i = 0; i < conf->raid_disks; i++) { 7522 tmp = conf->disks + i; 7523 if (tmp->replacement 7524 && tmp->replacement->recovery_offset == MaxSector 7525 && !test_bit(Faulty, &tmp->replacement->flags) 7526 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7527 /* Replacement has just become active. */ 7528 if (!tmp->rdev 7529 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7530 count++; 7531 if (tmp->rdev) { 7532 /* Replaced device not technically faulty, 7533 * but we need to be sure it gets removed 7534 * and never re-added. 7535 */ 7536 set_bit(Faulty, &tmp->rdev->flags); 7537 sysfs_notify_dirent_safe( 7538 tmp->rdev->sysfs_state); 7539 } 7540 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7541 } else if (tmp->rdev 7542 && tmp->rdev->recovery_offset == MaxSector 7543 && !test_bit(Faulty, &tmp->rdev->flags) 7544 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7545 count++; 7546 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7547 } 7548 } 7549 spin_lock_irqsave(&conf->device_lock, flags); 7550 mddev->degraded = raid5_calc_degraded(conf); 7551 spin_unlock_irqrestore(&conf->device_lock, flags); 7552 print_raid5_conf(conf); 7553 return count; 7554 } 7555 7556 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7557 { 7558 struct r5conf *conf = mddev->private; 7559 int err = 0; 7560 int number = rdev->raid_disk; 7561 struct md_rdev **rdevp; 7562 struct disk_info *p = conf->disks + number; 7563 7564 print_raid5_conf(conf); 7565 if (test_bit(Journal, &rdev->flags) && conf->log) { 7566 /* 7567 * we can't wait pending write here, as this is called in 7568 * raid5d, wait will deadlock. 7569 * neilb: there is no locking about new writes here, 7570 * so this cannot be safe. 7571 */ 7572 if (atomic_read(&conf->active_stripes) || 7573 atomic_read(&conf->r5c_cached_full_stripes) || 7574 atomic_read(&conf->r5c_cached_partial_stripes)) { 7575 return -EBUSY; 7576 } 7577 log_exit(conf); 7578 return 0; 7579 } 7580 if (rdev == p->rdev) 7581 rdevp = &p->rdev; 7582 else if (rdev == p->replacement) 7583 rdevp = &p->replacement; 7584 else 7585 return 0; 7586 7587 if (number >= conf->raid_disks && 7588 conf->reshape_progress == MaxSector) 7589 clear_bit(In_sync, &rdev->flags); 7590 7591 if (test_bit(In_sync, &rdev->flags) || 7592 atomic_read(&rdev->nr_pending)) { 7593 err = -EBUSY; 7594 goto abort; 7595 } 7596 /* Only remove non-faulty devices if recovery 7597 * isn't possible. 7598 */ 7599 if (!test_bit(Faulty, &rdev->flags) && 7600 mddev->recovery_disabled != conf->recovery_disabled && 7601 !has_failed(conf) && 7602 (!p->replacement || p->replacement == rdev) && 7603 number < conf->raid_disks) { 7604 err = -EBUSY; 7605 goto abort; 7606 } 7607 *rdevp = NULL; 7608 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7609 synchronize_rcu(); 7610 if (atomic_read(&rdev->nr_pending)) { 7611 /* lost the race, try later */ 7612 err = -EBUSY; 7613 *rdevp = rdev; 7614 } 7615 } 7616 if (!err) { 7617 err = log_modify(conf, rdev, false); 7618 if (err) 7619 goto abort; 7620 } 7621 if (p->replacement) { 7622 /* We must have just cleared 'rdev' */ 7623 p->rdev = p->replacement; 7624 clear_bit(Replacement, &p->replacement->flags); 7625 smp_mb(); /* Make sure other CPUs may see both as identical 7626 * but will never see neither - if they are careful 7627 */ 7628 p->replacement = NULL; 7629 7630 if (!err) 7631 err = log_modify(conf, p->rdev, true); 7632 } 7633 7634 clear_bit(WantReplacement, &rdev->flags); 7635 abort: 7636 7637 print_raid5_conf(conf); 7638 return err; 7639 } 7640 7641 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7642 { 7643 struct r5conf *conf = mddev->private; 7644 int err = -EEXIST; 7645 int disk; 7646 struct disk_info *p; 7647 int first = 0; 7648 int last = conf->raid_disks - 1; 7649 7650 if (test_bit(Journal, &rdev->flags)) { 7651 if (conf->log) 7652 return -EBUSY; 7653 7654 rdev->raid_disk = 0; 7655 /* 7656 * The array is in readonly mode if journal is missing, so no 7657 * write requests running. We should be safe 7658 */ 7659 log_init(conf, rdev, false); 7660 return 0; 7661 } 7662 if (mddev->recovery_disabled == conf->recovery_disabled) 7663 return -EBUSY; 7664 7665 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7666 /* no point adding a device */ 7667 return -EINVAL; 7668 7669 if (rdev->raid_disk >= 0) 7670 first = last = rdev->raid_disk; 7671 7672 /* 7673 * find the disk ... but prefer rdev->saved_raid_disk 7674 * if possible. 7675 */ 7676 if (rdev->saved_raid_disk >= 0 && 7677 rdev->saved_raid_disk >= first && 7678 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7679 first = rdev->saved_raid_disk; 7680 7681 for (disk = first; disk <= last; disk++) { 7682 p = conf->disks + disk; 7683 if (p->rdev == NULL) { 7684 clear_bit(In_sync, &rdev->flags); 7685 rdev->raid_disk = disk; 7686 if (rdev->saved_raid_disk != disk) 7687 conf->fullsync = 1; 7688 rcu_assign_pointer(p->rdev, rdev); 7689 7690 err = log_modify(conf, rdev, true); 7691 7692 goto out; 7693 } 7694 } 7695 for (disk = first; disk <= last; disk++) { 7696 p = conf->disks + disk; 7697 if (test_bit(WantReplacement, &p->rdev->flags) && 7698 p->replacement == NULL) { 7699 clear_bit(In_sync, &rdev->flags); 7700 set_bit(Replacement, &rdev->flags); 7701 rdev->raid_disk = disk; 7702 err = 0; 7703 conf->fullsync = 1; 7704 rcu_assign_pointer(p->replacement, rdev); 7705 break; 7706 } 7707 } 7708 out: 7709 print_raid5_conf(conf); 7710 return err; 7711 } 7712 7713 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7714 { 7715 /* no resync is happening, and there is enough space 7716 * on all devices, so we can resize. 7717 * We need to make sure resync covers any new space. 7718 * If the array is shrinking we should possibly wait until 7719 * any io in the removed space completes, but it hardly seems 7720 * worth it. 7721 */ 7722 sector_t newsize; 7723 struct r5conf *conf = mddev->private; 7724 7725 if (conf->log || raid5_has_ppl(conf)) 7726 return -EINVAL; 7727 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7728 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7729 if (mddev->external_size && 7730 mddev->array_sectors > newsize) 7731 return -EINVAL; 7732 if (mddev->bitmap) { 7733 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 7734 if (ret) 7735 return ret; 7736 } 7737 md_set_array_sectors(mddev, newsize); 7738 if (sectors > mddev->dev_sectors && 7739 mddev->recovery_cp > mddev->dev_sectors) { 7740 mddev->recovery_cp = mddev->dev_sectors; 7741 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7742 } 7743 mddev->dev_sectors = sectors; 7744 mddev->resync_max_sectors = sectors; 7745 return 0; 7746 } 7747 7748 static int check_stripe_cache(struct mddev *mddev) 7749 { 7750 /* Can only proceed if there are plenty of stripe_heads. 7751 * We need a minimum of one full stripe,, and for sensible progress 7752 * it is best to have about 4 times that. 7753 * If we require 4 times, then the default 256 4K stripe_heads will 7754 * allow for chunk sizes up to 256K, which is probably OK. 7755 * If the chunk size is greater, user-space should request more 7756 * stripe_heads first. 7757 */ 7758 struct r5conf *conf = mddev->private; 7759 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7760 > conf->min_nr_stripes || 7761 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7762 > conf->min_nr_stripes) { 7763 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7764 mdname(mddev), 7765 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7766 / STRIPE_SIZE)*4); 7767 return 0; 7768 } 7769 return 1; 7770 } 7771 7772 static int check_reshape(struct mddev *mddev) 7773 { 7774 struct r5conf *conf = mddev->private; 7775 7776 if (conf->log || raid5_has_ppl(conf)) 7777 return -EINVAL; 7778 if (mddev->delta_disks == 0 && 7779 mddev->new_layout == mddev->layout && 7780 mddev->new_chunk_sectors == mddev->chunk_sectors) 7781 return 0; /* nothing to do */ 7782 if (has_failed(conf)) 7783 return -EINVAL; 7784 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7785 /* We might be able to shrink, but the devices must 7786 * be made bigger first. 7787 * For raid6, 4 is the minimum size. 7788 * Otherwise 2 is the minimum 7789 */ 7790 int min = 2; 7791 if (mddev->level == 6) 7792 min = 4; 7793 if (mddev->raid_disks + mddev->delta_disks < min) 7794 return -EINVAL; 7795 } 7796 7797 if (!check_stripe_cache(mddev)) 7798 return -ENOSPC; 7799 7800 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7801 mddev->delta_disks > 0) 7802 if (resize_chunks(conf, 7803 conf->previous_raid_disks 7804 + max(0, mddev->delta_disks), 7805 max(mddev->new_chunk_sectors, 7806 mddev->chunk_sectors) 7807 ) < 0) 7808 return -ENOMEM; 7809 7810 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) 7811 return 0; /* never bother to shrink */ 7812 return resize_stripes(conf, (conf->previous_raid_disks 7813 + mddev->delta_disks)); 7814 } 7815 7816 static int raid5_start_reshape(struct mddev *mddev) 7817 { 7818 struct r5conf *conf = mddev->private; 7819 struct md_rdev *rdev; 7820 int spares = 0; 7821 unsigned long flags; 7822 7823 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7824 return -EBUSY; 7825 7826 if (!check_stripe_cache(mddev)) 7827 return -ENOSPC; 7828 7829 if (has_failed(conf)) 7830 return -EINVAL; 7831 7832 rdev_for_each(rdev, mddev) { 7833 if (!test_bit(In_sync, &rdev->flags) 7834 && !test_bit(Faulty, &rdev->flags)) 7835 spares++; 7836 } 7837 7838 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7839 /* Not enough devices even to make a degraded array 7840 * of that size 7841 */ 7842 return -EINVAL; 7843 7844 /* Refuse to reduce size of the array. Any reductions in 7845 * array size must be through explicit setting of array_size 7846 * attribute. 7847 */ 7848 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7849 < mddev->array_sectors) { 7850 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 7851 mdname(mddev)); 7852 return -EINVAL; 7853 } 7854 7855 atomic_set(&conf->reshape_stripes, 0); 7856 spin_lock_irq(&conf->device_lock); 7857 write_seqcount_begin(&conf->gen_lock); 7858 conf->previous_raid_disks = conf->raid_disks; 7859 conf->raid_disks += mddev->delta_disks; 7860 conf->prev_chunk_sectors = conf->chunk_sectors; 7861 conf->chunk_sectors = mddev->new_chunk_sectors; 7862 conf->prev_algo = conf->algorithm; 7863 conf->algorithm = mddev->new_layout; 7864 conf->generation++; 7865 /* Code that selects data_offset needs to see the generation update 7866 * if reshape_progress has been set - so a memory barrier needed. 7867 */ 7868 smp_mb(); 7869 if (mddev->reshape_backwards) 7870 conf->reshape_progress = raid5_size(mddev, 0, 0); 7871 else 7872 conf->reshape_progress = 0; 7873 conf->reshape_safe = conf->reshape_progress; 7874 write_seqcount_end(&conf->gen_lock); 7875 spin_unlock_irq(&conf->device_lock); 7876 7877 /* Now make sure any requests that proceeded on the assumption 7878 * the reshape wasn't running - like Discard or Read - have 7879 * completed. 7880 */ 7881 mddev_suspend(mddev); 7882 mddev_resume(mddev); 7883 7884 /* Add some new drives, as many as will fit. 7885 * We know there are enough to make the newly sized array work. 7886 * Don't add devices if we are reducing the number of 7887 * devices in the array. This is because it is not possible 7888 * to correctly record the "partially reconstructed" state of 7889 * such devices during the reshape and confusion could result. 7890 */ 7891 if (mddev->delta_disks >= 0) { 7892 rdev_for_each(rdev, mddev) 7893 if (rdev->raid_disk < 0 && 7894 !test_bit(Faulty, &rdev->flags)) { 7895 if (raid5_add_disk(mddev, rdev) == 0) { 7896 if (rdev->raid_disk 7897 >= conf->previous_raid_disks) 7898 set_bit(In_sync, &rdev->flags); 7899 else 7900 rdev->recovery_offset = 0; 7901 7902 if (sysfs_link_rdev(mddev, rdev)) 7903 /* Failure here is OK */; 7904 } 7905 } else if (rdev->raid_disk >= conf->previous_raid_disks 7906 && !test_bit(Faulty, &rdev->flags)) { 7907 /* This is a spare that was manually added */ 7908 set_bit(In_sync, &rdev->flags); 7909 } 7910 7911 /* When a reshape changes the number of devices, 7912 * ->degraded is measured against the larger of the 7913 * pre and post number of devices. 7914 */ 7915 spin_lock_irqsave(&conf->device_lock, flags); 7916 mddev->degraded = raid5_calc_degraded(conf); 7917 spin_unlock_irqrestore(&conf->device_lock, flags); 7918 } 7919 mddev->raid_disks = conf->raid_disks; 7920 mddev->reshape_position = conf->reshape_progress; 7921 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7922 7923 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7924 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7925 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7926 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7927 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7928 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7929 "reshape"); 7930 if (!mddev->sync_thread) { 7931 mddev->recovery = 0; 7932 spin_lock_irq(&conf->device_lock); 7933 write_seqcount_begin(&conf->gen_lock); 7934 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 7935 mddev->new_chunk_sectors = 7936 conf->chunk_sectors = conf->prev_chunk_sectors; 7937 mddev->new_layout = conf->algorithm = conf->prev_algo; 7938 rdev_for_each(rdev, mddev) 7939 rdev->new_data_offset = rdev->data_offset; 7940 smp_wmb(); 7941 conf->generation --; 7942 conf->reshape_progress = MaxSector; 7943 mddev->reshape_position = MaxSector; 7944 write_seqcount_end(&conf->gen_lock); 7945 spin_unlock_irq(&conf->device_lock); 7946 return -EAGAIN; 7947 } 7948 conf->reshape_checkpoint = jiffies; 7949 md_wakeup_thread(mddev->sync_thread); 7950 md_new_event(mddev); 7951 return 0; 7952 } 7953 7954 /* This is called from the reshape thread and should make any 7955 * changes needed in 'conf' 7956 */ 7957 static void end_reshape(struct r5conf *conf) 7958 { 7959 7960 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 7961 7962 spin_lock_irq(&conf->device_lock); 7963 conf->previous_raid_disks = conf->raid_disks; 7964 md_finish_reshape(conf->mddev); 7965 smp_wmb(); 7966 conf->reshape_progress = MaxSector; 7967 conf->mddev->reshape_position = MaxSector; 7968 spin_unlock_irq(&conf->device_lock); 7969 wake_up(&conf->wait_for_overlap); 7970 7971 /* read-ahead size must cover two whole stripes, which is 7972 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 7973 */ 7974 if (conf->mddev->queue) { 7975 int data_disks = conf->raid_disks - conf->max_degraded; 7976 int stripe = data_disks * ((conf->chunk_sectors << 9) 7977 / PAGE_SIZE); 7978 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7979 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7980 } 7981 } 7982 } 7983 7984 /* This is called from the raid5d thread with mddev_lock held. 7985 * It makes config changes to the device. 7986 */ 7987 static void raid5_finish_reshape(struct mddev *mddev) 7988 { 7989 struct r5conf *conf = mddev->private; 7990 7991 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7992 7993 if (mddev->delta_disks > 0) { 7994 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7995 if (mddev->queue) { 7996 set_capacity(mddev->gendisk, mddev->array_sectors); 7997 revalidate_disk(mddev->gendisk); 7998 } 7999 } else { 8000 int d; 8001 spin_lock_irq(&conf->device_lock); 8002 mddev->degraded = raid5_calc_degraded(conf); 8003 spin_unlock_irq(&conf->device_lock); 8004 for (d = conf->raid_disks ; 8005 d < conf->raid_disks - mddev->delta_disks; 8006 d++) { 8007 struct md_rdev *rdev = conf->disks[d].rdev; 8008 if (rdev) 8009 clear_bit(In_sync, &rdev->flags); 8010 rdev = conf->disks[d].replacement; 8011 if (rdev) 8012 clear_bit(In_sync, &rdev->flags); 8013 } 8014 } 8015 mddev->layout = conf->algorithm; 8016 mddev->chunk_sectors = conf->chunk_sectors; 8017 mddev->reshape_position = MaxSector; 8018 mddev->delta_disks = 0; 8019 mddev->reshape_backwards = 0; 8020 } 8021 } 8022 8023 static void raid5_quiesce(struct mddev *mddev, int state) 8024 { 8025 struct r5conf *conf = mddev->private; 8026 8027 switch(state) { 8028 case 2: /* resume for a suspend */ 8029 wake_up(&conf->wait_for_overlap); 8030 break; 8031 8032 case 1: /* stop all writes */ 8033 lock_all_device_hash_locks_irq(conf); 8034 /* '2' tells resync/reshape to pause so that all 8035 * active stripes can drain 8036 */ 8037 r5c_flush_cache(conf, INT_MAX); 8038 conf->quiesce = 2; 8039 wait_event_cmd(conf->wait_for_quiescent, 8040 atomic_read(&conf->active_stripes) == 0 && 8041 atomic_read(&conf->active_aligned_reads) == 0, 8042 unlock_all_device_hash_locks_irq(conf), 8043 lock_all_device_hash_locks_irq(conf)); 8044 conf->quiesce = 1; 8045 unlock_all_device_hash_locks_irq(conf); 8046 /* allow reshape to continue */ 8047 wake_up(&conf->wait_for_overlap); 8048 break; 8049 8050 case 0: /* re-enable writes */ 8051 lock_all_device_hash_locks_irq(conf); 8052 conf->quiesce = 0; 8053 wake_up(&conf->wait_for_quiescent); 8054 wake_up(&conf->wait_for_overlap); 8055 unlock_all_device_hash_locks_irq(conf); 8056 break; 8057 } 8058 r5l_quiesce(conf->log, state); 8059 } 8060 8061 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8062 { 8063 struct r0conf *raid0_conf = mddev->private; 8064 sector_t sectors; 8065 8066 /* for raid0 takeover only one zone is supported */ 8067 if (raid0_conf->nr_strip_zones > 1) { 8068 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8069 mdname(mddev)); 8070 return ERR_PTR(-EINVAL); 8071 } 8072 8073 sectors = raid0_conf->strip_zone[0].zone_end; 8074 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8075 mddev->dev_sectors = sectors; 8076 mddev->new_level = level; 8077 mddev->new_layout = ALGORITHM_PARITY_N; 8078 mddev->new_chunk_sectors = mddev->chunk_sectors; 8079 mddev->raid_disks += 1; 8080 mddev->delta_disks = 1; 8081 /* make sure it will be not marked as dirty */ 8082 mddev->recovery_cp = MaxSector; 8083 8084 return setup_conf(mddev); 8085 } 8086 8087 static void *raid5_takeover_raid1(struct mddev *mddev) 8088 { 8089 int chunksect; 8090 void *ret; 8091 8092 if (mddev->raid_disks != 2 || 8093 mddev->degraded > 1) 8094 return ERR_PTR(-EINVAL); 8095 8096 /* Should check if there are write-behind devices? */ 8097 8098 chunksect = 64*2; /* 64K by default */ 8099 8100 /* The array must be an exact multiple of chunksize */ 8101 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8102 chunksect >>= 1; 8103 8104 if ((chunksect<<9) < STRIPE_SIZE) 8105 /* array size does not allow a suitable chunk size */ 8106 return ERR_PTR(-EINVAL); 8107 8108 mddev->new_level = 5; 8109 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8110 mddev->new_chunk_sectors = chunksect; 8111 8112 ret = setup_conf(mddev); 8113 if (!IS_ERR(ret)) 8114 mddev_clear_unsupported_flags(mddev, 8115 UNSUPPORTED_MDDEV_FLAGS); 8116 return ret; 8117 } 8118 8119 static void *raid5_takeover_raid6(struct mddev *mddev) 8120 { 8121 int new_layout; 8122 8123 switch (mddev->layout) { 8124 case ALGORITHM_LEFT_ASYMMETRIC_6: 8125 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8126 break; 8127 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8128 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8129 break; 8130 case ALGORITHM_LEFT_SYMMETRIC_6: 8131 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8132 break; 8133 case ALGORITHM_RIGHT_SYMMETRIC_6: 8134 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8135 break; 8136 case ALGORITHM_PARITY_0_6: 8137 new_layout = ALGORITHM_PARITY_0; 8138 break; 8139 case ALGORITHM_PARITY_N: 8140 new_layout = ALGORITHM_PARITY_N; 8141 break; 8142 default: 8143 return ERR_PTR(-EINVAL); 8144 } 8145 mddev->new_level = 5; 8146 mddev->new_layout = new_layout; 8147 mddev->delta_disks = -1; 8148 mddev->raid_disks -= 1; 8149 return setup_conf(mddev); 8150 } 8151 8152 static int raid5_check_reshape(struct mddev *mddev) 8153 { 8154 /* For a 2-drive array, the layout and chunk size can be changed 8155 * immediately as not restriping is needed. 8156 * For larger arrays we record the new value - after validation 8157 * to be used by a reshape pass. 8158 */ 8159 struct r5conf *conf = mddev->private; 8160 int new_chunk = mddev->new_chunk_sectors; 8161 8162 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8163 return -EINVAL; 8164 if (new_chunk > 0) { 8165 if (!is_power_of_2(new_chunk)) 8166 return -EINVAL; 8167 if (new_chunk < (PAGE_SIZE>>9)) 8168 return -EINVAL; 8169 if (mddev->array_sectors & (new_chunk-1)) 8170 /* not factor of array size */ 8171 return -EINVAL; 8172 } 8173 8174 /* They look valid */ 8175 8176 if (mddev->raid_disks == 2) { 8177 /* can make the change immediately */ 8178 if (mddev->new_layout >= 0) { 8179 conf->algorithm = mddev->new_layout; 8180 mddev->layout = mddev->new_layout; 8181 } 8182 if (new_chunk > 0) { 8183 conf->chunk_sectors = new_chunk ; 8184 mddev->chunk_sectors = new_chunk; 8185 } 8186 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8187 md_wakeup_thread(mddev->thread); 8188 } 8189 return check_reshape(mddev); 8190 } 8191 8192 static int raid6_check_reshape(struct mddev *mddev) 8193 { 8194 int new_chunk = mddev->new_chunk_sectors; 8195 8196 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8197 return -EINVAL; 8198 if (new_chunk > 0) { 8199 if (!is_power_of_2(new_chunk)) 8200 return -EINVAL; 8201 if (new_chunk < (PAGE_SIZE >> 9)) 8202 return -EINVAL; 8203 if (mddev->array_sectors & (new_chunk-1)) 8204 /* not factor of array size */ 8205 return -EINVAL; 8206 } 8207 8208 /* They look valid */ 8209 return check_reshape(mddev); 8210 } 8211 8212 static void *raid5_takeover(struct mddev *mddev) 8213 { 8214 /* raid5 can take over: 8215 * raid0 - if there is only one strip zone - make it a raid4 layout 8216 * raid1 - if there are two drives. We need to know the chunk size 8217 * raid4 - trivial - just use a raid4 layout. 8218 * raid6 - Providing it is a *_6 layout 8219 */ 8220 if (mddev->level == 0) 8221 return raid45_takeover_raid0(mddev, 5); 8222 if (mddev->level == 1) 8223 return raid5_takeover_raid1(mddev); 8224 if (mddev->level == 4) { 8225 mddev->new_layout = ALGORITHM_PARITY_N; 8226 mddev->new_level = 5; 8227 return setup_conf(mddev); 8228 } 8229 if (mddev->level == 6) 8230 return raid5_takeover_raid6(mddev); 8231 8232 return ERR_PTR(-EINVAL); 8233 } 8234 8235 static void *raid4_takeover(struct mddev *mddev) 8236 { 8237 /* raid4 can take over: 8238 * raid0 - if there is only one strip zone 8239 * raid5 - if layout is right 8240 */ 8241 if (mddev->level == 0) 8242 return raid45_takeover_raid0(mddev, 4); 8243 if (mddev->level == 5 && 8244 mddev->layout == ALGORITHM_PARITY_N) { 8245 mddev->new_layout = 0; 8246 mddev->new_level = 4; 8247 return setup_conf(mddev); 8248 } 8249 return ERR_PTR(-EINVAL); 8250 } 8251 8252 static struct md_personality raid5_personality; 8253 8254 static void *raid6_takeover(struct mddev *mddev) 8255 { 8256 /* Currently can only take over a raid5. We map the 8257 * personality to an equivalent raid6 personality 8258 * with the Q block at the end. 8259 */ 8260 int new_layout; 8261 8262 if (mddev->pers != &raid5_personality) 8263 return ERR_PTR(-EINVAL); 8264 if (mddev->degraded > 1) 8265 return ERR_PTR(-EINVAL); 8266 if (mddev->raid_disks > 253) 8267 return ERR_PTR(-EINVAL); 8268 if (mddev->raid_disks < 3) 8269 return ERR_PTR(-EINVAL); 8270 8271 switch (mddev->layout) { 8272 case ALGORITHM_LEFT_ASYMMETRIC: 8273 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8274 break; 8275 case ALGORITHM_RIGHT_ASYMMETRIC: 8276 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8277 break; 8278 case ALGORITHM_LEFT_SYMMETRIC: 8279 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8280 break; 8281 case ALGORITHM_RIGHT_SYMMETRIC: 8282 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8283 break; 8284 case ALGORITHM_PARITY_0: 8285 new_layout = ALGORITHM_PARITY_0_6; 8286 break; 8287 case ALGORITHM_PARITY_N: 8288 new_layout = ALGORITHM_PARITY_N; 8289 break; 8290 default: 8291 return ERR_PTR(-EINVAL); 8292 } 8293 mddev->new_level = 6; 8294 mddev->new_layout = new_layout; 8295 mddev->delta_disks = 1; 8296 mddev->raid_disks += 1; 8297 return setup_conf(mddev); 8298 } 8299 8300 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8301 { 8302 struct r5conf *conf; 8303 int err; 8304 8305 err = mddev_lock(mddev); 8306 if (err) 8307 return err; 8308 conf = mddev->private; 8309 if (!conf) { 8310 mddev_unlock(mddev); 8311 return -ENODEV; 8312 } 8313 8314 if (strncmp(buf, "ppl", 3) == 0) { 8315 /* ppl only works with RAID 5 */ 8316 if (!raid5_has_ppl(conf) && conf->level == 5) { 8317 err = log_init(conf, NULL, true); 8318 if (!err) { 8319 err = resize_stripes(conf, conf->pool_size); 8320 if (err) 8321 log_exit(conf); 8322 } 8323 } else 8324 err = -EINVAL; 8325 } else if (strncmp(buf, "resync", 6) == 0) { 8326 if (raid5_has_ppl(conf)) { 8327 mddev_suspend(mddev); 8328 log_exit(conf); 8329 mddev_resume(mddev); 8330 err = resize_stripes(conf, conf->pool_size); 8331 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 8332 r5l_log_disk_error(conf)) { 8333 bool journal_dev_exists = false; 8334 struct md_rdev *rdev; 8335 8336 rdev_for_each(rdev, mddev) 8337 if (test_bit(Journal, &rdev->flags)) { 8338 journal_dev_exists = true; 8339 break; 8340 } 8341 8342 if (!journal_dev_exists) { 8343 mddev_suspend(mddev); 8344 clear_bit(MD_HAS_JOURNAL, &mddev->flags); 8345 mddev_resume(mddev); 8346 } else /* need remove journal device first */ 8347 err = -EBUSY; 8348 } else 8349 err = -EINVAL; 8350 } else { 8351 err = -EINVAL; 8352 } 8353 8354 if (!err) 8355 md_update_sb(mddev, 1); 8356 8357 mddev_unlock(mddev); 8358 8359 return err; 8360 } 8361 8362 static struct md_personality raid6_personality = 8363 { 8364 .name = "raid6", 8365 .level = 6, 8366 .owner = THIS_MODULE, 8367 .make_request = raid5_make_request, 8368 .run = raid5_run, 8369 .free = raid5_free, 8370 .status = raid5_status, 8371 .error_handler = raid5_error, 8372 .hot_add_disk = raid5_add_disk, 8373 .hot_remove_disk= raid5_remove_disk, 8374 .spare_active = raid5_spare_active, 8375 .sync_request = raid5_sync_request, 8376 .resize = raid5_resize, 8377 .size = raid5_size, 8378 .check_reshape = raid6_check_reshape, 8379 .start_reshape = raid5_start_reshape, 8380 .finish_reshape = raid5_finish_reshape, 8381 .quiesce = raid5_quiesce, 8382 .takeover = raid6_takeover, 8383 .congested = raid5_congested, 8384 .change_consistency_policy = raid5_change_consistency_policy, 8385 }; 8386 static struct md_personality raid5_personality = 8387 { 8388 .name = "raid5", 8389 .level = 5, 8390 .owner = THIS_MODULE, 8391 .make_request = raid5_make_request, 8392 .run = raid5_run, 8393 .free = raid5_free, 8394 .status = raid5_status, 8395 .error_handler = raid5_error, 8396 .hot_add_disk = raid5_add_disk, 8397 .hot_remove_disk= raid5_remove_disk, 8398 .spare_active = raid5_spare_active, 8399 .sync_request = raid5_sync_request, 8400 .resize = raid5_resize, 8401 .size = raid5_size, 8402 .check_reshape = raid5_check_reshape, 8403 .start_reshape = raid5_start_reshape, 8404 .finish_reshape = raid5_finish_reshape, 8405 .quiesce = raid5_quiesce, 8406 .takeover = raid5_takeover, 8407 .congested = raid5_congested, 8408 .change_consistency_policy = raid5_change_consistency_policy, 8409 }; 8410 8411 static struct md_personality raid4_personality = 8412 { 8413 .name = "raid4", 8414 .level = 4, 8415 .owner = THIS_MODULE, 8416 .make_request = raid5_make_request, 8417 .run = raid5_run, 8418 .free = raid5_free, 8419 .status = raid5_status, 8420 .error_handler = raid5_error, 8421 .hot_add_disk = raid5_add_disk, 8422 .hot_remove_disk= raid5_remove_disk, 8423 .spare_active = raid5_spare_active, 8424 .sync_request = raid5_sync_request, 8425 .resize = raid5_resize, 8426 .size = raid5_size, 8427 .check_reshape = raid5_check_reshape, 8428 .start_reshape = raid5_start_reshape, 8429 .finish_reshape = raid5_finish_reshape, 8430 .quiesce = raid5_quiesce, 8431 .takeover = raid4_takeover, 8432 .congested = raid5_congested, 8433 .change_consistency_policy = raid5_change_consistency_policy, 8434 }; 8435 8436 static int __init raid5_init(void) 8437 { 8438 int ret; 8439 8440 raid5_wq = alloc_workqueue("raid5wq", 8441 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 8442 if (!raid5_wq) 8443 return -ENOMEM; 8444 8445 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 8446 "md/raid5:prepare", 8447 raid456_cpu_up_prepare, 8448 raid456_cpu_dead); 8449 if (ret) { 8450 destroy_workqueue(raid5_wq); 8451 return ret; 8452 } 8453 register_md_personality(&raid6_personality); 8454 register_md_personality(&raid5_personality); 8455 register_md_personality(&raid4_personality); 8456 return 0; 8457 } 8458 8459 static void raid5_exit(void) 8460 { 8461 unregister_md_personality(&raid6_personality); 8462 unregister_md_personality(&raid5_personality); 8463 unregister_md_personality(&raid4_personality); 8464 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 8465 destroy_workqueue(raid5_wq); 8466 } 8467 8468 module_init(raid5_init); 8469 module_exit(raid5_exit); 8470 MODULE_LICENSE("GPL"); 8471 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 8472 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 8473 MODULE_ALIAS("md-raid5"); 8474 MODULE_ALIAS("md-raid4"); 8475 MODULE_ALIAS("md-level-5"); 8476 MODULE_ALIAS("md-level-4"); 8477 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 8478 MODULE_ALIAS("md-raid6"); 8479 MODULE_ALIAS("md-level-6"); 8480 8481 /* This used to be two separate modules, they were: */ 8482 MODULE_ALIAS("raid5"); 8483 MODULE_ALIAS("raid6"); 8484