/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 * we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 * batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
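 *
 * An illustrative walk-through (the batch numbers here are made up):
 * suppose conf->seq_flush == 8 and conf->seq_write == 7.  A write queued
 * now gets sh->bm_seq = 9 (seq_flush + 1), so the stripe is parked on
 * bitmap_list until seq_write reaches 9, i.e. until an unplug closes
 * batch 9 and the bitmap updates for batches 8 and 9 have been written.
 * Only then is the data write issued, which is what keeps each bit on
 * disk before the data it covers.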
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>

#include <trace/events/block.h>
#include <linux/list_sort.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "md-bitmap.h"
#include "raid5-log.h"

#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_lock_irq(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
		spin_unlock(conf->hash_locks + i);
	spin_unlock_irq(conf->hash_locks);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
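 *
 * For example (md layout, not DDF), with sh->disks == 5, pd_idx == 3 and
 * qd_idx == 4: raid6_d0() returns 0, and walking disks 0,1,2,3,4 through
 * raid6_idx_to_slot() yields slots 0,1,2 for the data disks, slot 3
 * (== syndrome_disks) for P and slot 4 for Q.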
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void print_raid5_conf(struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static bool stripe_is_lowprio(struct stripe_head *sh)
{
	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		if (stripe_is_lowprio(sh))
			list_add_tail(&sh->lru, &group->loprio_list);
		else
			list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes) == 0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * In the following cases, the stripe cannot be released to cached
	 * lists.  Therefore, we make the stripe write out and set
	 * STRIPE_HANDLE:
	 *   1. when quiesce in r5c write back;
	 *   2. when resync is requested for the stripe.
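	 *      (In both cases the data has to reach the member disks, so
	 *      the stripe cannot stay on the r5c cached lists.)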
	 */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
	    (conf->quiesce && r5c_is_writeback(conf->log) &&
	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				if (stripe_is_lowprio(sh))
					list_add_tail(&sh->lru,
						      &conf->loprio_list);
				else
					list_add_tail(&sh->lru,
						      &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else
					/*
					 * STRIPE_R5C_PARTIAL_STRIPE is set in
					 * r5c_try_caching_write(). No need to
					 * set it again.
					 */
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is an
 * array of NR_STRIPE_HASH_LOCKS lists, one per hash value.
 *
 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 * a given time.  Adding stripes only takes the device lock, while deleting
 * stripes only takes the hash lock.
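 *
 * When the whole array is passed, release_inactive_stripe_list() below
 * splices the lists back one hash bucket at a time, from hash
 * NR_STRIPE_HASH_LOCKS - 1 down to 0, taking only that bucket's hash lock
 * for each splice, and wakes any waiters once at the end.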
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, raid5_get_active_stripe()
		 * might remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh, *t;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	llist_for_each_entry_safe(sh, t, head, release_list) {
		int hash;

		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry if the bit is set here, because if it is set
		 * again, the count is always > 1.  This is true for the
		 * STRIPE_ON_UNPLUG_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

void raid5_release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference. */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;

	if (unlikely(!conf->mddev->thread) ||
	    test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	local_irq_save(flags);
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock(&conf->device_lock);
		release_inactive_stripe_list(conf, &list, hash);
	}
	local_irq_restore(flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}

/* find an idle stripe, make sure it is unhashed, and return it.
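 * Note: callers (e.g. raid5_get_active_stripe() below) hold the per-hash
 * lock for @hash around this call.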
*/ 443 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 444 { 445 struct stripe_head *sh = NULL; 446 struct list_head *first; 447 448 if (list_empty(conf->inactive_list + hash)) 449 goto out; 450 first = (conf->inactive_list + hash)->next; 451 sh = list_entry(first, struct stripe_head, lru); 452 list_del_init(first); 453 remove_hash(sh); 454 atomic_inc(&conf->active_stripes); 455 BUG_ON(hash != sh->hash_lock_index); 456 if (list_empty(conf->inactive_list + hash)) 457 atomic_inc(&conf->empty_inactive_list_nr); 458 out: 459 return sh; 460 } 461 462 static void shrink_buffers(struct stripe_head *sh) 463 { 464 struct page *p; 465 int i; 466 int num = sh->raid_conf->pool_size; 467 468 for (i = 0; i < num ; i++) { 469 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 470 p = sh->dev[i].page; 471 if (!p) 472 continue; 473 sh->dev[i].page = NULL; 474 put_page(p); 475 } 476 } 477 478 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 479 { 480 int i; 481 int num = sh->raid_conf->pool_size; 482 483 for (i = 0; i < num; i++) { 484 struct page *page; 485 486 if (!(page = alloc_page(gfp))) { 487 return 1; 488 } 489 sh->dev[i].page = page; 490 sh->dev[i].orig_page = page; 491 } 492 493 return 0; 494 } 495 496 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 497 struct stripe_head *sh); 498 499 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 500 { 501 struct r5conf *conf = sh->raid_conf; 502 int i, seq; 503 504 BUG_ON(atomic_read(&sh->count) != 0); 505 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 506 BUG_ON(stripe_operations_active(sh)); 507 BUG_ON(sh->batch_head); 508 509 pr_debug("init_stripe called, stripe %llu\n", 510 (unsigned long long)sector); 511 retry: 512 seq = read_seqcount_begin(&conf->gen_lock); 513 sh->generation = conf->generation - previous; 514 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 515 sh->sector = sector; 516 stripe_set_idx(sector, conf, previous, sh); 517 sh->state = 0; 518 519 for (i = sh->disks; i--; ) { 520 struct r5dev *dev = &sh->dev[i]; 521 522 if (dev->toread || dev->read || dev->towrite || dev->written || 523 test_bit(R5_LOCKED, &dev->flags)) { 524 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 525 (unsigned long long)sh->sector, i, dev->toread, 526 dev->read, dev->towrite, dev->written, 527 test_bit(R5_LOCKED, &dev->flags)); 528 WARN_ON(1); 529 } 530 dev->flags = 0; 531 dev->sector = raid5_compute_blocknr(sh, i, previous); 532 } 533 if (read_seqcount_retry(&conf->gen_lock, seq)) 534 goto retry; 535 sh->overwrite_disks = 0; 536 insert_hash(conf, sh); 537 sh->cpu = smp_processor_id(); 538 set_bit(STRIPE_BATCH_READY, &sh->state); 539 } 540 541 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 542 short generation) 543 { 544 struct stripe_head *sh; 545 546 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 547 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 548 if (sh->sector == sector && sh->generation == generation) 549 return sh; 550 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 551 return NULL; 552 } 553 554 /* 555 * Need to check if array has failed when deciding whether to: 556 * - start an array 557 * - remove non-faulty devices 558 * - add a spare 559 * - allow a reshape 560 * This determination is simple when no reshape is happening. 561 * However if there is a reshape, we need to carefully check 562 * both the before and after sections. 
563 * This is because some failed devices may only affect one 564 * of the two sections, and some non-in_sync devices may 565 * be insync in the section most affected by failed devices. 566 */ 567 int raid5_calc_degraded(struct r5conf *conf) 568 { 569 int degraded, degraded2; 570 int i; 571 572 rcu_read_lock(); 573 degraded = 0; 574 for (i = 0; i < conf->previous_raid_disks; i++) { 575 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 576 if (rdev && test_bit(Faulty, &rdev->flags)) 577 rdev = rcu_dereference(conf->disks[i].replacement); 578 if (!rdev || test_bit(Faulty, &rdev->flags)) 579 degraded++; 580 else if (test_bit(In_sync, &rdev->flags)) 581 ; 582 else 583 /* not in-sync or faulty. 584 * If the reshape increases the number of devices, 585 * this is being recovered by the reshape, so 586 * this 'previous' section is not in_sync. 587 * If the number of devices is being reduced however, 588 * the device can only be part of the array if 589 * we are reverting a reshape, so this section will 590 * be in-sync. 591 */ 592 if (conf->raid_disks >= conf->previous_raid_disks) 593 degraded++; 594 } 595 rcu_read_unlock(); 596 if (conf->raid_disks == conf->previous_raid_disks) 597 return degraded; 598 rcu_read_lock(); 599 degraded2 = 0; 600 for (i = 0; i < conf->raid_disks; i++) { 601 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 602 if (rdev && test_bit(Faulty, &rdev->flags)) 603 rdev = rcu_dereference(conf->disks[i].replacement); 604 if (!rdev || test_bit(Faulty, &rdev->flags)) 605 degraded2++; 606 else if (test_bit(In_sync, &rdev->flags)) 607 ; 608 else 609 /* not in-sync or faulty. 610 * If reshape increases the number of devices, this 611 * section has already been recovered, else it 612 * almost certainly hasn't. 613 */ 614 if (conf->raid_disks <= conf->previous_raid_disks) 615 degraded2++; 616 } 617 rcu_read_unlock(); 618 if (degraded2 > degraded) 619 return degraded2; 620 return degraded; 621 } 622 623 static int has_failed(struct r5conf *conf) 624 { 625 int degraded; 626 627 if (conf->mddev->reshape_position == MaxSector) 628 return conf->mddev->degraded > conf->max_degraded; 629 630 degraded = raid5_calc_degraded(conf); 631 if (degraded > conf->max_degraded) 632 return 1; 633 return 0; 634 } 635 636 struct stripe_head * 637 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 638 int previous, int noblock, int noquiesce) 639 { 640 struct stripe_head *sh; 641 int hash = stripe_hash_locks_hash(sector); 642 int inc_empty_inactive_list_flag; 643 644 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 645 646 spin_lock_irq(conf->hash_locks + hash); 647 648 do { 649 wait_event_lock_irq(conf->wait_for_quiescent, 650 conf->quiesce == 0 || noquiesce, 651 *(conf->hash_locks + hash)); 652 sh = __find_stripe(conf, sector, conf->generation - previous); 653 if (!sh) { 654 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 655 sh = get_free_stripe(conf, hash); 656 if (!sh && !test_bit(R5_DID_ALLOC, 657 &conf->cache_state)) 658 set_bit(R5_ALLOC_MORE, 659 &conf->cache_state); 660 } 661 if (noblock && sh == NULL) 662 break; 663 664 r5c_check_stripe_cache_usage(conf); 665 if (!sh) { 666 set_bit(R5_INACTIVE_BLOCKED, 667 &conf->cache_state); 668 r5l_wake_reclaim(conf->log, 0); 669 wait_event_lock_irq( 670 conf->wait_for_stripe, 671 !list_empty(conf->inactive_list + hash) && 672 (atomic_read(&conf->active_stripes) 673 < (conf->max_nr_stripes * 3 / 4) 674 || !test_bit(R5_INACTIVE_BLOCKED, 675 &conf->cache_state)), 676 *(conf->hash_locks + hash)); 677 
clear_bit(R5_INACTIVE_BLOCKED, 678 &conf->cache_state); 679 } else { 680 init_stripe(sh, sector, previous); 681 atomic_inc(&sh->count); 682 } 683 } else if (!atomic_inc_not_zero(&sh->count)) { 684 spin_lock(&conf->device_lock); 685 if (!atomic_read(&sh->count)) { 686 if (!test_bit(STRIPE_HANDLE, &sh->state)) 687 atomic_inc(&conf->active_stripes); 688 BUG_ON(list_empty(&sh->lru) && 689 !test_bit(STRIPE_EXPANDING, &sh->state)); 690 inc_empty_inactive_list_flag = 0; 691 if (!list_empty(conf->inactive_list + hash)) 692 inc_empty_inactive_list_flag = 1; 693 list_del_init(&sh->lru); 694 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 695 atomic_inc(&conf->empty_inactive_list_nr); 696 if (sh->group) { 697 sh->group->stripes_cnt--; 698 sh->group = NULL; 699 } 700 } 701 atomic_inc(&sh->count); 702 spin_unlock(&conf->device_lock); 703 } 704 } while (sh == NULL); 705 706 spin_unlock_irq(conf->hash_locks + hash); 707 return sh; 708 } 709 710 static bool is_full_stripe_write(struct stripe_head *sh) 711 { 712 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 713 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 714 } 715 716 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 717 { 718 if (sh1 > sh2) { 719 spin_lock_irq(&sh2->stripe_lock); 720 spin_lock_nested(&sh1->stripe_lock, 1); 721 } else { 722 spin_lock_irq(&sh1->stripe_lock); 723 spin_lock_nested(&sh2->stripe_lock, 1); 724 } 725 } 726 727 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 728 { 729 spin_unlock(&sh1->stripe_lock); 730 spin_unlock_irq(&sh2->stripe_lock); 731 } 732 733 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 734 static bool stripe_can_batch(struct stripe_head *sh) 735 { 736 struct r5conf *conf = sh->raid_conf; 737 738 if (conf->log || raid5_has_ppl(conf)) 739 return false; 740 return test_bit(STRIPE_BATCH_READY, &sh->state) && 741 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 742 is_full_stripe_write(sh); 743 } 744 745 /* we only do back search */ 746 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 747 { 748 struct stripe_head *head; 749 sector_t head_sector, tmp_sec; 750 int hash; 751 int dd_idx; 752 int inc_empty_inactive_list_flag; 753 754 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 755 tmp_sec = sh->sector; 756 if (!sector_div(tmp_sec, conf->chunk_sectors)) 757 return; 758 head_sector = sh->sector - STRIPE_SECTORS; 759 760 hash = stripe_hash_locks_hash(head_sector); 761 spin_lock_irq(conf->hash_locks + hash); 762 head = __find_stripe(conf, head_sector, conf->generation); 763 if (head && !atomic_inc_not_zero(&head->count)) { 764 spin_lock(&conf->device_lock); 765 if (!atomic_read(&head->count)) { 766 if (!test_bit(STRIPE_HANDLE, &head->state)) 767 atomic_inc(&conf->active_stripes); 768 BUG_ON(list_empty(&head->lru) && 769 !test_bit(STRIPE_EXPANDING, &head->state)); 770 inc_empty_inactive_list_flag = 0; 771 if (!list_empty(conf->inactive_list + hash)) 772 inc_empty_inactive_list_flag = 1; 773 list_del_init(&head->lru); 774 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 775 atomic_inc(&conf->empty_inactive_list_nr); 776 if (head->group) { 777 head->group->stripes_cnt--; 778 head->group = NULL; 779 } 780 } 781 atomic_inc(&head->count); 782 spin_unlock(&conf->device_lock); 783 } 784 spin_unlock_irq(conf->hash_locks + hash); 785 786 if (!head) 787 return; 788 if 
(!stripe_can_batch(head)) 789 goto out; 790 791 lock_two_stripes(head, sh); 792 /* clear_batch_ready clear the flag */ 793 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 794 goto unlock_out; 795 796 if (sh->batch_head) 797 goto unlock_out; 798 799 dd_idx = 0; 800 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 801 dd_idx++; 802 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 803 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 804 goto unlock_out; 805 806 if (head->batch_head) { 807 spin_lock(&head->batch_head->batch_lock); 808 /* This batch list is already running */ 809 if (!stripe_can_batch(head)) { 810 spin_unlock(&head->batch_head->batch_lock); 811 goto unlock_out; 812 } 813 /* 814 * We must assign batch_head of this stripe within the 815 * batch_lock, otherwise clear_batch_ready of batch head 816 * stripe could clear BATCH_READY bit of this stripe and 817 * this stripe->batch_head doesn't get assigned, which 818 * could confuse clear_batch_ready for this stripe 819 */ 820 sh->batch_head = head->batch_head; 821 822 /* 823 * at this point, head's BATCH_READY could be cleared, but we 824 * can still add the stripe to batch list 825 */ 826 list_add(&sh->batch_list, &head->batch_list); 827 spin_unlock(&head->batch_head->batch_lock); 828 } else { 829 head->batch_head = head; 830 sh->batch_head = head->batch_head; 831 spin_lock(&head->batch_lock); 832 list_add_tail(&sh->batch_list, &head->batch_list); 833 spin_unlock(&head->batch_lock); 834 } 835 836 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 837 if (atomic_dec_return(&conf->preread_active_stripes) 838 < IO_THRESHOLD) 839 md_wakeup_thread(conf->mddev->thread); 840 841 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 842 int seq = sh->bm_seq; 843 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 844 sh->batch_head->bm_seq > seq) 845 seq = sh->batch_head->bm_seq; 846 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 847 sh->batch_head->bm_seq = seq; 848 } 849 850 atomic_inc(&sh->count); 851 unlock_out: 852 unlock_two_stripes(head, sh); 853 out: 854 raid5_release_stripe(head); 855 } 856 857 /* Determine if 'data_offset' or 'new_data_offset' should be used 858 * in this stripe_head. 859 */ 860 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 861 { 862 sector_t progress = conf->reshape_progress; 863 /* Need a memory barrier to make sure we see the value 864 * of conf->generation, or ->data_offset that was set before 865 * reshape_progress was updated. 866 */ 867 smp_rmb(); 868 if (progress == MaxSector) 869 return 0; 870 if (sh->generation == conf->generation - 1) 871 return 0; 872 /* We are in a reshape, and this is a new-generation stripe, 873 * so use new_data_offset. 
874 */ 875 return 1; 876 } 877 878 static void dispatch_bio_list(struct bio_list *tmp) 879 { 880 struct bio *bio; 881 882 while ((bio = bio_list_pop(tmp))) 883 generic_make_request(bio); 884 } 885 886 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) 887 { 888 const struct r5pending_data *da = list_entry(a, 889 struct r5pending_data, sibling); 890 const struct r5pending_data *db = list_entry(b, 891 struct r5pending_data, sibling); 892 if (da->sector > db->sector) 893 return 1; 894 if (da->sector < db->sector) 895 return -1; 896 return 0; 897 } 898 899 static void dispatch_defer_bios(struct r5conf *conf, int target, 900 struct bio_list *list) 901 { 902 struct r5pending_data *data; 903 struct list_head *first, *next = NULL; 904 int cnt = 0; 905 906 if (conf->pending_data_cnt == 0) 907 return; 908 909 list_sort(NULL, &conf->pending_list, cmp_stripe); 910 911 first = conf->pending_list.next; 912 913 /* temporarily move the head */ 914 if (conf->next_pending_data) 915 list_move_tail(&conf->pending_list, 916 &conf->next_pending_data->sibling); 917 918 while (!list_empty(&conf->pending_list)) { 919 data = list_first_entry(&conf->pending_list, 920 struct r5pending_data, sibling); 921 if (&data->sibling == first) 922 first = data->sibling.next; 923 next = data->sibling.next; 924 925 bio_list_merge(list, &data->bios); 926 list_move(&data->sibling, &conf->free_list); 927 cnt++; 928 if (cnt >= target) 929 break; 930 } 931 conf->pending_data_cnt -= cnt; 932 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 933 934 if (next != &conf->pending_list) 935 conf->next_pending_data = list_entry(next, 936 struct r5pending_data, sibling); 937 else 938 conf->next_pending_data = NULL; 939 /* list isn't empty */ 940 if (first != &conf->pending_list) 941 list_move_tail(&conf->pending_list, first); 942 } 943 944 static void flush_deferred_bios(struct r5conf *conf) 945 { 946 struct bio_list tmp = BIO_EMPTY_LIST; 947 948 if (conf->pending_data_cnt == 0) 949 return; 950 951 spin_lock(&conf->pending_bios_lock); 952 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 953 BUG_ON(conf->pending_data_cnt != 0); 954 spin_unlock(&conf->pending_bios_lock); 955 956 dispatch_bio_list(&tmp); 957 } 958 959 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 960 struct bio_list *bios) 961 { 962 struct bio_list tmp = BIO_EMPTY_LIST; 963 struct r5pending_data *ent; 964 965 spin_lock(&conf->pending_bios_lock); 966 ent = list_first_entry(&conf->free_list, struct r5pending_data, 967 sibling); 968 list_move_tail(&ent->sibling, &conf->pending_list); 969 ent->sector = sector; 970 bio_list_init(&ent->bios); 971 bio_list_merge(&ent->bios, bios); 972 conf->pending_data_cnt++; 973 if (conf->pending_data_cnt >= PENDING_IO_MAX) 974 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 975 976 spin_unlock(&conf->pending_bios_lock); 977 978 dispatch_bio_list(&tmp); 979 } 980 981 static void 982 raid5_end_read_request(struct bio *bi); 983 static void 984 raid5_end_write_request(struct bio *bi); 985 986 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 987 { 988 struct r5conf *conf = sh->raid_conf; 989 int i, disks = sh->disks; 990 struct stripe_head *head_sh = sh; 991 struct bio_list pending_bios = BIO_EMPTY_LIST; 992 bool should_defer; 993 994 might_sleep(); 995 996 if (log_stripe(sh, s) == 0) 997 return; 998 999 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 1000 1001 for (i = disks; i--; ) { 1002 int op, op_flags = 0; 1003 int replace_only = 0; 1004 struct bio 
*bi, *rbi; 1005 struct md_rdev *rdev, *rrdev = NULL; 1006 1007 sh = head_sh; 1008 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1009 op = REQ_OP_WRITE; 1010 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1011 op_flags = REQ_FUA; 1012 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1013 op = REQ_OP_DISCARD; 1014 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1015 op = REQ_OP_READ; 1016 else if (test_and_clear_bit(R5_WantReplace, 1017 &sh->dev[i].flags)) { 1018 op = REQ_OP_WRITE; 1019 replace_only = 1; 1020 } else 1021 continue; 1022 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1023 op_flags |= REQ_SYNC; 1024 1025 again: 1026 bi = &sh->dev[i].req; 1027 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 1028 1029 rcu_read_lock(); 1030 rrdev = rcu_dereference(conf->disks[i].replacement); 1031 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1032 rdev = rcu_dereference(conf->disks[i].rdev); 1033 if (!rdev) { 1034 rdev = rrdev; 1035 rrdev = NULL; 1036 } 1037 if (op_is_write(op)) { 1038 if (replace_only) 1039 rdev = NULL; 1040 if (rdev == rrdev) 1041 /* We raced and saw duplicates */ 1042 rrdev = NULL; 1043 } else { 1044 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1045 rdev = rrdev; 1046 rrdev = NULL; 1047 } 1048 1049 if (rdev && test_bit(Faulty, &rdev->flags)) 1050 rdev = NULL; 1051 if (rdev) 1052 atomic_inc(&rdev->nr_pending); 1053 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1054 rrdev = NULL; 1055 if (rrdev) 1056 atomic_inc(&rrdev->nr_pending); 1057 rcu_read_unlock(); 1058 1059 /* We have already checked bad blocks for reads. Now 1060 * need to check for writes. We never accept write errors 1061 * on the replacement, so we don't to check rrdev. 1062 */ 1063 while (op_is_write(op) && rdev && 1064 test_bit(WriteErrorSeen, &rdev->flags)) { 1065 sector_t first_bad; 1066 int bad_sectors; 1067 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 1068 &first_bad, &bad_sectors); 1069 if (!bad) 1070 break; 1071 1072 if (bad < 0) { 1073 set_bit(BlockedBadBlocks, &rdev->flags); 1074 if (!conf->mddev->external && 1075 conf->mddev->sb_flags) { 1076 /* It is very unlikely, but we might 1077 * still need to write out the 1078 * bad block log - better give it 1079 * a chance*/ 1080 md_check_recovery(conf->mddev); 1081 } 1082 /* 1083 * Because md_wait_for_blocked_rdev 1084 * will dec nr_pending, we must 1085 * increment it first. 1086 */ 1087 atomic_inc(&rdev->nr_pending); 1088 md_wait_for_blocked_rdev(rdev, conf->mddev); 1089 } else { 1090 /* Acknowledged bad block - skip the write */ 1091 rdev_dec_pending(rdev, conf->mddev); 1092 rdev = NULL; 1093 } 1094 } 1095 1096 if (rdev) { 1097 if (s->syncing || s->expanding || s->expanded 1098 || s->replacing) 1099 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1100 1101 set_bit(STRIPE_IO_STARTED, &sh->state); 1102 1103 bio_set_dev(bi, rdev->bdev); 1104 bio_set_op_attrs(bi, op, op_flags); 1105 bi->bi_end_io = op_is_write(op) 1106 ? 
raid5_end_write_request 1107 : raid5_end_read_request; 1108 bi->bi_private = sh; 1109 1110 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1111 __func__, (unsigned long long)sh->sector, 1112 bi->bi_opf, i); 1113 atomic_inc(&sh->count); 1114 if (sh != head_sh) 1115 atomic_inc(&head_sh->count); 1116 if (use_new_offset(conf, sh)) 1117 bi->bi_iter.bi_sector = (sh->sector 1118 + rdev->new_data_offset); 1119 else 1120 bi->bi_iter.bi_sector = (sh->sector 1121 + rdev->data_offset); 1122 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1123 bi->bi_opf |= REQ_NOMERGE; 1124 1125 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1126 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1127 1128 if (!op_is_write(op) && 1129 test_bit(R5_InJournal, &sh->dev[i].flags)) 1130 /* 1131 * issuing read for a page in journal, this 1132 * must be preparing for prexor in rmw; read 1133 * the data into orig_page 1134 */ 1135 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1136 else 1137 sh->dev[i].vec.bv_page = sh->dev[i].page; 1138 bi->bi_vcnt = 1; 1139 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1140 bi->bi_io_vec[0].bv_offset = 0; 1141 bi->bi_iter.bi_size = STRIPE_SIZE; 1142 /* 1143 * If this is discard request, set bi_vcnt 0. We don't 1144 * want to confuse SCSI because SCSI will replace payload 1145 */ 1146 if (op == REQ_OP_DISCARD) 1147 bi->bi_vcnt = 0; 1148 if (rrdev) 1149 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1150 1151 if (conf->mddev->gendisk) 1152 trace_block_bio_remap(bi->bi_disk->queue, 1153 bi, disk_devt(conf->mddev->gendisk), 1154 sh->dev[i].sector); 1155 if (should_defer && op_is_write(op)) 1156 bio_list_add(&pending_bios, bi); 1157 else 1158 generic_make_request(bi); 1159 } 1160 if (rrdev) { 1161 if (s->syncing || s->expanding || s->expanded 1162 || s->replacing) 1163 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1164 1165 set_bit(STRIPE_IO_STARTED, &sh->state); 1166 1167 bio_set_dev(rbi, rrdev->bdev); 1168 bio_set_op_attrs(rbi, op, op_flags); 1169 BUG_ON(!op_is_write(op)); 1170 rbi->bi_end_io = raid5_end_write_request; 1171 rbi->bi_private = sh; 1172 1173 pr_debug("%s: for %llu schedule op %d on " 1174 "replacement disc %d\n", 1175 __func__, (unsigned long long)sh->sector, 1176 rbi->bi_opf, i); 1177 atomic_inc(&sh->count); 1178 if (sh != head_sh) 1179 atomic_inc(&head_sh->count); 1180 if (use_new_offset(conf, sh)) 1181 rbi->bi_iter.bi_sector = (sh->sector 1182 + rrdev->new_data_offset); 1183 else 1184 rbi->bi_iter.bi_sector = (sh->sector 1185 + rrdev->data_offset); 1186 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1187 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1188 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1189 rbi->bi_vcnt = 1; 1190 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1191 rbi->bi_io_vec[0].bv_offset = 0; 1192 rbi->bi_iter.bi_size = STRIPE_SIZE; 1193 /* 1194 * If this is discard request, set bi_vcnt 0. 
We don't 1195 * want to confuse SCSI because SCSI will replace payload 1196 */ 1197 if (op == REQ_OP_DISCARD) 1198 rbi->bi_vcnt = 0; 1199 if (conf->mddev->gendisk) 1200 trace_block_bio_remap(rbi->bi_disk->queue, 1201 rbi, disk_devt(conf->mddev->gendisk), 1202 sh->dev[i].sector); 1203 if (should_defer && op_is_write(op)) 1204 bio_list_add(&pending_bios, rbi); 1205 else 1206 generic_make_request(rbi); 1207 } 1208 if (!rdev && !rrdev) { 1209 if (op_is_write(op)) 1210 set_bit(STRIPE_DEGRADED, &sh->state); 1211 pr_debug("skip op %d on disc %d for sector %llu\n", 1212 bi->bi_opf, i, (unsigned long long)sh->sector); 1213 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1214 set_bit(STRIPE_HANDLE, &sh->state); 1215 } 1216 1217 if (!head_sh->batch_head) 1218 continue; 1219 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1220 batch_list); 1221 if (sh != head_sh) 1222 goto again; 1223 } 1224 1225 if (should_defer && !bio_list_empty(&pending_bios)) 1226 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1227 } 1228 1229 static struct dma_async_tx_descriptor * 1230 async_copy_data(int frombio, struct bio *bio, struct page **page, 1231 sector_t sector, struct dma_async_tx_descriptor *tx, 1232 struct stripe_head *sh, int no_skipcopy) 1233 { 1234 struct bio_vec bvl; 1235 struct bvec_iter iter; 1236 struct page *bio_page; 1237 int page_offset; 1238 struct async_submit_ctl submit; 1239 enum async_tx_flags flags = 0; 1240 1241 if (bio->bi_iter.bi_sector >= sector) 1242 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1243 else 1244 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1245 1246 if (frombio) 1247 flags |= ASYNC_TX_FENCE; 1248 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1249 1250 bio_for_each_segment(bvl, bio, iter) { 1251 int len = bvl.bv_len; 1252 int clen; 1253 int b_offset = 0; 1254 1255 if (page_offset < 0) { 1256 b_offset = -page_offset; 1257 page_offset += b_offset; 1258 len -= b_offset; 1259 } 1260 1261 if (len > 0 && page_offset + len > STRIPE_SIZE) 1262 clen = STRIPE_SIZE - page_offset; 1263 else 1264 clen = len; 1265 1266 if (clen > 0) { 1267 b_offset += bvl.bv_offset; 1268 bio_page = bvl.bv_page; 1269 if (frombio) { 1270 if (sh->raid_conf->skip_copy && 1271 b_offset == 0 && page_offset == 0 && 1272 clen == STRIPE_SIZE && 1273 !no_skipcopy) 1274 *page = bio_page; 1275 else 1276 tx = async_memcpy(*page, bio_page, page_offset, 1277 b_offset, clen, &submit); 1278 } else 1279 tx = async_memcpy(bio_page, *page, b_offset, 1280 page_offset, clen, &submit); 1281 } 1282 /* chain the operations */ 1283 submit.depend_tx = tx; 1284 1285 if (clen < len) /* hit end of page */ 1286 break; 1287 page_offset += len; 1288 } 1289 1290 return tx; 1291 } 1292 1293 static void ops_complete_biofill(void *stripe_head_ref) 1294 { 1295 struct stripe_head *sh = stripe_head_ref; 1296 int i; 1297 1298 pr_debug("%s: stripe %llu\n", __func__, 1299 (unsigned long long)sh->sector); 1300 1301 /* clear completed biofills */ 1302 for (i = sh->disks; i--; ) { 1303 struct r5dev *dev = &sh->dev[i]; 1304 1305 /* acknowledge completion of a biofill operation */ 1306 /* and check if we need to reply to a read request, 1307 * new R5_Wantfill requests are held off until 1308 * !STRIPE_BIOFILL_RUN 1309 */ 1310 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1311 struct bio *rbi, *rbi2; 1312 1313 BUG_ON(!dev->read); 1314 rbi = dev->read; 1315 dev->read = NULL; 1316 while (rbi && rbi->bi_iter.bi_sector < 1317 dev->sector + STRIPE_SECTORS) { 1318 rbi2 = r5_next_bio(rbi, 
dev->sector); 1319 bio_endio(rbi); 1320 rbi = rbi2; 1321 } 1322 } 1323 } 1324 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1325 1326 set_bit(STRIPE_HANDLE, &sh->state); 1327 raid5_release_stripe(sh); 1328 } 1329 1330 static void ops_run_biofill(struct stripe_head *sh) 1331 { 1332 struct dma_async_tx_descriptor *tx = NULL; 1333 struct async_submit_ctl submit; 1334 int i; 1335 1336 BUG_ON(sh->batch_head); 1337 pr_debug("%s: stripe %llu\n", __func__, 1338 (unsigned long long)sh->sector); 1339 1340 for (i = sh->disks; i--; ) { 1341 struct r5dev *dev = &sh->dev[i]; 1342 if (test_bit(R5_Wantfill, &dev->flags)) { 1343 struct bio *rbi; 1344 spin_lock_irq(&sh->stripe_lock); 1345 dev->read = rbi = dev->toread; 1346 dev->toread = NULL; 1347 spin_unlock_irq(&sh->stripe_lock); 1348 while (rbi && rbi->bi_iter.bi_sector < 1349 dev->sector + STRIPE_SECTORS) { 1350 tx = async_copy_data(0, rbi, &dev->page, 1351 dev->sector, tx, sh, 0); 1352 rbi = r5_next_bio(rbi, dev->sector); 1353 } 1354 } 1355 } 1356 1357 atomic_inc(&sh->count); 1358 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1359 async_trigger_callback(&submit); 1360 } 1361 1362 static void mark_target_uptodate(struct stripe_head *sh, int target) 1363 { 1364 struct r5dev *tgt; 1365 1366 if (target < 0) 1367 return; 1368 1369 tgt = &sh->dev[target]; 1370 set_bit(R5_UPTODATE, &tgt->flags); 1371 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1372 clear_bit(R5_Wantcompute, &tgt->flags); 1373 } 1374 1375 static void ops_complete_compute(void *stripe_head_ref) 1376 { 1377 struct stripe_head *sh = stripe_head_ref; 1378 1379 pr_debug("%s: stripe %llu\n", __func__, 1380 (unsigned long long)sh->sector); 1381 1382 /* mark the computed target(s) as uptodate */ 1383 mark_target_uptodate(sh, sh->ops.target); 1384 mark_target_uptodate(sh, sh->ops.target2); 1385 1386 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1387 if (sh->check_state == check_state_compute_run) 1388 sh->check_state = check_state_compute_result; 1389 set_bit(STRIPE_HANDLE, &sh->state); 1390 raid5_release_stripe(sh); 1391 } 1392 1393 /* return a pointer to the address conversion region of the scribble buffer */ 1394 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1395 struct raid5_percpu *percpu, int i) 1396 { 1397 void *addr; 1398 1399 addr = flex_array_get(percpu->scribble, i); 1400 return addr + sizeof(struct page *) * (sh->disks + 2); 1401 } 1402 1403 /* return a pointer to the address conversion region of the scribble buffer */ 1404 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1405 { 1406 void *addr; 1407 1408 addr = flex_array_get(percpu->scribble, i); 1409 return addr; 1410 } 1411 1412 static struct dma_async_tx_descriptor * 1413 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1414 { 1415 int disks = sh->disks; 1416 struct page **xor_srcs = to_addr_page(percpu, 0); 1417 int target = sh->ops.target; 1418 struct r5dev *tgt = &sh->dev[target]; 1419 struct page *xor_dest = tgt->page; 1420 int count = 0; 1421 struct dma_async_tx_descriptor *tx; 1422 struct async_submit_ctl submit; 1423 int i; 1424 1425 BUG_ON(sh->batch_head); 1426 1427 pr_debug("%s: stripe %llu block: %d\n", 1428 __func__, (unsigned long long)sh->sector, target); 1429 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1430 1431 for (i = disks; i--; ) 1432 if (i != target) 1433 xor_srcs[count++] = sh->dev[i].page; 1434 1435 atomic_inc(&sh->count); 1436 1437 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1438 
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1439 if (unlikely(count == 1)) 1440 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1441 else 1442 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1443 1444 return tx; 1445 } 1446 1447 /* set_syndrome_sources - populate source buffers for gen_syndrome 1448 * @srcs - (struct page *) array of size sh->disks 1449 * @sh - stripe_head to parse 1450 * 1451 * Populates srcs in proper layout order for the stripe and returns the 1452 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1453 * destination buffer is recorded in srcs[count] and the Q destination 1454 * is recorded in srcs[count+1]]. 1455 */ 1456 static int set_syndrome_sources(struct page **srcs, 1457 struct stripe_head *sh, 1458 int srctype) 1459 { 1460 int disks = sh->disks; 1461 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1462 int d0_idx = raid6_d0(sh); 1463 int count; 1464 int i; 1465 1466 for (i = 0; i < disks; i++) 1467 srcs[i] = NULL; 1468 1469 count = 0; 1470 i = d0_idx; 1471 do { 1472 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1473 struct r5dev *dev = &sh->dev[i]; 1474 1475 if (i == sh->qd_idx || i == sh->pd_idx || 1476 (srctype == SYNDROME_SRC_ALL) || 1477 (srctype == SYNDROME_SRC_WANT_DRAIN && 1478 (test_bit(R5_Wantdrain, &dev->flags) || 1479 test_bit(R5_InJournal, &dev->flags))) || 1480 (srctype == SYNDROME_SRC_WRITTEN && 1481 (dev->written || 1482 test_bit(R5_InJournal, &dev->flags)))) { 1483 if (test_bit(R5_InJournal, &dev->flags)) 1484 srcs[slot] = sh->dev[i].orig_page; 1485 else 1486 srcs[slot] = sh->dev[i].page; 1487 } 1488 i = raid6_next_disk(i, disks); 1489 } while (i != d0_idx); 1490 1491 return syndrome_disks; 1492 } 1493 1494 static struct dma_async_tx_descriptor * 1495 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1496 { 1497 int disks = sh->disks; 1498 struct page **blocks = to_addr_page(percpu, 0); 1499 int target; 1500 int qd_idx = sh->qd_idx; 1501 struct dma_async_tx_descriptor *tx; 1502 struct async_submit_ctl submit; 1503 struct r5dev *tgt; 1504 struct page *dest; 1505 int i; 1506 int count; 1507 1508 BUG_ON(sh->batch_head); 1509 if (sh->ops.target < 0) 1510 target = sh->ops.target2; 1511 else if (sh->ops.target2 < 0) 1512 target = sh->ops.target; 1513 else 1514 /* we should only have one valid target */ 1515 BUG(); 1516 BUG_ON(target < 0); 1517 pr_debug("%s: stripe %llu block: %d\n", 1518 __func__, (unsigned long long)sh->sector, target); 1519 1520 tgt = &sh->dev[target]; 1521 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1522 dest = tgt->page; 1523 1524 atomic_inc(&sh->count); 1525 1526 if (target == qd_idx) { 1527 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1528 blocks[count] = NULL; /* regenerating p is not necessary */ 1529 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1530 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1531 ops_complete_compute, sh, 1532 to_addr_conv(sh, percpu, 0)); 1533 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1534 } else { 1535 /* Compute any data- or p-drive using XOR */ 1536 count = 0; 1537 for (i = disks; i-- ; ) { 1538 if (i == target || i == qd_idx) 1539 continue; 1540 blocks[count++] = sh->dev[i].page; 1541 } 1542 1543 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1544 NULL, ops_complete_compute, sh, 1545 to_addr_conv(sh, percpu, 0)); 1546 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1547 } 1548 1549 
return tx; 1550 } 1551 1552 static struct dma_async_tx_descriptor * 1553 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1554 { 1555 int i, count, disks = sh->disks; 1556 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1557 int d0_idx = raid6_d0(sh); 1558 int faila = -1, failb = -1; 1559 int target = sh->ops.target; 1560 int target2 = sh->ops.target2; 1561 struct r5dev *tgt = &sh->dev[target]; 1562 struct r5dev *tgt2 = &sh->dev[target2]; 1563 struct dma_async_tx_descriptor *tx; 1564 struct page **blocks = to_addr_page(percpu, 0); 1565 struct async_submit_ctl submit; 1566 1567 BUG_ON(sh->batch_head); 1568 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1569 __func__, (unsigned long long)sh->sector, target, target2); 1570 BUG_ON(target < 0 || target2 < 0); 1571 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1572 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1573 1574 /* we need to open-code set_syndrome_sources to handle the 1575 * slot number conversion for 'faila' and 'failb' 1576 */ 1577 for (i = 0; i < disks ; i++) 1578 blocks[i] = NULL; 1579 count = 0; 1580 i = d0_idx; 1581 do { 1582 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1583 1584 blocks[slot] = sh->dev[i].page; 1585 1586 if (i == target) 1587 faila = slot; 1588 if (i == target2) 1589 failb = slot; 1590 i = raid6_next_disk(i, disks); 1591 } while (i != d0_idx); 1592 1593 BUG_ON(faila == failb); 1594 if (failb < faila) 1595 swap(faila, failb); 1596 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1597 __func__, (unsigned long long)sh->sector, faila, failb); 1598 1599 atomic_inc(&sh->count); 1600 1601 if (failb == syndrome_disks+1) { 1602 /* Q disk is one of the missing disks */ 1603 if (faila == syndrome_disks) { 1604 /* Missing P+Q, just recompute */ 1605 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1606 ops_complete_compute, sh, 1607 to_addr_conv(sh, percpu, 0)); 1608 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1609 STRIPE_SIZE, &submit); 1610 } else { 1611 struct page *dest; 1612 int data_target; 1613 int qd_idx = sh->qd_idx; 1614 1615 /* Missing D+Q: recompute D from P, then recompute Q */ 1616 if (target == qd_idx) 1617 data_target = target2; 1618 else 1619 data_target = target; 1620 1621 count = 0; 1622 for (i = disks; i-- ; ) { 1623 if (i == data_target || i == qd_idx) 1624 continue; 1625 blocks[count++] = sh->dev[i].page; 1626 } 1627 dest = sh->dev[data_target].page; 1628 init_async_submit(&submit, 1629 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1630 NULL, NULL, NULL, 1631 to_addr_conv(sh, percpu, 0)); 1632 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1633 &submit); 1634 1635 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1636 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1637 ops_complete_compute, sh, 1638 to_addr_conv(sh, percpu, 0)); 1639 return async_gen_syndrome(blocks, 0, count+2, 1640 STRIPE_SIZE, &submit); 1641 } 1642 } else { 1643 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1644 ops_complete_compute, sh, 1645 to_addr_conv(sh, percpu, 0)); 1646 if (failb == syndrome_disks) { 1647 /* We're missing D+P. */ 1648 return async_raid6_datap_recov(syndrome_disks+2, 1649 STRIPE_SIZE, faila, 1650 blocks, &submit); 1651 } else { 1652 /* We're missing D+D. 
*/ 1653 return async_raid6_2data_recov(syndrome_disks+2, 1654 STRIPE_SIZE, faila, failb, 1655 blocks, &submit); 1656 } 1657 } 1658 } 1659 1660 static void ops_complete_prexor(void *stripe_head_ref) 1661 { 1662 struct stripe_head *sh = stripe_head_ref; 1663 1664 pr_debug("%s: stripe %llu\n", __func__, 1665 (unsigned long long)sh->sector); 1666 1667 if (r5c_is_writeback(sh->raid_conf->log)) 1668 /* 1669 * raid5-cache write back uses orig_page during prexor. 1670 * After prexor, it is time to free orig_page 1671 */ 1672 r5c_release_extra_page(sh); 1673 } 1674 1675 static struct dma_async_tx_descriptor * 1676 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1677 struct dma_async_tx_descriptor *tx) 1678 { 1679 int disks = sh->disks; 1680 struct page **xor_srcs = to_addr_page(percpu, 0); 1681 int count = 0, pd_idx = sh->pd_idx, i; 1682 struct async_submit_ctl submit; 1683 1684 /* existing parity data subtracted */ 1685 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1686 1687 BUG_ON(sh->batch_head); 1688 pr_debug("%s: stripe %llu\n", __func__, 1689 (unsigned long long)sh->sector); 1690 1691 for (i = disks; i--; ) { 1692 struct r5dev *dev = &sh->dev[i]; 1693 /* Only process blocks that are known to be uptodate */ 1694 if (test_bit(R5_InJournal, &dev->flags)) 1695 xor_srcs[count++] = dev->orig_page; 1696 else if (test_bit(R5_Wantdrain, &dev->flags)) 1697 xor_srcs[count++] = dev->page; 1698 } 1699 1700 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1701 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1702 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1703 1704 return tx; 1705 } 1706 1707 static struct dma_async_tx_descriptor * 1708 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1709 struct dma_async_tx_descriptor *tx) 1710 { 1711 struct page **blocks = to_addr_page(percpu, 0); 1712 int count; 1713 struct async_submit_ctl submit; 1714 1715 pr_debug("%s: stripe %llu\n", __func__, 1716 (unsigned long long)sh->sector); 1717 1718 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1719 1720 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1721 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1722 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1723 1724 return tx; 1725 } 1726 1727 static struct dma_async_tx_descriptor * 1728 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1729 { 1730 struct r5conf *conf = sh->raid_conf; 1731 int disks = sh->disks; 1732 int i; 1733 struct stripe_head *head_sh = sh; 1734 1735 pr_debug("%s: stripe %llu\n", __func__, 1736 (unsigned long long)sh->sector); 1737 1738 for (i = disks; i--; ) { 1739 struct r5dev *dev; 1740 struct bio *chosen; 1741 1742 sh = head_sh; 1743 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1744 struct bio *wbi; 1745 1746 again: 1747 dev = &sh->dev[i]; 1748 /* 1749 * clear R5_InJournal, so when rewriting a page in 1750 * journal, it is not skipped by r5l_log_stripe() 1751 */ 1752 clear_bit(R5_InJournal, &dev->flags); 1753 spin_lock_irq(&sh->stripe_lock); 1754 chosen = dev->towrite; 1755 dev->towrite = NULL; 1756 sh->overwrite_disks = 0; 1757 BUG_ON(dev->written); 1758 wbi = dev->written = chosen; 1759 spin_unlock_irq(&sh->stripe_lock); 1760 WARN_ON(dev->page != dev->orig_page); 1761 1762 while (wbi && wbi->bi_iter.bi_sector < 1763 dev->sector + STRIPE_SECTORS) { 1764 if (wbi->bi_opf & REQ_FUA) 1765 set_bit(R5_WantFUA, &dev->flags); 1766 if 
(wbi->bi_opf & REQ_SYNC) 1767 set_bit(R5_SyncIO, &dev->flags); 1768 if (bio_op(wbi) == REQ_OP_DISCARD) 1769 set_bit(R5_Discard, &dev->flags); 1770 else { 1771 tx = async_copy_data(1, wbi, &dev->page, 1772 dev->sector, tx, sh, 1773 r5c_is_writeback(conf->log)); 1774 if (dev->page != dev->orig_page && 1775 !r5c_is_writeback(conf->log)) { 1776 set_bit(R5_SkipCopy, &dev->flags); 1777 clear_bit(R5_UPTODATE, &dev->flags); 1778 clear_bit(R5_OVERWRITE, &dev->flags); 1779 } 1780 } 1781 wbi = r5_next_bio(wbi, dev->sector); 1782 } 1783 1784 if (head_sh->batch_head) { 1785 sh = list_first_entry(&sh->batch_list, 1786 struct stripe_head, 1787 batch_list); 1788 if (sh == head_sh) 1789 continue; 1790 goto again; 1791 } 1792 } 1793 } 1794 1795 return tx; 1796 } 1797 1798 static void ops_complete_reconstruct(void *stripe_head_ref) 1799 { 1800 struct stripe_head *sh = stripe_head_ref; 1801 int disks = sh->disks; 1802 int pd_idx = sh->pd_idx; 1803 int qd_idx = sh->qd_idx; 1804 int i; 1805 bool fua = false, sync = false, discard = false; 1806 1807 pr_debug("%s: stripe %llu\n", __func__, 1808 (unsigned long long)sh->sector); 1809 1810 for (i = disks; i--; ) { 1811 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1812 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1813 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1814 } 1815 1816 for (i = disks; i--; ) { 1817 struct r5dev *dev = &sh->dev[i]; 1818 1819 if (dev->written || i == pd_idx || i == qd_idx) { 1820 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { 1821 set_bit(R5_UPTODATE, &dev->flags); 1822 if (test_bit(STRIPE_EXPAND_READY, &sh->state)) 1823 set_bit(R5_Expanded, &dev->flags); 1824 } 1825 if (fua) 1826 set_bit(R5_WantFUA, &dev->flags); 1827 if (sync) 1828 set_bit(R5_SyncIO, &dev->flags); 1829 } 1830 } 1831 1832 if (sh->reconstruct_state == reconstruct_state_drain_run) 1833 sh->reconstruct_state = reconstruct_state_drain_result; 1834 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1835 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1836 else { 1837 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1838 sh->reconstruct_state = reconstruct_state_result; 1839 } 1840 1841 set_bit(STRIPE_HANDLE, &sh->state); 1842 raid5_release_stripe(sh); 1843 } 1844 1845 static void 1846 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1847 struct dma_async_tx_descriptor *tx) 1848 { 1849 int disks = sh->disks; 1850 struct page **xor_srcs; 1851 struct async_submit_ctl submit; 1852 int count, pd_idx = sh->pd_idx, i; 1853 struct page *xor_dest; 1854 int prexor = 0; 1855 unsigned long flags; 1856 int j = 0; 1857 struct stripe_head *head_sh = sh; 1858 int last_stripe; 1859 1860 pr_debug("%s: stripe %llu\n", __func__, 1861 (unsigned long long)sh->sector); 1862 1863 for (i = 0; i < sh->disks; i++) { 1864 if (pd_idx == i) 1865 continue; 1866 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1867 break; 1868 } 1869 if (i >= sh->disks) { 1870 atomic_inc(&sh->count); 1871 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1872 ops_complete_reconstruct(sh); 1873 return; 1874 } 1875 again: 1876 count = 0; 1877 xor_srcs = to_addr_page(percpu, j); 1878 /* check if prexor is active which means only process blocks 1879 * that are part of a read-modify-write (written) 1880 */ 1881 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1882 prexor = 1; 1883 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1884 for (i = disks; i--; ) { 1885 struct r5dev *dev = &sh->dev[i]; 1886 if (head_sh->dev[i].written 
|| 1887 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1888 xor_srcs[count++] = dev->page; 1889 } 1890 } else { 1891 xor_dest = sh->dev[pd_idx].page; 1892 for (i = disks; i--; ) { 1893 struct r5dev *dev = &sh->dev[i]; 1894 if (i != pd_idx) 1895 xor_srcs[count++] = dev->page; 1896 } 1897 } 1898 1899 /* 1/ if we prexor'd then the dest is reused as a source 1900 * 2/ if we did not prexor then we are redoing the parity 1901 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1902 * for the synchronous xor case 1903 */ 1904 last_stripe = !head_sh->batch_head || 1905 list_first_entry(&sh->batch_list, 1906 struct stripe_head, batch_list) == head_sh; 1907 if (last_stripe) { 1908 flags = ASYNC_TX_ACK | 1909 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1910 1911 atomic_inc(&head_sh->count); 1912 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1913 to_addr_conv(sh, percpu, j)); 1914 } else { 1915 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1916 init_async_submit(&submit, flags, tx, NULL, NULL, 1917 to_addr_conv(sh, percpu, j)); 1918 } 1919 1920 if (unlikely(count == 1)) 1921 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1922 else 1923 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1924 if (!last_stripe) { 1925 j++; 1926 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1927 batch_list); 1928 goto again; 1929 } 1930 } 1931 1932 static void 1933 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1934 struct dma_async_tx_descriptor *tx) 1935 { 1936 struct async_submit_ctl submit; 1937 struct page **blocks; 1938 int count, i, j = 0; 1939 struct stripe_head *head_sh = sh; 1940 int last_stripe; 1941 int synflags; 1942 unsigned long txflags; 1943 1944 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1945 1946 for (i = 0; i < sh->disks; i++) { 1947 if (sh->pd_idx == i || sh->qd_idx == i) 1948 continue; 1949 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1950 break; 1951 } 1952 if (i >= sh->disks) { 1953 atomic_inc(&sh->count); 1954 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1955 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1956 ops_complete_reconstruct(sh); 1957 return; 1958 } 1959 1960 again: 1961 blocks = to_addr_page(percpu, j); 1962 1963 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1964 synflags = SYNDROME_SRC_WRITTEN; 1965 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1966 } else { 1967 synflags = SYNDROME_SRC_ALL; 1968 txflags = ASYNC_TX_ACK; 1969 } 1970 1971 count = set_syndrome_sources(blocks, sh, synflags); 1972 last_stripe = !head_sh->batch_head || 1973 list_first_entry(&sh->batch_list, 1974 struct stripe_head, batch_list) == head_sh; 1975 1976 if (last_stripe) { 1977 atomic_inc(&head_sh->count); 1978 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1979 head_sh, to_addr_conv(sh, percpu, j)); 1980 } else 1981 init_async_submit(&submit, 0, tx, NULL, NULL, 1982 to_addr_conv(sh, percpu, j)); 1983 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1984 if (!last_stripe) { 1985 j++; 1986 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1987 batch_list); 1988 goto again; 1989 } 1990 } 1991 1992 static void ops_complete_check(void *stripe_head_ref) 1993 { 1994 struct stripe_head *sh = stripe_head_ref; 1995 1996 pr_debug("%s: stripe %llu\n", __func__, 1997 (unsigned long long)sh->sector); 1998 1999 sh->check_state = check_state_check_result; 2000 set_bit(STRIPE_HANDLE, 
&sh->state); 2001 raid5_release_stripe(sh); 2002 } 2003 2004 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2005 { 2006 int disks = sh->disks; 2007 int pd_idx = sh->pd_idx; 2008 int qd_idx = sh->qd_idx; 2009 struct page *xor_dest; 2010 struct page **xor_srcs = to_addr_page(percpu, 0); 2011 struct dma_async_tx_descriptor *tx; 2012 struct async_submit_ctl submit; 2013 int count; 2014 int i; 2015 2016 pr_debug("%s: stripe %llu\n", __func__, 2017 (unsigned long long)sh->sector); 2018 2019 BUG_ON(sh->batch_head); 2020 count = 0; 2021 xor_dest = sh->dev[pd_idx].page; 2022 xor_srcs[count++] = xor_dest; 2023 for (i = disks; i--; ) { 2024 if (i == pd_idx || i == qd_idx) 2025 continue; 2026 xor_srcs[count++] = sh->dev[i].page; 2027 } 2028 2029 init_async_submit(&submit, 0, NULL, NULL, NULL, 2030 to_addr_conv(sh, percpu, 0)); 2031 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 2032 &sh->ops.zero_sum_result, &submit); 2033 2034 atomic_inc(&sh->count); 2035 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2036 tx = async_trigger_callback(&submit); 2037 } 2038 2039 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2040 { 2041 struct page **srcs = to_addr_page(percpu, 0); 2042 struct async_submit_ctl submit; 2043 int count; 2044 2045 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2046 (unsigned long long)sh->sector, checkp); 2047 2048 BUG_ON(sh->batch_head); 2049 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 2050 if (!checkp) 2051 srcs[count] = NULL; 2052 2053 atomic_inc(&sh->count); 2054 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2055 sh, to_addr_conv(sh, percpu, 0)); 2056 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 2057 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 2058 } 2059 2060 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2061 { 2062 int overlap_clear = 0, i, disks = sh->disks; 2063 struct dma_async_tx_descriptor *tx = NULL; 2064 struct r5conf *conf = sh->raid_conf; 2065 int level = conf->level; 2066 struct raid5_percpu *percpu; 2067 unsigned long cpu; 2068 2069 cpu = get_cpu(); 2070 percpu = per_cpu_ptr(conf->percpu, cpu); 2071 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2072 ops_run_biofill(sh); 2073 overlap_clear++; 2074 } 2075 2076 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2077 if (level < 6) 2078 tx = ops_run_compute5(sh, percpu); 2079 else { 2080 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2081 tx = ops_run_compute6_1(sh, percpu); 2082 else 2083 tx = ops_run_compute6_2(sh, percpu); 2084 } 2085 /* terminate the chain if reconstruct is not set to be run */ 2086 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2087 async_tx_ack(tx); 2088 } 2089 2090 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2091 if (level < 6) 2092 tx = ops_run_prexor5(sh, percpu, tx); 2093 else 2094 tx = ops_run_prexor6(sh, percpu, tx); 2095 } 2096 2097 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2098 tx = ops_run_partial_parity(sh, percpu, tx); 2099 2100 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2101 tx = ops_run_biodrain(sh, tx); 2102 overlap_clear++; 2103 } 2104 2105 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2106 if (level < 6) 2107 ops_run_reconstruct5(sh, percpu, tx); 2108 else 2109 ops_run_reconstruct6(sh, percpu, tx); 2110 } 2111 2112 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2113 if (sh->check_state == check_state_run) 2114 ops_run_check_p(sh, 
percpu); 2115 else if (sh->check_state == check_state_run_q) 2116 ops_run_check_pq(sh, percpu, 0); 2117 else if (sh->check_state == check_state_run_pq) 2118 ops_run_check_pq(sh, percpu, 1); 2119 else 2120 BUG(); 2121 } 2122 2123 if (overlap_clear && !sh->batch_head) 2124 for (i = disks; i--; ) { 2125 struct r5dev *dev = &sh->dev[i]; 2126 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2127 wake_up(&sh->raid_conf->wait_for_overlap); 2128 } 2129 put_cpu(); 2130 } 2131 2132 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2133 { 2134 if (sh->ppl_page) 2135 __free_page(sh->ppl_page); 2136 kmem_cache_free(sc, sh); 2137 } 2138 2139 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2140 int disks, struct r5conf *conf) 2141 { 2142 struct stripe_head *sh; 2143 int i; 2144 2145 sh = kmem_cache_zalloc(sc, gfp); 2146 if (sh) { 2147 spin_lock_init(&sh->stripe_lock); 2148 spin_lock_init(&sh->batch_lock); 2149 INIT_LIST_HEAD(&sh->batch_list); 2150 INIT_LIST_HEAD(&sh->lru); 2151 INIT_LIST_HEAD(&sh->r5c); 2152 INIT_LIST_HEAD(&sh->log_list); 2153 atomic_set(&sh->count, 1); 2154 sh->raid_conf = conf; 2155 sh->log_start = MaxSector; 2156 for (i = 0; i < disks; i++) { 2157 struct r5dev *dev = &sh->dev[i]; 2158 2159 bio_init(&dev->req, &dev->vec, 1); 2160 bio_init(&dev->rreq, &dev->rvec, 1); 2161 } 2162 2163 if (raid5_has_ppl(conf)) { 2164 sh->ppl_page = alloc_page(gfp); 2165 if (!sh->ppl_page) { 2166 free_stripe(sc, sh); 2167 sh = NULL; 2168 } 2169 } 2170 } 2171 return sh; 2172 } 2173 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2174 { 2175 struct stripe_head *sh; 2176 2177 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); 2178 if (!sh) 2179 return 0; 2180 2181 if (grow_buffers(sh, gfp)) { 2182 shrink_buffers(sh); 2183 free_stripe(conf->slab_cache, sh); 2184 return 0; 2185 } 2186 sh->hash_lock_index = 2187 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2188 /* we just created an active stripe so... */ 2189 atomic_inc(&conf->active_stripes); 2190 2191 raid5_release_stripe(sh); 2192 conf->max_nr_stripes++; 2193 return 1; 2194 } 2195 2196 static int grow_stripes(struct r5conf *conf, int num) 2197 { 2198 struct kmem_cache *sc; 2199 size_t namelen = sizeof(conf->cache_name[0]); 2200 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2201 2202 if (conf->mddev->gendisk) 2203 snprintf(conf->cache_name[0], namelen, 2204 "raid%d-%s", conf->level, mdname(conf->mddev)); 2205 else 2206 snprintf(conf->cache_name[0], namelen, 2207 "raid%d-%p", conf->level, conf->mddev); 2208 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); 2209 2210 conf->active_name = 0; 2211 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2212 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2213 0, 0, NULL); 2214 if (!sc) 2215 return 1; 2216 conf->slab_cache = sc; 2217 conf->pool_size = devs; 2218 while (num--) 2219 if (!grow_one_stripe(conf, GFP_KERNEL)) 2220 return 1; 2221 2222 return 0; 2223 } 2224 2225 /** 2226 * scribble_len - return the required size of the scribble region 2227 * @num - total number of disks in the array 2228 * 2229 * The size must be enough to contain: 2230 * 1/ a struct page pointer for each device in the array +2 2231 * 2/ room to convert each entry in (1) to its corresponding dma 2232 * (dma_map_page()) or page (page_address()) address. 
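 *
 * For illustration only (device count assumed, not taken from any real
 * array): with num == 8 the region built by scribble_alloc() below is
 *
 *   len = sizeof(struct page *) * (8 + 2) + sizeof(addr_conv_t) * (8 + 2);
 *
 * i.e. ten page pointers followed by ten address-conversion slots, one
 * pair per device plus the two extra destinations described next.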
2233 * 2234 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2235 * calculate over all devices (not just the data blocks), using zeros in place 2236 * of the P and Q blocks. 2237 */ 2238 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2239 { 2240 struct flex_array *ret; 2241 size_t len; 2242 2243 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2244 ret = flex_array_alloc(len, cnt, flags); 2245 if (!ret) 2246 return NULL; 2247 /* always prealloc all elements, so no locking is required */ 2248 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2249 flex_array_free(ret); 2250 return NULL; 2251 } 2252 return ret; 2253 } 2254 2255 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2256 { 2257 unsigned long cpu; 2258 int err = 0; 2259 2260 /* 2261 * Never shrink. And mddev_suspend() could deadlock if this is called 2262 * from raid5d. In that case, scribble_disks and scribble_sectors 2263 * should equal to new_disks and new_sectors 2264 */ 2265 if (conf->scribble_disks >= new_disks && 2266 conf->scribble_sectors >= new_sectors) 2267 return 0; 2268 mddev_suspend(conf->mddev); 2269 get_online_cpus(); 2270 for_each_present_cpu(cpu) { 2271 struct raid5_percpu *percpu; 2272 struct flex_array *scribble; 2273 2274 percpu = per_cpu_ptr(conf->percpu, cpu); 2275 scribble = scribble_alloc(new_disks, 2276 new_sectors / STRIPE_SECTORS, 2277 GFP_NOIO); 2278 2279 if (scribble) { 2280 flex_array_free(percpu->scribble); 2281 percpu->scribble = scribble; 2282 } else { 2283 err = -ENOMEM; 2284 break; 2285 } 2286 } 2287 put_online_cpus(); 2288 mddev_resume(conf->mddev); 2289 if (!err) { 2290 conf->scribble_disks = new_disks; 2291 conf->scribble_sectors = new_sectors; 2292 } 2293 return err; 2294 } 2295 2296 static int resize_stripes(struct r5conf *conf, int newsize) 2297 { 2298 /* Make all the stripes able to hold 'newsize' devices. 2299 * New slots in each stripe get 'page' set to a new page. 2300 * 2301 * This happens in stages: 2302 * 1/ create a new kmem_cache and allocate the required number of 2303 * stripe_heads. 2304 * 2/ gather all the old stripe_heads and transfer the pages across 2305 * to the new stripe_heads. This will have the side effect of 2306 * freezing the array as once all stripe_heads have been collected, 2307 * no IO will be possible. Old stripe heads are freed once their 2308 * pages have been transferred over, and the old kmem_cache is 2309 * freed when all stripes are done. 2310 * 3/ reallocate conf->disks to be suitable bigger. If this fails, 2311 * we simple return a failure status - no need to clean anything up. 2312 * 4/ allocate new pages for the new slots in the new stripe_heads. 2313 * If this fails, we don't bother trying the shrink the 2314 * stripe_heads down again, we just leave them as they are. 2315 * As each stripe_head is processed the new one is released into 2316 * active service. 2317 * 2318 * Once step2 is started, we cannot afford to wait for a write, 2319 * so we use GFP_NOIO allocations. 
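 *
 * (GFP_NOIO matters here because, once the inactive stripes are being
 * collected, the array cannot make I/O progress, so an allocation that
 * tried to write out dirty pages could wait on this array forever.
 * Rough sketch of the error behaviour as coded below: a failure in
 * step 1 or 2 returns -ENOMEM with the old cache untouched, while a
 * failure in step 3 or 4 still installs the new kmem_cache and the
 * larger stripe_heads but leaves conf->pool_size unchanged.)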
2320 */ 2321 struct stripe_head *osh, *nsh; 2322 LIST_HEAD(newstripes); 2323 struct disk_info *ndisks; 2324 int err = 0; 2325 struct kmem_cache *sc; 2326 int i; 2327 int hash, cnt; 2328 2329 md_allow_write(conf->mddev); 2330 2331 /* Step 1 */ 2332 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2333 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2334 0, 0, NULL); 2335 if (!sc) 2336 return -ENOMEM; 2337 2338 /* Need to ensure auto-resizing doesn't interfere */ 2339 mutex_lock(&conf->cache_size_mutex); 2340 2341 for (i = conf->max_nr_stripes; i; i--) { 2342 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); 2343 if (!nsh) 2344 break; 2345 2346 list_add(&nsh->lru, &newstripes); 2347 } 2348 if (i) { 2349 /* didn't get enough, give up */ 2350 while (!list_empty(&newstripes)) { 2351 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2352 list_del(&nsh->lru); 2353 free_stripe(sc, nsh); 2354 } 2355 kmem_cache_destroy(sc); 2356 mutex_unlock(&conf->cache_size_mutex); 2357 return -ENOMEM; 2358 } 2359 /* Step 2 - Must use GFP_NOIO now. 2360 * OK, we have enough stripes, start collecting inactive 2361 * stripes and copying them over 2362 */ 2363 hash = 0; 2364 cnt = 0; 2365 list_for_each_entry(nsh, &newstripes, lru) { 2366 lock_device_hash_lock(conf, hash); 2367 wait_event_cmd(conf->wait_for_stripe, 2368 !list_empty(conf->inactive_list + hash), 2369 unlock_device_hash_lock(conf, hash), 2370 lock_device_hash_lock(conf, hash)); 2371 osh = get_free_stripe(conf, hash); 2372 unlock_device_hash_lock(conf, hash); 2373 2374 for(i=0; i<conf->pool_size; i++) { 2375 nsh->dev[i].page = osh->dev[i].page; 2376 nsh->dev[i].orig_page = osh->dev[i].page; 2377 } 2378 nsh->hash_lock_index = hash; 2379 free_stripe(conf->slab_cache, osh); 2380 cnt++; 2381 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2382 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2383 hash++; 2384 cnt = 0; 2385 } 2386 } 2387 kmem_cache_destroy(conf->slab_cache); 2388 2389 /* Step 3. 
2390 * At this point, we are holding all the stripes so the array 2391 * is completely stalled, so now is a good time to resize 2392 * conf->disks and the scribble region 2393 */ 2394 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2395 if (ndisks) { 2396 for (i = 0; i < conf->pool_size; i++) 2397 ndisks[i] = conf->disks[i]; 2398 2399 for (i = conf->pool_size; i < newsize; i++) { 2400 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2401 if (!ndisks[i].extra_page) 2402 err = -ENOMEM; 2403 } 2404 2405 if (err) { 2406 for (i = conf->pool_size; i < newsize; i++) 2407 if (ndisks[i].extra_page) 2408 put_page(ndisks[i].extra_page); 2409 kfree(ndisks); 2410 } else { 2411 kfree(conf->disks); 2412 conf->disks = ndisks; 2413 } 2414 } else 2415 err = -ENOMEM; 2416 2417 mutex_unlock(&conf->cache_size_mutex); 2418 2419 conf->slab_cache = sc; 2420 conf->active_name = 1-conf->active_name; 2421 2422 /* Step 4, return new stripes to service */ 2423 while(!list_empty(&newstripes)) { 2424 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2425 list_del_init(&nsh->lru); 2426 2427 for (i=conf->raid_disks; i < newsize; i++) 2428 if (nsh->dev[i].page == NULL) { 2429 struct page *p = alloc_page(GFP_NOIO); 2430 nsh->dev[i].page = p; 2431 nsh->dev[i].orig_page = p; 2432 if (!p) 2433 err = -ENOMEM; 2434 } 2435 raid5_release_stripe(nsh); 2436 } 2437 /* critical section pass, GFP_NOIO no longer needed */ 2438 2439 if (!err) 2440 conf->pool_size = newsize; 2441 return err; 2442 } 2443 2444 static int drop_one_stripe(struct r5conf *conf) 2445 { 2446 struct stripe_head *sh; 2447 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2448 2449 spin_lock_irq(conf->hash_locks + hash); 2450 sh = get_free_stripe(conf, hash); 2451 spin_unlock_irq(conf->hash_locks + hash); 2452 if (!sh) 2453 return 0; 2454 BUG_ON(atomic_read(&sh->count)); 2455 shrink_buffers(sh); 2456 free_stripe(conf->slab_cache, sh); 2457 atomic_dec(&conf->active_stripes); 2458 conf->max_nr_stripes--; 2459 return 1; 2460 } 2461 2462 static void shrink_stripes(struct r5conf *conf) 2463 { 2464 while (conf->max_nr_stripes && 2465 drop_one_stripe(conf)) 2466 ; 2467 2468 kmem_cache_destroy(conf->slab_cache); 2469 conf->slab_cache = NULL; 2470 } 2471 2472 static void raid5_end_read_request(struct bio * bi) 2473 { 2474 struct stripe_head *sh = bi->bi_private; 2475 struct r5conf *conf = sh->raid_conf; 2476 int disks = sh->disks, i; 2477 char b[BDEVNAME_SIZE]; 2478 struct md_rdev *rdev = NULL; 2479 sector_t s; 2480 2481 for (i=0 ; i<disks; i++) 2482 if (bi == &sh->dev[i].req) 2483 break; 2484 2485 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2486 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2487 bi->bi_status); 2488 if (i == disks) { 2489 bio_reset(bi); 2490 BUG(); 2491 return; 2492 } 2493 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2494 /* If replacement finished while this request was outstanding, 2495 * 'replacement' might be NULL already. 2496 * In that case it moved down to 'rdev'. 2497 * rdev is not removed until all requests are finished. 2498 */ 2499 rdev = conf->disks[i].replacement; 2500 if (!rdev) 2501 rdev = conf->disks[i].rdev; 2502 2503 if (use_new_offset(conf, sh)) 2504 s = sh->sector + rdev->new_data_offset; 2505 else 2506 s = sh->sector + rdev->data_offset; 2507 if (!bi->bi_status) { 2508 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2509 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2510 /* Note that this cannot happen on a 2511 * replacement device. 
We just fail those on 2512 * any error 2513 */ 2514 pr_info_ratelimited( 2515 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2516 mdname(conf->mddev), STRIPE_SECTORS, 2517 (unsigned long long)s, 2518 bdevname(rdev->bdev, b)); 2519 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2520 clear_bit(R5_ReadError, &sh->dev[i].flags); 2521 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2522 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2523 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2524 2525 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2526 /* 2527 * end read for a page in journal, this 2528 * must be preparing for prexor in rmw 2529 */ 2530 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2531 2532 if (atomic_read(&rdev->read_errors)) 2533 atomic_set(&rdev->read_errors, 0); 2534 } else { 2535 const char *bdn = bdevname(rdev->bdev, b); 2536 int retry = 0; 2537 int set_bad = 0; 2538 2539 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2540 atomic_inc(&rdev->read_errors); 2541 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2542 pr_warn_ratelimited( 2543 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2544 mdname(conf->mddev), 2545 (unsigned long long)s, 2546 bdn); 2547 else if (conf->mddev->degraded >= conf->max_degraded) { 2548 set_bad = 1; 2549 pr_warn_ratelimited( 2550 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2551 mdname(conf->mddev), 2552 (unsigned long long)s, 2553 bdn); 2554 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2555 /* Oh, no!!! */ 2556 set_bad = 1; 2557 pr_warn_ratelimited( 2558 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", 2559 mdname(conf->mddev), 2560 (unsigned long long)s, 2561 bdn); 2562 } else if (atomic_read(&rdev->read_errors) 2563 > conf->max_nr_stripes) 2564 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2565 mdname(conf->mddev), bdn); 2566 else 2567 retry = 1; 2568 if (set_bad && test_bit(In_sync, &rdev->flags) 2569 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2570 retry = 1; 2571 if (retry) 2572 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2573 set_bit(R5_ReadError, &sh->dev[i].flags); 2574 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2575 } else 2576 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2577 else { 2578 clear_bit(R5_ReadError, &sh->dev[i].flags); 2579 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2580 if (!(set_bad 2581 && test_bit(In_sync, &rdev->flags) 2582 && rdev_set_badblocks( 2583 rdev, sh->sector, STRIPE_SECTORS, 0))) 2584 md_error(conf->mddev, rdev); 2585 } 2586 } 2587 rdev_dec_pending(rdev, conf->mddev); 2588 bio_reset(bi); 2589 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2590 set_bit(STRIPE_HANDLE, &sh->state); 2591 raid5_release_stripe(sh); 2592 } 2593 2594 static void raid5_end_write_request(struct bio *bi) 2595 { 2596 struct stripe_head *sh = bi->bi_private; 2597 struct r5conf *conf = sh->raid_conf; 2598 int disks = sh->disks, i; 2599 struct md_rdev *uninitialized_var(rdev); 2600 sector_t first_bad; 2601 int bad_sectors; 2602 int replacement = 0; 2603 2604 for (i = 0 ; i < disks; i++) { 2605 if (bi == &sh->dev[i].req) { 2606 rdev = conf->disks[i].rdev; 2607 break; 2608 } 2609 if (bi == &sh->dev[i].rreq) { 2610 rdev = conf->disks[i].replacement; 2611 if (rdev) 2612 replacement = 1; 2613 else 2614 /* rdev was removed and 'replacement' 2615 * replaced it. rdev is not removed 2616 * until all requests are finished. 
2617 */ 2618 rdev = conf->disks[i].rdev; 2619 break; 2620 } 2621 } 2622 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2623 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2624 bi->bi_status); 2625 if (i == disks) { 2626 bio_reset(bi); 2627 BUG(); 2628 return; 2629 } 2630 2631 if (replacement) { 2632 if (bi->bi_status) 2633 md_error(conf->mddev, rdev); 2634 else if (is_badblock(rdev, sh->sector, 2635 STRIPE_SECTORS, 2636 &first_bad, &bad_sectors)) 2637 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2638 } else { 2639 if (bi->bi_status) { 2640 set_bit(STRIPE_DEGRADED, &sh->state); 2641 set_bit(WriteErrorSeen, &rdev->flags); 2642 set_bit(R5_WriteError, &sh->dev[i].flags); 2643 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2644 set_bit(MD_RECOVERY_NEEDED, 2645 &rdev->mddev->recovery); 2646 } else if (is_badblock(rdev, sh->sector, 2647 STRIPE_SECTORS, 2648 &first_bad, &bad_sectors)) { 2649 set_bit(R5_MadeGood, &sh->dev[i].flags); 2650 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2651 /* That was a successful write so make 2652 * sure it looks like we already did 2653 * a re-write. 2654 */ 2655 set_bit(R5_ReWrite, &sh->dev[i].flags); 2656 } 2657 } 2658 rdev_dec_pending(rdev, conf->mddev); 2659 2660 if (sh->batch_head && bi->bi_status && !replacement) 2661 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2662 2663 bio_reset(bi); 2664 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2665 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2666 set_bit(STRIPE_HANDLE, &sh->state); 2667 raid5_release_stripe(sh); 2668 2669 if (sh->batch_head && sh != sh->batch_head) 2670 raid5_release_stripe(sh->batch_head); 2671 } 2672 2673 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2674 { 2675 char b[BDEVNAME_SIZE]; 2676 struct r5conf *conf = mddev->private; 2677 unsigned long flags; 2678 pr_debug("raid456: error called\n"); 2679 2680 spin_lock_irqsave(&conf->device_lock, flags); 2681 set_bit(Faulty, &rdev->flags); 2682 clear_bit(In_sync, &rdev->flags); 2683 mddev->degraded = raid5_calc_degraded(conf); 2684 spin_unlock_irqrestore(&conf->device_lock, flags); 2685 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2686 2687 set_bit(Blocked, &rdev->flags); 2688 set_mask_bits(&mddev->sb_flags, 0, 2689 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2690 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2691 "md/raid:%s: Operation continuing on %d devices.\n", 2692 mdname(mddev), 2693 bdevname(rdev->bdev, b), 2694 mdname(mddev), 2695 conf->raid_disks - mddev->degraded); 2696 r5c_update_on_rdev_error(mddev, rdev); 2697 } 2698 2699 /* 2700 * Input: a 'big' sector number, 2701 * Output: index of the data and parity disk, and the sector # in them. 2702 */ 2703 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2704 int previous, int *dd_idx, 2705 struct stripe_head *sh) 2706 { 2707 sector_t stripe, stripe2; 2708 sector_t chunk_number; 2709 unsigned int chunk_offset; 2710 int pd_idx, qd_idx; 2711 int ddf_layout = 0; 2712 sector_t new_sector; 2713 int algorithm = previous ? conf->prev_algo 2714 : conf->algorithm; 2715 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2716 : conf->chunk_sectors; 2717 int raid_disks = previous ? 
conf->previous_raid_disks 2718 : conf->raid_disks; 2719 int data_disks = raid_disks - conf->max_degraded; 2720 2721 /* First compute the information on this sector */ 2722 2723 /* 2724 * Compute the chunk number and the sector offset inside the chunk 2725 */ 2726 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2727 chunk_number = r_sector; 2728 2729 /* 2730 * Compute the stripe number 2731 */ 2732 stripe = chunk_number; 2733 *dd_idx = sector_div(stripe, data_disks); 2734 stripe2 = stripe; 2735 /* 2736 * Select the parity disk based on the user selected algorithm. 2737 */ 2738 pd_idx = qd_idx = -1; 2739 switch(conf->level) { 2740 case 4: 2741 pd_idx = data_disks; 2742 break; 2743 case 5: 2744 switch (algorithm) { 2745 case ALGORITHM_LEFT_ASYMMETRIC: 2746 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2747 if (*dd_idx >= pd_idx) 2748 (*dd_idx)++; 2749 break; 2750 case ALGORITHM_RIGHT_ASYMMETRIC: 2751 pd_idx = sector_div(stripe2, raid_disks); 2752 if (*dd_idx >= pd_idx) 2753 (*dd_idx)++; 2754 break; 2755 case ALGORITHM_LEFT_SYMMETRIC: 2756 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2757 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2758 break; 2759 case ALGORITHM_RIGHT_SYMMETRIC: 2760 pd_idx = sector_div(stripe2, raid_disks); 2761 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2762 break; 2763 case ALGORITHM_PARITY_0: 2764 pd_idx = 0; 2765 (*dd_idx)++; 2766 break; 2767 case ALGORITHM_PARITY_N: 2768 pd_idx = data_disks; 2769 break; 2770 default: 2771 BUG(); 2772 } 2773 break; 2774 case 6: 2775 2776 switch (algorithm) { 2777 case ALGORITHM_LEFT_ASYMMETRIC: 2778 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2779 qd_idx = pd_idx + 1; 2780 if (pd_idx == raid_disks-1) { 2781 (*dd_idx)++; /* Q D D D P */ 2782 qd_idx = 0; 2783 } else if (*dd_idx >= pd_idx) 2784 (*dd_idx) += 2; /* D D P Q D */ 2785 break; 2786 case ALGORITHM_RIGHT_ASYMMETRIC: 2787 pd_idx = sector_div(stripe2, raid_disks); 2788 qd_idx = pd_idx + 1; 2789 if (pd_idx == raid_disks-1) { 2790 (*dd_idx)++; /* Q D D D P */ 2791 qd_idx = 0; 2792 } else if (*dd_idx >= pd_idx) 2793 (*dd_idx) += 2; /* D D P Q D */ 2794 break; 2795 case ALGORITHM_LEFT_SYMMETRIC: 2796 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2797 qd_idx = (pd_idx + 1) % raid_disks; 2798 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2799 break; 2800 case ALGORITHM_RIGHT_SYMMETRIC: 2801 pd_idx = sector_div(stripe2, raid_disks); 2802 qd_idx = (pd_idx + 1) % raid_disks; 2803 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2804 break; 2805 2806 case ALGORITHM_PARITY_0: 2807 pd_idx = 0; 2808 qd_idx = 1; 2809 (*dd_idx) += 2; 2810 break; 2811 case ALGORITHM_PARITY_N: 2812 pd_idx = data_disks; 2813 qd_idx = data_disks + 1; 2814 break; 2815 2816 case ALGORITHM_ROTATING_ZERO_RESTART: 2817 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2818 * of blocks for computing Q is different. 
2819 */ 2820 pd_idx = sector_div(stripe2, raid_disks); 2821 qd_idx = pd_idx + 1; 2822 if (pd_idx == raid_disks-1) { 2823 (*dd_idx)++; /* Q D D D P */ 2824 qd_idx = 0; 2825 } else if (*dd_idx >= pd_idx) 2826 (*dd_idx) += 2; /* D D P Q D */ 2827 ddf_layout = 1; 2828 break; 2829 2830 case ALGORITHM_ROTATING_N_RESTART: 2831 /* Same a left_asymmetric, by first stripe is 2832 * D D D P Q rather than 2833 * Q D D D P 2834 */ 2835 stripe2 += 1; 2836 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2837 qd_idx = pd_idx + 1; 2838 if (pd_idx == raid_disks-1) { 2839 (*dd_idx)++; /* Q D D D P */ 2840 qd_idx = 0; 2841 } else if (*dd_idx >= pd_idx) 2842 (*dd_idx) += 2; /* D D P Q D */ 2843 ddf_layout = 1; 2844 break; 2845 2846 case ALGORITHM_ROTATING_N_CONTINUE: 2847 /* Same as left_symmetric but Q is before P */ 2848 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2849 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2850 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2851 ddf_layout = 1; 2852 break; 2853 2854 case ALGORITHM_LEFT_ASYMMETRIC_6: 2855 /* RAID5 left_asymmetric, with Q on last device */ 2856 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2857 if (*dd_idx >= pd_idx) 2858 (*dd_idx)++; 2859 qd_idx = raid_disks - 1; 2860 break; 2861 2862 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2863 pd_idx = sector_div(stripe2, raid_disks-1); 2864 if (*dd_idx >= pd_idx) 2865 (*dd_idx)++; 2866 qd_idx = raid_disks - 1; 2867 break; 2868 2869 case ALGORITHM_LEFT_SYMMETRIC_6: 2870 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2871 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2872 qd_idx = raid_disks - 1; 2873 break; 2874 2875 case ALGORITHM_RIGHT_SYMMETRIC_6: 2876 pd_idx = sector_div(stripe2, raid_disks-1); 2877 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2878 qd_idx = raid_disks - 1; 2879 break; 2880 2881 case ALGORITHM_PARITY_0_6: 2882 pd_idx = 0; 2883 (*dd_idx)++; 2884 qd_idx = raid_disks - 1; 2885 break; 2886 2887 default: 2888 BUG(); 2889 } 2890 break; 2891 } 2892 2893 if (sh) { 2894 sh->pd_idx = pd_idx; 2895 sh->qd_idx = qd_idx; 2896 sh->ddf_layout = ddf_layout; 2897 } 2898 /* 2899 * Finally, compute the new sector number 2900 */ 2901 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2902 return new_sector; 2903 } 2904 2905 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2906 { 2907 struct r5conf *conf = sh->raid_conf; 2908 int raid_disks = sh->disks; 2909 int data_disks = raid_disks - conf->max_degraded; 2910 sector_t new_sector = sh->sector, check; 2911 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2912 : conf->chunk_sectors; 2913 int algorithm = previous ? 
conf->prev_algo 2914 : conf->algorithm; 2915 sector_t stripe; 2916 int chunk_offset; 2917 sector_t chunk_number; 2918 int dummy1, dd_idx = i; 2919 sector_t r_sector; 2920 struct stripe_head sh2; 2921 2922 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2923 stripe = new_sector; 2924 2925 if (i == sh->pd_idx) 2926 return 0; 2927 switch(conf->level) { 2928 case 4: break; 2929 case 5: 2930 switch (algorithm) { 2931 case ALGORITHM_LEFT_ASYMMETRIC: 2932 case ALGORITHM_RIGHT_ASYMMETRIC: 2933 if (i > sh->pd_idx) 2934 i--; 2935 break; 2936 case ALGORITHM_LEFT_SYMMETRIC: 2937 case ALGORITHM_RIGHT_SYMMETRIC: 2938 if (i < sh->pd_idx) 2939 i += raid_disks; 2940 i -= (sh->pd_idx + 1); 2941 break; 2942 case ALGORITHM_PARITY_0: 2943 i -= 1; 2944 break; 2945 case ALGORITHM_PARITY_N: 2946 break; 2947 default: 2948 BUG(); 2949 } 2950 break; 2951 case 6: 2952 if (i == sh->qd_idx) 2953 return 0; /* It is the Q disk */ 2954 switch (algorithm) { 2955 case ALGORITHM_LEFT_ASYMMETRIC: 2956 case ALGORITHM_RIGHT_ASYMMETRIC: 2957 case ALGORITHM_ROTATING_ZERO_RESTART: 2958 case ALGORITHM_ROTATING_N_RESTART: 2959 if (sh->pd_idx == raid_disks-1) 2960 i--; /* Q D D D P */ 2961 else if (i > sh->pd_idx) 2962 i -= 2; /* D D P Q D */ 2963 break; 2964 case ALGORITHM_LEFT_SYMMETRIC: 2965 case ALGORITHM_RIGHT_SYMMETRIC: 2966 if (sh->pd_idx == raid_disks-1) 2967 i--; /* Q D D D P */ 2968 else { 2969 /* D D P Q D */ 2970 if (i < sh->pd_idx) 2971 i += raid_disks; 2972 i -= (sh->pd_idx + 2); 2973 } 2974 break; 2975 case ALGORITHM_PARITY_0: 2976 i -= 2; 2977 break; 2978 case ALGORITHM_PARITY_N: 2979 break; 2980 case ALGORITHM_ROTATING_N_CONTINUE: 2981 /* Like left_symmetric, but P is before Q */ 2982 if (sh->pd_idx == 0) 2983 i--; /* P D D D Q */ 2984 else { 2985 /* D D Q P D */ 2986 if (i < sh->pd_idx) 2987 i += raid_disks; 2988 i -= (sh->pd_idx + 1); 2989 } 2990 break; 2991 case ALGORITHM_LEFT_ASYMMETRIC_6: 2992 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2993 if (i > sh->pd_idx) 2994 i--; 2995 break; 2996 case ALGORITHM_LEFT_SYMMETRIC_6: 2997 case ALGORITHM_RIGHT_SYMMETRIC_6: 2998 if (i < sh->pd_idx) 2999 i += data_disks + 1; 3000 i -= (sh->pd_idx + 1); 3001 break; 3002 case ALGORITHM_PARITY_0_6: 3003 i -= 1; 3004 break; 3005 default: 3006 BUG(); 3007 } 3008 break; 3009 } 3010 3011 chunk_number = stripe * data_disks + i; 3012 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3013 3014 check = raid5_compute_sector(conf, r_sector, 3015 previous, &dummy1, &sh2); 3016 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3017 || sh2.qd_idx != sh->qd_idx) { 3018 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3019 mdname(conf->mddev)); 3020 return 0; 3021 } 3022 return r_sector; 3023 } 3024 3025 /* 3026 * There are cases where we want handle_stripe_dirtying() and 3027 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3028 * 3029 * This function checks whether we want to delay the towrite. Specifically, 3030 * we delay the towrite when: 3031 * 3032 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3033 * stripe has data in journal (for other devices). 3034 * 3035 * In this case, when reading data for the non-overwrite dev, it is 3036 * necessary to handle complex rmw of write back cache (prexor with 3037 * orig_page, and xor with page). To keep read path simple, we would 3038 * like to flush data in journal to RAID disks first, so complex rmw 3039 * is handled in the write patch (handle_stripe_dirtying). 3040 * 3041 * 2. 
when journal space is critical (R5C_LOG_CRITICAL=1) 3042 * 3043 * It is important to be able to flush all stripes in raid5-cache. 3044 * Therefore, we need to reserve some space on the journal device for 3045 * these flushes. If the flush operation includes pending writes to the 3046 * stripe, we need to reserve (conf->raid_disks + 1) pages per stripe 3047 * for the flush out. If we exclude these pending writes from the flush 3048 * operation, we only need (conf->max_degraded + 1) pages per stripe. 3049 * Therefore, excluding pending writes in these cases enables more 3050 * efficient use of the journal device. 3051 * 3052 * Note: To make sure the stripe makes progress, we only delay 3053 * towrite for stripes with data already in journal (injournal > 0). 3054 * When LOG_CRITICAL, stripes with injournal == 0 will be sent to 3055 * the no_space_stripes list. 3056 * 3057 * 3. during journal failure 3058 * When the journal fails, we try to flush all cached data to raid disks 3059 * based on data in the stripe cache. The array is read-only to upper 3060 * layers, so we skip all pending writes. 3061 * 3062 */ 3063 static inline bool delay_towrite(struct r5conf *conf, 3064 struct r5dev *dev, 3065 struct stripe_head_state *s) 3066 { 3067 /* case 1 above */ 3068 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3069 !test_bit(R5_Insync, &dev->flags) && s->injournal) 3070 return true; 3071 /* case 2 above */ 3072 if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 3073 s->injournal > 0) 3074 return true; 3075 /* case 3 above */ 3076 if (s->log_failed && s->injournal) 3077 return true; 3078 return false; 3079 } 3080 3081 static void 3082 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 3083 int rcw, int expand) 3084 { 3085 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 3086 struct r5conf *conf = sh->raid_conf; 3087 int level = conf->level; 3088 3089 if (rcw) { 3090 /* 3091 * In some cases, handle_stripe_dirtying initially decided to 3092 * run rmw and allocated an extra page for prexor. However, rcw 3093 * turns out to be cheaper later on. We need to free the extra page now, 3094 * because we won't be able to do that in ops_complete_prexor().
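 *
 * (The extra page is only ever needed for the prexor step of an rmw;
 * once rcw has been chosen the old-data copy in dev->orig_page will
 * never be read, so it is released here via r5c_release_extra_page().)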
3095 */ 3096 r5c_release_extra_page(sh); 3097 3098 for (i = disks; i--; ) { 3099 struct r5dev *dev = &sh->dev[i]; 3100 3101 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3102 set_bit(R5_LOCKED, &dev->flags); 3103 set_bit(R5_Wantdrain, &dev->flags); 3104 if (!expand) 3105 clear_bit(R5_UPTODATE, &dev->flags); 3106 s->locked++; 3107 } else if (test_bit(R5_InJournal, &dev->flags)) { 3108 set_bit(R5_LOCKED, &dev->flags); 3109 s->locked++; 3110 } 3111 } 3112 /* if we are not expanding this is a proper write request, and 3113 * there will be bios with new data to be drained into the 3114 * stripe cache 3115 */ 3116 if (!expand) { 3117 if (!s->locked) 3118 /* False alarm, nothing to do */ 3119 return; 3120 sh->reconstruct_state = reconstruct_state_drain_run; 3121 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3122 } else 3123 sh->reconstruct_state = reconstruct_state_run; 3124 3125 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3126 3127 if (s->locked + conf->max_degraded == disks) 3128 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3129 atomic_inc(&conf->pending_full_writes); 3130 } else { 3131 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3132 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3133 BUG_ON(level == 6 && 3134 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3135 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3136 3137 for (i = disks; i--; ) { 3138 struct r5dev *dev = &sh->dev[i]; 3139 if (i == pd_idx || i == qd_idx) 3140 continue; 3141 3142 if (dev->towrite && 3143 (test_bit(R5_UPTODATE, &dev->flags) || 3144 test_bit(R5_Wantcompute, &dev->flags))) { 3145 set_bit(R5_Wantdrain, &dev->flags); 3146 set_bit(R5_LOCKED, &dev->flags); 3147 clear_bit(R5_UPTODATE, &dev->flags); 3148 s->locked++; 3149 } else if (test_bit(R5_InJournal, &dev->flags)) { 3150 set_bit(R5_LOCKED, &dev->flags); 3151 s->locked++; 3152 } 3153 } 3154 if (!s->locked) 3155 /* False alarm - nothing to do */ 3156 return; 3157 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3158 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3159 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3160 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3161 } 3162 3163 /* keep the parity disk(s) locked while asynchronous operations 3164 * are in flight 3165 */ 3166 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3167 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3168 s->locked++; 3169 3170 if (level == 6) { 3171 int qd_idx = sh->qd_idx; 3172 struct r5dev *dev = &sh->dev[qd_idx]; 3173 3174 set_bit(R5_LOCKED, &dev->flags); 3175 clear_bit(R5_UPTODATE, &dev->flags); 3176 s->locked++; 3177 } 3178 3179 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && 3180 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3181 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3182 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3183 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3184 3185 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3186 __func__, (unsigned long long)sh->sector, 3187 s->locked, s->ops_request); 3188 } 3189 3190 /* 3191 * Each stripe/dev can have one or more bion attached. 3192 * toread/towrite point to the first in a chain. 3193 * The bi_next chain must be in order. 
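 *
 * For example (sector numbers invented for illustration), after three
 * non-overlapping writes hit the same dev the chain looks like
 *
 *   dev->towrite -> bio(sector 0) -> bio(sector 8) -> bio(sector 16) -> NULL
 *
 * add_stripe_bio() below walks this chain to find the insertion point
 * and bails out to the 'overlap' label if the new bio would intersect
 * an existing one.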
3194 */ 3195 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3196 int forwrite, int previous) 3197 { 3198 struct bio **bip; 3199 struct r5conf *conf = sh->raid_conf; 3200 int firstwrite=0; 3201 3202 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3203 (unsigned long long)bi->bi_iter.bi_sector, 3204 (unsigned long long)sh->sector); 3205 3206 spin_lock_irq(&sh->stripe_lock); 3207 /* Don't allow new IO added to stripes in batch list */ 3208 if (sh->batch_head) 3209 goto overlap; 3210 if (forwrite) { 3211 bip = &sh->dev[dd_idx].towrite; 3212 if (*bip == NULL) 3213 firstwrite = 1; 3214 } else 3215 bip = &sh->dev[dd_idx].toread; 3216 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3217 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3218 goto overlap; 3219 bip = & (*bip)->bi_next; 3220 } 3221 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3222 goto overlap; 3223 3224 if (forwrite && raid5_has_ppl(conf)) { 3225 /* 3226 * With PPL only writes to consecutive data chunks within a 3227 * stripe are allowed because for a single stripe_head we can 3228 * only have one PPL entry at a time, which describes one data 3229 * range. Not really an overlap, but wait_for_overlap can be 3230 * used to handle this. 3231 */ 3232 sector_t sector; 3233 sector_t first = 0; 3234 sector_t last = 0; 3235 int count = 0; 3236 int i; 3237 3238 for (i = 0; i < sh->disks; i++) { 3239 if (i != sh->pd_idx && 3240 (i == dd_idx || sh->dev[i].towrite)) { 3241 sector = sh->dev[i].sector; 3242 if (count == 0 || sector < first) 3243 first = sector; 3244 if (sector > last) 3245 last = sector; 3246 count++; 3247 } 3248 } 3249 3250 if (first + conf->chunk_sectors * (count - 1) != last) 3251 goto overlap; 3252 } 3253 3254 if (!forwrite || previous) 3255 clear_bit(STRIPE_BATCH_READY, &sh->state); 3256 3257 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3258 if (*bip) 3259 bi->bi_next = *bip; 3260 *bip = bi; 3261 bio_inc_remaining(bi); 3262 md_write_inc(conf->mddev, bi); 3263 3264 if (forwrite) { 3265 /* check if page is covered */ 3266 sector_t sector = sh->dev[dd_idx].sector; 3267 for (bi=sh->dev[dd_idx].towrite; 3268 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3269 bi && bi->bi_iter.bi_sector <= sector; 3270 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3271 if (bio_end_sector(bi) >= sector) 3272 sector = bio_end_sector(bi); 3273 } 3274 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3275 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3276 sh->overwrite_disks++; 3277 } 3278 3279 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3280 (unsigned long long)(*bip)->bi_iter.bi_sector, 3281 (unsigned long long)sh->sector, dd_idx); 3282 3283 if (conf->mddev->bitmap && firstwrite) { 3284 /* Cannot hold spinlock over bitmap_startwrite, 3285 * but must ensure this isn't added to a batch until 3286 * we have added to the bitmap and set bm_seq. 3287 * So set STRIPE_BITMAP_PENDING to prevent 3288 * batching. 3289 * If multiple add_stripe_bio() calls race here they 3290 * much all set STRIPE_BITMAP_PENDING. So only the first one 3291 * to complete "bitmap_startwrite" gets to set 3292 * STRIPE_BIT_DELAY. This is important as once a stripe 3293 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3294 * any more. 
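 *
 * Roughly, the sequence below is: set STRIPE_BITMAP_PENDING, drop
 * stripe_lock, call bitmap_startwrite(), retake stripe_lock, clear
 * STRIPE_BITMAP_PENDING, and only if the stripe has still not been
 * batched set bm_seq and STRIPE_BIT_DELAY.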
3295 */ 3296 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3297 spin_unlock_irq(&sh->stripe_lock); 3298 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3299 STRIPE_SECTORS, 0); 3300 spin_lock_irq(&sh->stripe_lock); 3301 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3302 if (!sh->batch_head) { 3303 sh->bm_seq = conf->seq_flush+1; 3304 set_bit(STRIPE_BIT_DELAY, &sh->state); 3305 } 3306 } 3307 spin_unlock_irq(&sh->stripe_lock); 3308 3309 if (stripe_can_batch(sh)) 3310 stripe_add_to_batch_list(conf, sh); 3311 return 1; 3312 3313 overlap: 3314 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3315 spin_unlock_irq(&sh->stripe_lock); 3316 return 0; 3317 } 3318 3319 static void end_reshape(struct r5conf *conf); 3320 3321 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3322 struct stripe_head *sh) 3323 { 3324 int sectors_per_chunk = 3325 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3326 int dd_idx; 3327 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3328 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3329 3330 raid5_compute_sector(conf, 3331 stripe * (disks - conf->max_degraded) 3332 *sectors_per_chunk + chunk_offset, 3333 previous, 3334 &dd_idx, sh); 3335 } 3336 3337 static void 3338 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3339 struct stripe_head_state *s, int disks) 3340 { 3341 int i; 3342 BUG_ON(sh->batch_head); 3343 for (i = disks; i--; ) { 3344 struct bio *bi; 3345 int bitmap_end = 0; 3346 3347 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3348 struct md_rdev *rdev; 3349 rcu_read_lock(); 3350 rdev = rcu_dereference(conf->disks[i].rdev); 3351 if (rdev && test_bit(In_sync, &rdev->flags) && 3352 !test_bit(Faulty, &rdev->flags)) 3353 atomic_inc(&rdev->nr_pending); 3354 else 3355 rdev = NULL; 3356 rcu_read_unlock(); 3357 if (rdev) { 3358 if (!rdev_set_badblocks( 3359 rdev, 3360 sh->sector, 3361 STRIPE_SECTORS, 0)) 3362 md_error(conf->mddev, rdev); 3363 rdev_dec_pending(rdev, conf->mddev); 3364 } 3365 } 3366 spin_lock_irq(&sh->stripe_lock); 3367 /* fail all writes first */ 3368 bi = sh->dev[i].towrite; 3369 sh->dev[i].towrite = NULL; 3370 sh->overwrite_disks = 0; 3371 spin_unlock_irq(&sh->stripe_lock); 3372 if (bi) 3373 bitmap_end = 1; 3374 3375 log_stripe_write_finished(sh); 3376 3377 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3378 wake_up(&conf->wait_for_overlap); 3379 3380 while (bi && bi->bi_iter.bi_sector < 3381 sh->dev[i].sector + STRIPE_SECTORS) { 3382 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3383 3384 md_write_end(conf->mddev); 3385 bio_io_error(bi); 3386 bi = nextbi; 3387 } 3388 if (bitmap_end) 3389 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3390 STRIPE_SECTORS, 0, 0); 3391 bitmap_end = 0; 3392 /* and fail all 'written' */ 3393 bi = sh->dev[i].written; 3394 sh->dev[i].written = NULL; 3395 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3396 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3397 sh->dev[i].page = sh->dev[i].orig_page; 3398 } 3399 3400 if (bi) bitmap_end = 1; 3401 while (bi && bi->bi_iter.bi_sector < 3402 sh->dev[i].sector + STRIPE_SECTORS) { 3403 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3404 3405 md_write_end(conf->mddev); 3406 bio_io_error(bi); 3407 bi = bi2; 3408 } 3409 3410 /* fail any reads if this device is non-operational and 3411 * the data has not reached the cache yet. 
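 * (Concretely, per the test below: more devices have failed than the
 * array can tolerate, this dev is either not in sync or has seen a
 * read error, and no biofill is pending for it.)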
3412 */ 3413 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3414 s->failed > conf->max_degraded && 3415 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3416 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3417 spin_lock_irq(&sh->stripe_lock); 3418 bi = sh->dev[i].toread; 3419 sh->dev[i].toread = NULL; 3420 spin_unlock_irq(&sh->stripe_lock); 3421 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3422 wake_up(&conf->wait_for_overlap); 3423 if (bi) 3424 s->to_read--; 3425 while (bi && bi->bi_iter.bi_sector < 3426 sh->dev[i].sector + STRIPE_SECTORS) { 3427 struct bio *nextbi = 3428 r5_next_bio(bi, sh->dev[i].sector); 3429 3430 bio_io_error(bi); 3431 bi = nextbi; 3432 } 3433 } 3434 if (bitmap_end) 3435 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3436 STRIPE_SECTORS, 0, 0); 3437 /* If we were in the middle of a write the parity block might 3438 * still be locked - so just clear all R5_LOCKED flags 3439 */ 3440 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3441 } 3442 s->to_write = 0; 3443 s->written = 0; 3444 3445 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3446 if (atomic_dec_and_test(&conf->pending_full_writes)) 3447 md_wakeup_thread(conf->mddev->thread); 3448 } 3449 3450 static void 3451 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3452 struct stripe_head_state *s) 3453 { 3454 int abort = 0; 3455 int i; 3456 3457 BUG_ON(sh->batch_head); 3458 clear_bit(STRIPE_SYNCING, &sh->state); 3459 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3460 wake_up(&conf->wait_for_overlap); 3461 s->syncing = 0; 3462 s->replacing = 0; 3463 /* There is nothing more to do for sync/check/repair. 3464 * Don't even need to abort as that is handled elsewhere 3465 * if needed, and not always wanted e.g. if there is a known 3466 * bad block here. 
3467 * For recover/replace we need to record a bad block on all 3468 * non-sync devices, or abort the recovery 3469 */ 3470 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3471 /* During recovery devices cannot be removed, so 3472 * locking and refcounting of rdevs is not needed 3473 */ 3474 rcu_read_lock(); 3475 for (i = 0; i < conf->raid_disks; i++) { 3476 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3477 if (rdev 3478 && !test_bit(Faulty, &rdev->flags) 3479 && !test_bit(In_sync, &rdev->flags) 3480 && !rdev_set_badblocks(rdev, sh->sector, 3481 STRIPE_SECTORS, 0)) 3482 abort = 1; 3483 rdev = rcu_dereference(conf->disks[i].replacement); 3484 if (rdev 3485 && !test_bit(Faulty, &rdev->flags) 3486 && !test_bit(In_sync, &rdev->flags) 3487 && !rdev_set_badblocks(rdev, sh->sector, 3488 STRIPE_SECTORS, 0)) 3489 abort = 1; 3490 } 3491 rcu_read_unlock(); 3492 if (abort) 3493 conf->recovery_disabled = 3494 conf->mddev->recovery_disabled; 3495 } 3496 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3497 } 3498 3499 static int want_replace(struct stripe_head *sh, int disk_idx) 3500 { 3501 struct md_rdev *rdev; 3502 int rv = 0; 3503 3504 rcu_read_lock(); 3505 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3506 if (rdev 3507 && !test_bit(Faulty, &rdev->flags) 3508 && !test_bit(In_sync, &rdev->flags) 3509 && (rdev->recovery_offset <= sh->sector 3510 || rdev->mddev->recovery_cp <= sh->sector)) 3511 rv = 1; 3512 rcu_read_unlock(); 3513 return rv; 3514 } 3515 3516 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3517 int disk_idx, int disks) 3518 { 3519 struct r5dev *dev = &sh->dev[disk_idx]; 3520 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3521 &sh->dev[s->failed_num[1]] }; 3522 int i; 3523 3524 3525 if (test_bit(R5_LOCKED, &dev->flags) || 3526 test_bit(R5_UPTODATE, &dev->flags)) 3527 /* No point reading this as we already have it or have 3528 * decided to get it. 3529 */ 3530 return 0; 3531 3532 if (dev->toread || 3533 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3534 /* We need this block to directly satisfy a request */ 3535 return 1; 3536 3537 if (s->syncing || s->expanding || 3538 (s->replacing && want_replace(sh, disk_idx))) 3539 /* When syncing, or expanding we read everything. 3540 * When replacing, we need the replaced block. 3541 */ 3542 return 1; 3543 3544 if ((s->failed >= 1 && fdev[0]->toread) || 3545 (s->failed >= 2 && fdev[1]->toread)) 3546 /* If we want to read from a failed device, then 3547 * we need to actually read every other device. 3548 */ 3549 return 1; 3550 3551 /* Sometimes neither read-modify-write nor reconstruct-write 3552 * cycles can work. In those cases we read every block we 3553 * can. Then the parity-update is certain to have enough to 3554 * work with. 3555 * This can only be a problem when we need to write something, 3556 * and some device has failed. If either of those tests 3557 * fail we need look no further. 3558 */ 3559 if (!s->failed || !s->to_write) 3560 return 0; 3561 3562 if (test_bit(R5_Insync, &dev->flags) && 3563 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3564 /* Pre-reads at not permitted until after short delay 3565 * to gather multiple requests. However if this 3566 * device is no Insync, the block could only be computed 3567 * and there is no need to delay that. 
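 *
 * (In other words: a block on an in-sync device is not read until
 * STRIPE_PREREAD_ACTIVE allows pre-reads to proceed, while a device
 * that is not in sync could only be computed anyway, so delaying it
 * would gain nothing.)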
3568 */ 3569 return 0; 3570 3571 for (i = 0; i < s->failed && i < 2; i++) { 3572 if (fdev[i]->towrite && 3573 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3574 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3575 /* If we have a partial write to a failed 3576 * device, then we will need to reconstruct 3577 * the content of that device, so all other 3578 * devices must be read. 3579 */ 3580 return 1; 3581 } 3582 3583 /* If we are forced to do a reconstruct-write, either because 3584 * the current RAID6 implementation only supports that, or 3585 * because parity cannot be trusted and we are currently 3586 * recovering it, there is extra need to be careful. 3587 * If one of the devices that we would need to read, because 3588 * it is not being overwritten (and maybe not written at all) 3589 * is missing/faulty, then we need to read everything we can. 3590 */ 3591 if (sh->raid_conf->level != 6 && 3592 sh->sector < sh->raid_conf->mddev->recovery_cp) 3593 /* reconstruct-write isn't being forced */ 3594 return 0; 3595 for (i = 0; i < s->failed && i < 2; i++) { 3596 if (s->failed_num[i] != sh->pd_idx && 3597 s->failed_num[i] != sh->qd_idx && 3598 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3599 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3600 return 1; 3601 } 3602 3603 return 0; 3604 } 3605 3606 /* fetch_block - checks the given member device to see if its data needs 3607 * to be read or computed to satisfy a request. 3608 * 3609 * Returns 1 when no more member devices need to be checked, otherwise returns 3610 * 0 to tell the loop in handle_stripe_fill to continue 3611 */ 3612 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3613 int disk_idx, int disks) 3614 { 3615 struct r5dev *dev = &sh->dev[disk_idx]; 3616 3617 /* is the data in this block needed, and can we get it? */ 3618 if (need_this_block(sh, s, disk_idx, disks)) { 3619 /* we would like to get this block, possibly by computing it, 3620 * otherwise read it if the backing disk is insync 3621 */ 3622 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3623 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3624 BUG_ON(sh->batch_head); 3625 3626 /* 3627 * In the raid6 case if the only non-uptodate disk is P 3628 * then we already trusted P to compute the other failed 3629 * drives. It is safe to compute rather than re-read P. 3630 * In other cases we only compute blocks from failed 3631 * devices, otherwise check/repair might fail to detect 3632 * a real inconsistency. 3633 */ 3634 3635 if ((s->uptodate == disks - 1) && 3636 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3637 (s->failed && (disk_idx == s->failed_num[0] || 3638 disk_idx == s->failed_num[1])))) { 3639 /* have disk failed, and we're requested to fetch it; 3640 * do compute it 3641 */ 3642 pr_debug("Computing stripe %llu block %d\n", 3643 (unsigned long long)sh->sector, disk_idx); 3644 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3645 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3646 set_bit(R5_Wantcompute, &dev->flags); 3647 sh->ops.target = disk_idx; 3648 sh->ops.target2 = -1; /* no 2nd target */ 3649 s->req_compute = 1; 3650 /* Careful: from this point on 'uptodate' is in the eye 3651 * of raid_run_ops which services 'compute' operations 3652 * before writes. R5_Wantcompute flags a block that will 3653 * be R5_UPTODATE by the time it is needed for a 3654 * subsequent operation. 
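 * (That is why s->uptodate is bumped immediately below, before the
 * asynchronous compute has actually produced the data.)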
3655 */ 3656 s->uptodate++; 3657 return 1; 3658 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3659 /* Computing 2-failure is *very* expensive; only 3660 * do it if failed >= 2 3661 */ 3662 int other; 3663 for (other = disks; other--; ) { 3664 if (other == disk_idx) 3665 continue; 3666 if (!test_bit(R5_UPTODATE, 3667 &sh->dev[other].flags)) 3668 break; 3669 } 3670 BUG_ON(other < 0); 3671 pr_debug("Computing stripe %llu blocks %d,%d\n", 3672 (unsigned long long)sh->sector, 3673 disk_idx, other); 3674 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3675 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3676 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3677 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3678 sh->ops.target = disk_idx; 3679 sh->ops.target2 = other; 3680 s->uptodate += 2; 3681 s->req_compute = 1; 3682 return 1; 3683 } else if (test_bit(R5_Insync, &dev->flags)) { 3684 set_bit(R5_LOCKED, &dev->flags); 3685 set_bit(R5_Wantread, &dev->flags); 3686 s->locked++; 3687 pr_debug("Reading block %d (sync=%d)\n", 3688 disk_idx, s->syncing); 3689 } 3690 } 3691 3692 return 0; 3693 } 3694 3695 /** 3696 * handle_stripe_fill - read or compute data to satisfy pending requests. 3697 */ 3698 static void handle_stripe_fill(struct stripe_head *sh, 3699 struct stripe_head_state *s, 3700 int disks) 3701 { 3702 int i; 3703 3704 /* look for blocks to read/compute, skip this if a compute 3705 * is already in flight, or if the stripe contents are in the 3706 * midst of changing due to a write 3707 */ 3708 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3709 !sh->reconstruct_state) { 3710 3711 /* 3712 * For degraded stripe with data in journal, do not handle 3713 * read requests yet, instead, flush the stripe to raid 3714 * disks first, this avoids handling complex rmw of write 3715 * back cache (prexor with orig_page, and then xor with 3716 * page) in the read path 3717 */ 3718 if (s->injournal && s->failed) { 3719 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3720 r5c_make_stripe_write_out(sh); 3721 goto out; 3722 } 3723 3724 for (i = disks; i--; ) 3725 if (fetch_block(sh, s, i, disks)) 3726 break; 3727 } 3728 out: 3729 set_bit(STRIPE_HANDLE, &sh->state); 3730 } 3731 3732 static void break_stripe_batch_list(struct stripe_head *head_sh, 3733 unsigned long handle_flags); 3734 /* handle_stripe_clean_event 3735 * any written block on an uptodate or failed drive can be returned. 3736 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3737 * never LOCKED, so we don't need to test 'failed' directly. 
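 * (Discarded and skip-copy writes are returnable as well: the test
 * below accepts any unlocked dev with R5_UPTODATE, R5_Discard or
 * R5_SkipCopy set before completing its written bios.)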
3738 */ 3739 static void handle_stripe_clean_event(struct r5conf *conf, 3740 struct stripe_head *sh, int disks) 3741 { 3742 int i; 3743 struct r5dev *dev; 3744 int discard_pending = 0; 3745 struct stripe_head *head_sh = sh; 3746 bool do_endio = false; 3747 3748 for (i = disks; i--; ) 3749 if (sh->dev[i].written) { 3750 dev = &sh->dev[i]; 3751 if (!test_bit(R5_LOCKED, &dev->flags) && 3752 (test_bit(R5_UPTODATE, &dev->flags) || 3753 test_bit(R5_Discard, &dev->flags) || 3754 test_bit(R5_SkipCopy, &dev->flags))) { 3755 /* We can return any write requests */ 3756 struct bio *wbi, *wbi2; 3757 pr_debug("Return write for disc %d\n", i); 3758 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3759 clear_bit(R5_UPTODATE, &dev->flags); 3760 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3761 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3762 } 3763 do_endio = true; 3764 3765 returnbi: 3766 dev->page = dev->orig_page; 3767 wbi = dev->written; 3768 dev->written = NULL; 3769 while (wbi && wbi->bi_iter.bi_sector < 3770 dev->sector + STRIPE_SECTORS) { 3771 wbi2 = r5_next_bio(wbi, dev->sector); 3772 md_write_end(conf->mddev); 3773 bio_endio(wbi); 3774 wbi = wbi2; 3775 } 3776 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3777 STRIPE_SECTORS, 3778 !test_bit(STRIPE_DEGRADED, &sh->state), 3779 0); 3780 if (head_sh->batch_head) { 3781 sh = list_first_entry(&sh->batch_list, 3782 struct stripe_head, 3783 batch_list); 3784 if (sh != head_sh) { 3785 dev = &sh->dev[i]; 3786 goto returnbi; 3787 } 3788 } 3789 sh = head_sh; 3790 dev = &sh->dev[i]; 3791 } else if (test_bit(R5_Discard, &dev->flags)) 3792 discard_pending = 1; 3793 } 3794 3795 log_stripe_write_finished(sh); 3796 3797 if (!discard_pending && 3798 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3799 int hash; 3800 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3801 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3802 if (sh->qd_idx >= 0) { 3803 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3804 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3805 } 3806 /* now that discard is done we can proceed with any sync */ 3807 clear_bit(STRIPE_DISCARD, &sh->state); 3808 /* 3809 * SCSI discard will change some bio fields and the stripe has 3810 * no updated data, so remove it from hash list and the stripe 3811 * will be reinitialized 3812 */ 3813 unhash: 3814 hash = sh->hash_lock_index; 3815 spin_lock_irq(conf->hash_locks + hash); 3816 remove_hash(sh); 3817 spin_unlock_irq(conf->hash_locks + hash); 3818 if (head_sh->batch_head) { 3819 sh = list_first_entry(&sh->batch_list, 3820 struct stripe_head, batch_list); 3821 if (sh != head_sh) 3822 goto unhash; 3823 } 3824 sh = head_sh; 3825 3826 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3827 set_bit(STRIPE_HANDLE, &sh->state); 3828 3829 } 3830 3831 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3832 if (atomic_dec_and_test(&conf->pending_full_writes)) 3833 md_wakeup_thread(conf->mddev->thread); 3834 3835 if (head_sh->batch_head && do_endio) 3836 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3837 } 3838 3839 /* 3840 * For RMW in write back cache, we need extra page in prexor to store the 3841 * old data. This page is stored in dev->orig_page. 3842 * 3843 * This function checks whether we have data for prexor. 
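 * For a block that is currently in the journal, the old data we would prexor against lives in orig_page, so that page must be up to date as well.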
The exact logic 3844 * is: 3845 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3846 */ 3847 static inline bool uptodate_for_rmw(struct r5dev *dev) 3848 { 3849 return (test_bit(R5_UPTODATE, &dev->flags)) && 3850 (!test_bit(R5_InJournal, &dev->flags) || 3851 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3852 } 3853 3854 static int handle_stripe_dirtying(struct r5conf *conf, 3855 struct stripe_head *sh, 3856 struct stripe_head_state *s, 3857 int disks) 3858 { 3859 int rmw = 0, rcw = 0, i; 3860 sector_t recovery_cp = conf->mddev->recovery_cp; 3861 3862 /* Check whether resync is now happening or should start. 3863 * If yes, then the array is dirty (after unclean shutdown or 3864 * initial creation), so parity in some stripes might be inconsistent. 3865 * In this case, we need to always do reconstruct-write, to ensure 3866 * that in case of drive failure or read-error correction, we 3867 * generate correct data from the parity. 3868 */ 3869 if (conf->rmw_level == PARITY_DISABLE_RMW || 3870 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3871 s->failed == 0)) { 3872 /* Calculate the real rcw later - for now make it 3873 * look like rcw is cheaper 3874 */ 3875 rcw = 1; rmw = 2; 3876 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3877 conf->rmw_level, (unsigned long long)recovery_cp, 3878 (unsigned long long)sh->sector); 3879 } else for (i = disks; i--; ) { 3880 /* would I have to read this buffer for read_modify_write */ 3881 struct r5dev *dev = &sh->dev[i]; 3882 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3883 i == sh->pd_idx || i == sh->qd_idx || 3884 test_bit(R5_InJournal, &dev->flags)) && 3885 !test_bit(R5_LOCKED, &dev->flags) && 3886 !(uptodate_for_rmw(dev) || 3887 test_bit(R5_Wantcompute, &dev->flags))) { 3888 if (test_bit(R5_Insync, &dev->flags)) 3889 rmw++; 3890 else 3891 rmw += 2*disks; /* cannot read it */ 3892 } 3893 /* Would I have to read this buffer for reconstruct_write */ 3894 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3895 i != sh->pd_idx && i != sh->qd_idx && 3896 !test_bit(R5_LOCKED, &dev->flags) && 3897 !(test_bit(R5_UPTODATE, &dev->flags) || 3898 test_bit(R5_Wantcompute, &dev->flags))) { 3899 if (test_bit(R5_Insync, &dev->flags)) 3900 rcw++; 3901 else 3902 rcw += 2*disks; 3903 } 3904 } 3905 3906 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3907 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3908 set_bit(STRIPE_HANDLE, &sh->state); 3909 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3910 /* prefer read-modify-write, but need to get some data */ 3911 if (conf->mddev->queue) 3912 blk_add_trace_msg(conf->mddev->queue, 3913 "raid5 rmw %llu %d", 3914 (unsigned long long)sh->sector, rmw); 3915 for (i = disks; i--; ) { 3916 struct r5dev *dev = &sh->dev[i]; 3917 if (test_bit(R5_InJournal, &dev->flags) && 3918 dev->page == dev->orig_page && 3919 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3920 /* alloc page for prexor */ 3921 struct page *p = alloc_page(GFP_NOIO); 3922 3923 if (p) { 3924 dev->orig_page = p; 3925 continue; 3926 } 3927 3928 /* 3929 * alloc_page() failed, try use 3930 * disk_info->extra_page 3931 */ 3932 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3933 &conf->cache_state)) { 3934 r5c_use_extra_page(sh); 3935 break; 3936 } 3937 3938 /* extra_page in use, add to delayed_list */ 3939 set_bit(STRIPE_DELAYED, &sh->state); 3940 s->waiting_extra_page = 1; 3941 return -EAGAIN; 3942 } 3943 } 3944 3945 for (i = disks; i--; ) { 3946 struct r5dev *dev = 
&sh->dev[i]; 3947 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3948 i == sh->pd_idx || i == sh->qd_idx || 3949 test_bit(R5_InJournal, &dev->flags)) && 3950 !test_bit(R5_LOCKED, &dev->flags) && 3951 !(uptodate_for_rmw(dev) || 3952 test_bit(R5_Wantcompute, &dev->flags)) && 3953 test_bit(R5_Insync, &dev->flags)) { 3954 if (test_bit(STRIPE_PREREAD_ACTIVE, 3955 &sh->state)) { 3956 pr_debug("Read_old block %d for r-m-w\n", 3957 i); 3958 set_bit(R5_LOCKED, &dev->flags); 3959 set_bit(R5_Wantread, &dev->flags); 3960 s->locked++; 3961 } else { 3962 set_bit(STRIPE_DELAYED, &sh->state); 3963 set_bit(STRIPE_HANDLE, &sh->state); 3964 } 3965 } 3966 } 3967 } 3968 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3969 /* want reconstruct write, but need to get some data */ 3970 int qread =0; 3971 rcw = 0; 3972 for (i = disks; i--; ) { 3973 struct r5dev *dev = &sh->dev[i]; 3974 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3975 i != sh->pd_idx && i != sh->qd_idx && 3976 !test_bit(R5_LOCKED, &dev->flags) && 3977 !(test_bit(R5_UPTODATE, &dev->flags) || 3978 test_bit(R5_Wantcompute, &dev->flags))) { 3979 rcw++; 3980 if (test_bit(R5_Insync, &dev->flags) && 3981 test_bit(STRIPE_PREREAD_ACTIVE, 3982 &sh->state)) { 3983 pr_debug("Read_old block " 3984 "%d for Reconstruct\n", i); 3985 set_bit(R5_LOCKED, &dev->flags); 3986 set_bit(R5_Wantread, &dev->flags); 3987 s->locked++; 3988 qread++; 3989 } else { 3990 set_bit(STRIPE_DELAYED, &sh->state); 3991 set_bit(STRIPE_HANDLE, &sh->state); 3992 } 3993 } 3994 } 3995 if (rcw && conf->mddev->queue) 3996 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3997 (unsigned long long)sh->sector, 3998 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3999 } 4000 4001 if (rcw > disks && rmw > disks && 4002 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4003 set_bit(STRIPE_DELAYED, &sh->state); 4004 4005 /* now if nothing is locked, and if we have enough data, 4006 * we can start a write request 4007 */ 4008 /* since handle_stripe can be called at any time we need to handle the 4009 * case where a compute block operation has been submitted and then a 4010 * subsequent call wants to start a write request. raid_run_ops only 4011 * handles the case where compute block and reconstruct are requested 4012 * simultaneously. If this is not the case then new writes need to be 4013 * held off until the compute completes. 
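 * (That is what the 's->req_compute || !STRIPE_COMPUTE_RUN' test below enforces: either this pass queued the compute itself, or no compute is currently in flight.)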
4014 */ 4015 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4016 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4017 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4018 schedule_reconstruction(sh, s, rcw == 0, 0); 4019 return 0; 4020 } 4021 4022 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4023 struct stripe_head_state *s, int disks) 4024 { 4025 struct r5dev *dev = NULL; 4026 4027 BUG_ON(sh->batch_head); 4028 set_bit(STRIPE_HANDLE, &sh->state); 4029 4030 switch (sh->check_state) { 4031 case check_state_idle: 4032 /* start a new check operation if there are no failures */ 4033 if (s->failed == 0) { 4034 BUG_ON(s->uptodate != disks); 4035 sh->check_state = check_state_run; 4036 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4037 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4038 s->uptodate--; 4039 break; 4040 } 4041 dev = &sh->dev[s->failed_num[0]]; 4042 /* fall through */ 4043 case check_state_compute_result: 4044 sh->check_state = check_state_idle; 4045 if (!dev) 4046 dev = &sh->dev[sh->pd_idx]; 4047 4048 /* check that a write has not made the stripe insync */ 4049 if (test_bit(STRIPE_INSYNC, &sh->state)) 4050 break; 4051 4052 /* either failed parity check, or recovery is happening */ 4053 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4054 BUG_ON(s->uptodate != disks); 4055 4056 set_bit(R5_LOCKED, &dev->flags); 4057 s->locked++; 4058 set_bit(R5_Wantwrite, &dev->flags); 4059 4060 clear_bit(STRIPE_DEGRADED, &sh->state); 4061 set_bit(STRIPE_INSYNC, &sh->state); 4062 break; 4063 case check_state_run: 4064 break; /* we will be called again upon completion */ 4065 case check_state_check_result: 4066 sh->check_state = check_state_idle; 4067 4068 /* if a failure occurred during the check operation, leave 4069 * STRIPE_INSYNC not set and let the stripe be handled again 4070 */ 4071 if (s->failed) 4072 break; 4073 4074 /* handle a successful check operation, if parity is correct 4075 * we are done. Otherwise update the mismatch count and repair 4076 * parity if !MD_RECOVERY_CHECK 4077 */ 4078 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4079 /* parity is correct (on disc, 4080 * not in buffer any more) 4081 */ 4082 set_bit(STRIPE_INSYNC, &sh->state); 4083 else { 4084 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4085 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4086 /* don't try to repair!! 
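 * (a check-only pass was requested, so just record the mismatch)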
*/ 4087 set_bit(STRIPE_INSYNC, &sh->state); 4088 pr_warn_ratelimited("%s: mismatch sector in range " 4089 "%llu-%llu\n", mdname(conf->mddev), 4090 (unsigned long long) sh->sector, 4091 (unsigned long long) sh->sector + 4092 STRIPE_SECTORS); 4093 } else { 4094 sh->check_state = check_state_compute_run; 4095 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4096 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4097 set_bit(R5_Wantcompute, 4098 &sh->dev[sh->pd_idx].flags); 4099 sh->ops.target = sh->pd_idx; 4100 sh->ops.target2 = -1; 4101 s->uptodate++; 4102 } 4103 } 4104 break; 4105 case check_state_compute_run: 4106 break; 4107 default: 4108 pr_err("%s: unknown check_state: %d sector: %llu\n", 4109 __func__, sh->check_state, 4110 (unsigned long long) sh->sector); 4111 BUG(); 4112 } 4113 } 4114 4115 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4116 struct stripe_head_state *s, 4117 int disks) 4118 { 4119 int pd_idx = sh->pd_idx; 4120 int qd_idx = sh->qd_idx; 4121 struct r5dev *dev; 4122 4123 BUG_ON(sh->batch_head); 4124 set_bit(STRIPE_HANDLE, &sh->state); 4125 4126 BUG_ON(s->failed > 2); 4127 4128 /* Want to check and possibly repair P and Q. 4129 * However there could be one 'failed' device, in which 4130 * case we can only check one of them, possibly using the 4131 * other to generate missing data 4132 */ 4133 4134 switch (sh->check_state) { 4135 case check_state_idle: 4136 /* start a new check operation if there are < 2 failures */ 4137 if (s->failed == s->q_failed) { 4138 /* The only possible failed device holds Q, so it 4139 * makes sense to check P (If anything else were failed, 4140 * we would have used P to recreate it). 4141 */ 4142 sh->check_state = check_state_run; 4143 } 4144 if (!s->q_failed && s->failed < 2) { 4145 /* Q is not failed, and we didn't use it to generate 4146 * anything, so it makes sense to check it 4147 */ 4148 if (sh->check_state == check_state_run) 4149 sh->check_state = check_state_run_pq; 4150 else 4151 sh->check_state = check_state_run_q; 4152 } 4153 4154 /* discard potentially stale zero_sum_result */ 4155 sh->ops.zero_sum_result = 0; 4156 4157 if (sh->check_state == check_state_run) { 4158 /* async_xor_zero_sum destroys the contents of P */ 4159 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4160 s->uptodate--; 4161 } 4162 if (sh->check_state >= check_state_run && 4163 sh->check_state <= check_state_run_pq) { 4164 /* async_syndrome_zero_sum preserves P and Q, so 4165 * no need to mark them !uptodate here 4166 */ 4167 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4168 break; 4169 } 4170 4171 /* we have 2-disk failure */ 4172 BUG_ON(s->failed != 2); 4173 /* fall through */ 4174 case check_state_compute_result: 4175 sh->check_state = check_state_idle; 4176 4177 /* check that a write has not made the stripe insync */ 4178 if (test_bit(STRIPE_INSYNC, &sh->state)) 4179 break; 4180 4181 /* now write out any block on a failed drive, 4182 * or P or Q if they were recomputed 4183 */ 4184 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 4185 if (s->failed == 2) { 4186 dev = &sh->dev[s->failed_num[1]]; 4187 s->locked++; 4188 set_bit(R5_LOCKED, &dev->flags); 4189 set_bit(R5_Wantwrite, &dev->flags); 4190 } 4191 if (s->failed >= 1) { 4192 dev = &sh->dev[s->failed_num[0]]; 4193 s->locked++; 4194 set_bit(R5_LOCKED, &dev->flags); 4195 set_bit(R5_Wantwrite, &dev->flags); 4196 } 4197 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4198 dev = &sh->dev[pd_idx]; 4199 s->locked++; 4200 set_bit(R5_LOCKED, &dev->flags); 4201 
set_bit(R5_Wantwrite, &dev->flags); 4202 } 4203 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4204 dev = &sh->dev[qd_idx]; 4205 s->locked++; 4206 set_bit(R5_LOCKED, &dev->flags); 4207 set_bit(R5_Wantwrite, &dev->flags); 4208 } 4209 clear_bit(STRIPE_DEGRADED, &sh->state); 4210 4211 set_bit(STRIPE_INSYNC, &sh->state); 4212 break; 4213 case check_state_run: 4214 case check_state_run_q: 4215 case check_state_run_pq: 4216 break; /* we will be called again upon completion */ 4217 case check_state_check_result: 4218 sh->check_state = check_state_idle; 4219 4220 /* handle a successful check operation, if parity is correct 4221 * we are done. Otherwise update the mismatch count and repair 4222 * parity if !MD_RECOVERY_CHECK 4223 */ 4224 if (sh->ops.zero_sum_result == 0) { 4225 /* both parities are correct */ 4226 if (!s->failed) 4227 set_bit(STRIPE_INSYNC, &sh->state); 4228 else { 4229 /* in contrast to the raid5 case we can validate 4230 * parity, but still have a failure to write 4231 * back 4232 */ 4233 sh->check_state = check_state_compute_result; 4234 /* Returning at this point means that we may go 4235 * off and bring p and/or q uptodate again so 4236 * we make sure to check zero_sum_result again 4237 * to verify if p or q need writeback 4238 */ 4239 } 4240 } else { 4241 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4242 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4243 /* don't try to repair!! */ 4244 set_bit(STRIPE_INSYNC, &sh->state); 4245 pr_warn_ratelimited("%s: mismatch sector in range " 4246 "%llu-%llu\n", mdname(conf->mddev), 4247 (unsigned long long) sh->sector, 4248 (unsigned long long) sh->sector + 4249 STRIPE_SECTORS); 4250 } else { 4251 int *target = &sh->ops.target; 4252 4253 sh->ops.target = -1; 4254 sh->ops.target2 = -1; 4255 sh->check_state = check_state_compute_run; 4256 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4257 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4258 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4259 set_bit(R5_Wantcompute, 4260 &sh->dev[pd_idx].flags); 4261 *target = pd_idx; 4262 target = &sh->ops.target2; 4263 s->uptodate++; 4264 } 4265 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4266 set_bit(R5_Wantcompute, 4267 &sh->dev[qd_idx].flags); 4268 *target = qd_idx; 4269 s->uptodate++; 4270 } 4271 } 4272 } 4273 break; 4274 case check_state_compute_run: 4275 break; 4276 default: 4277 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4278 __func__, sh->check_state, 4279 (unsigned long long) sh->sector); 4280 BUG(); 4281 } 4282 } 4283 4284 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4285 { 4286 int i; 4287 4288 /* We have read all the blocks in this stripe and now we need to 4289 * copy some of them into a target stripe for expand. 4290 */ 4291 struct dma_async_tx_descriptor *tx = NULL; 4292 BUG_ON(sh->batch_head); 4293 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4294 for (i = 0; i < sh->disks; i++) 4295 if (i != sh->pd_idx && i != sh->qd_idx) { 4296 int dd_idx, j; 4297 struct stripe_head *sh2; 4298 struct async_submit_ctl submit; 4299 4300 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4301 sector_t s = raid5_compute_sector(conf, bn, 0, 4302 &dd_idx, NULL); 4303 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4304 if (sh2 == NULL) 4305 /* so far only the early blocks of this stripe 4306 * have been requested. 
When later blocks 4307 * get requested, we will try again 4308 */ 4309 continue; 4310 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4311 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4312 /* must have already done this block */ 4313 raid5_release_stripe(sh2); 4314 continue; 4315 } 4316 4317 /* place all the copies on one channel */ 4318 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4319 tx = async_memcpy(sh2->dev[dd_idx].page, 4320 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4321 &submit); 4322 4323 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4324 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4325 for (j = 0; j < conf->raid_disks; j++) 4326 if (j != sh2->pd_idx && 4327 j != sh2->qd_idx && 4328 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4329 break; 4330 if (j == conf->raid_disks) { 4331 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4332 set_bit(STRIPE_HANDLE, &sh2->state); 4333 } 4334 raid5_release_stripe(sh2); 4335 4336 } 4337 /* done submitting copies, wait for them to complete */ 4338 async_tx_quiesce(&tx); 4339 } 4340 4341 /* 4342 * handle_stripe - do things to a stripe. 4343 * 4344 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4345 * state of various bits to see what needs to be done. 4346 * Possible results: 4347 * return some read requests which now have data 4348 * return some write requests which are safely on storage 4349 * schedule a read on some buffers 4350 * schedule a write of some buffers 4351 * return confirmation of parity correctness 4352 * 4353 */ 4354 4355 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4356 { 4357 struct r5conf *conf = sh->raid_conf; 4358 int disks = sh->disks; 4359 struct r5dev *dev; 4360 int i; 4361 int do_recovery = 0; 4362 4363 memset(s, 0, sizeof(*s)); 4364 4365 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4366 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4367 s->failed_num[0] = -1; 4368 s->failed_num[1] = -1; 4369 s->log_failed = r5l_log_disk_error(conf); 4370 4371 /* Now to look around and see what can be done */ 4372 rcu_read_lock(); 4373 for (i=disks; i--; ) { 4374 struct md_rdev *rdev; 4375 sector_t first_bad; 4376 int bad_sectors; 4377 int is_bad = 0; 4378 4379 dev = &sh->dev[i]; 4380 4381 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4382 i, dev->flags, 4383 dev->toread, dev->towrite, dev->written); 4384 /* maybe we can reply to a read 4385 * 4386 * new wantfill requests are only permitted while 4387 * ops_complete_biofill is guaranteed to be inactive 4388 */ 4389 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4390 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4391 set_bit(R5_Wantfill, &dev->flags); 4392 4393 /* now count some things */ 4394 if (test_bit(R5_LOCKED, &dev->flags)) 4395 s->locked++; 4396 if (test_bit(R5_UPTODATE, &dev->flags)) 4397 s->uptodate++; 4398 if (test_bit(R5_Wantcompute, &dev->flags)) { 4399 s->compute++; 4400 BUG_ON(s->compute > 2); 4401 } 4402 4403 if (test_bit(R5_Wantfill, &dev->flags)) 4404 s->to_fill++; 4405 else if (dev->toread) 4406 s->to_read++; 4407 if (dev->towrite) { 4408 s->to_write++; 4409 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4410 s->non_overwrite++; 4411 } 4412 if (dev->written) 4413 s->written++; 4414 /* Prefer to use the replacement for reads, but only 4415 * if it is recovered enough and has no bad blocks. 
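 * ('recovered enough' means its recovery_offset is past the end of this stripe, and is_badblock() reports nothing in the sectors we would read.)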
4416 */ 4417 rdev = rcu_dereference(conf->disks[i].replacement); 4418 if (rdev && !test_bit(Faulty, &rdev->flags) && 4419 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4420 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4421 &first_bad, &bad_sectors)) 4422 set_bit(R5_ReadRepl, &dev->flags); 4423 else { 4424 if (rdev && !test_bit(Faulty, &rdev->flags)) 4425 set_bit(R5_NeedReplace, &dev->flags); 4426 else 4427 clear_bit(R5_NeedReplace, &dev->flags); 4428 rdev = rcu_dereference(conf->disks[i].rdev); 4429 clear_bit(R5_ReadRepl, &dev->flags); 4430 } 4431 if (rdev && test_bit(Faulty, &rdev->flags)) 4432 rdev = NULL; 4433 if (rdev) { 4434 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4435 &first_bad, &bad_sectors); 4436 if (s->blocked_rdev == NULL 4437 && (test_bit(Blocked, &rdev->flags) 4438 || is_bad < 0)) { 4439 if (is_bad < 0) 4440 set_bit(BlockedBadBlocks, 4441 &rdev->flags); 4442 s->blocked_rdev = rdev; 4443 atomic_inc(&rdev->nr_pending); 4444 } 4445 } 4446 clear_bit(R5_Insync, &dev->flags); 4447 if (!rdev) 4448 /* Not in-sync */; 4449 else if (is_bad) { 4450 /* also not in-sync */ 4451 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4452 test_bit(R5_UPTODATE, &dev->flags)) { 4453 /* treat as in-sync, but with a read error 4454 * which we can now try to correct 4455 */ 4456 set_bit(R5_Insync, &dev->flags); 4457 set_bit(R5_ReadError, &dev->flags); 4458 } 4459 } else if (test_bit(In_sync, &rdev->flags)) 4460 set_bit(R5_Insync, &dev->flags); 4461 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4462 /* in sync if before recovery_offset */ 4463 set_bit(R5_Insync, &dev->flags); 4464 else if (test_bit(R5_UPTODATE, &dev->flags) && 4465 test_bit(R5_Expanded, &dev->flags)) 4466 /* If we've reshaped into here, we assume it is Insync. 4467 * We will shortly update recovery_offset to make 4468 * it official. 
4469 */ 4470 set_bit(R5_Insync, &dev->flags); 4471 4472 if (test_bit(R5_WriteError, &dev->flags)) { 4473 /* This flag does not apply to '.replacement' 4474 * only to .rdev, so make sure to check that*/ 4475 struct md_rdev *rdev2 = rcu_dereference( 4476 conf->disks[i].rdev); 4477 if (rdev2 == rdev) 4478 clear_bit(R5_Insync, &dev->flags); 4479 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4480 s->handle_bad_blocks = 1; 4481 atomic_inc(&rdev2->nr_pending); 4482 } else 4483 clear_bit(R5_WriteError, &dev->flags); 4484 } 4485 if (test_bit(R5_MadeGood, &dev->flags)) { 4486 /* This flag does not apply to '.replacement' 4487 * only to .rdev, so make sure to check that*/ 4488 struct md_rdev *rdev2 = rcu_dereference( 4489 conf->disks[i].rdev); 4490 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4491 s->handle_bad_blocks = 1; 4492 atomic_inc(&rdev2->nr_pending); 4493 } else 4494 clear_bit(R5_MadeGood, &dev->flags); 4495 } 4496 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4497 struct md_rdev *rdev2 = rcu_dereference( 4498 conf->disks[i].replacement); 4499 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4500 s->handle_bad_blocks = 1; 4501 atomic_inc(&rdev2->nr_pending); 4502 } else 4503 clear_bit(R5_MadeGoodRepl, &dev->flags); 4504 } 4505 if (!test_bit(R5_Insync, &dev->flags)) { 4506 /* The ReadError flag will just be confusing now */ 4507 clear_bit(R5_ReadError, &dev->flags); 4508 clear_bit(R5_ReWrite, &dev->flags); 4509 } 4510 if (test_bit(R5_ReadError, &dev->flags)) 4511 clear_bit(R5_Insync, &dev->flags); 4512 if (!test_bit(R5_Insync, &dev->flags)) { 4513 if (s->failed < 2) 4514 s->failed_num[s->failed] = i; 4515 s->failed++; 4516 if (rdev && !test_bit(Faulty, &rdev->flags)) 4517 do_recovery = 1; 4518 } 4519 4520 if (test_bit(R5_InJournal, &dev->flags)) 4521 s->injournal++; 4522 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4523 s->just_cached++; 4524 } 4525 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4526 /* If there is a failed device being replaced, 4527 * we must be recovering. 4528 * else if we are after recovery_cp, we must be syncing 4529 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4530 * else we can only be replacing 4531 * sync and recovery both need to read all devices, and so 4532 * use the same flag. 4533 */ 4534 if (do_recovery || 4535 sh->sector >= conf->mddev->recovery_cp || 4536 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4537 s->syncing = 1; 4538 else 4539 s->replacing = 1; 4540 } 4541 rcu_read_unlock(); 4542 } 4543 4544 static int clear_batch_ready(struct stripe_head *sh) 4545 { 4546 /* Return '1' if this is a member of batch, or 4547 * '0' if it is a lone stripe or a head which can now be 4548 * handled. 4549 */ 4550 struct stripe_head *tmp; 4551 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4552 return (sh->batch_head && sh->batch_head != sh); 4553 spin_lock(&sh->stripe_lock); 4554 if (!sh->batch_head) { 4555 spin_unlock(&sh->stripe_lock); 4556 return 0; 4557 } 4558 4559 /* 4560 * this stripe could be added to a batch list before we check 4561 * BATCH_READY, skips it 4562 */ 4563 if (sh->batch_head != sh) { 4564 spin_unlock(&sh->stripe_lock); 4565 return 1; 4566 } 4567 spin_lock(&sh->batch_lock); 4568 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4569 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4570 spin_unlock(&sh->batch_lock); 4571 spin_unlock(&sh->stripe_lock); 4572 4573 /* 4574 * BATCH_READY is cleared, no new stripes can be added. 
4575 * batch_list can be accessed without lock 4576 */ 4577 return 0; 4578 } 4579 4580 static void break_stripe_batch_list(struct stripe_head *head_sh, 4581 unsigned long handle_flags) 4582 { 4583 struct stripe_head *sh, *next; 4584 int i; 4585 int do_wakeup = 0; 4586 4587 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4588 4589 list_del_init(&sh->batch_list); 4590 4591 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4592 (1 << STRIPE_SYNCING) | 4593 (1 << STRIPE_REPLACED) | 4594 (1 << STRIPE_DELAYED) | 4595 (1 << STRIPE_BIT_DELAY) | 4596 (1 << STRIPE_FULL_WRITE) | 4597 (1 << STRIPE_BIOFILL_RUN) | 4598 (1 << STRIPE_COMPUTE_RUN) | 4599 (1 << STRIPE_OPS_REQ_PENDING) | 4600 (1 << STRIPE_DISCARD) | 4601 (1 << STRIPE_BATCH_READY) | 4602 (1 << STRIPE_BATCH_ERR) | 4603 (1 << STRIPE_BITMAP_PENDING)), 4604 "stripe state: %lx\n", sh->state); 4605 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4606 (1 << STRIPE_REPLACED)), 4607 "head stripe state: %lx\n", head_sh->state); 4608 4609 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4610 (1 << STRIPE_PREREAD_ACTIVE) | 4611 (1 << STRIPE_DEGRADED) | 4612 (1 << STRIPE_ON_UNPLUG_LIST)), 4613 head_sh->state & (1 << STRIPE_INSYNC)); 4614 4615 sh->check_state = head_sh->check_state; 4616 sh->reconstruct_state = head_sh->reconstruct_state; 4617 for (i = 0; i < sh->disks; i++) { 4618 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4619 do_wakeup = 1; 4620 sh->dev[i].flags = head_sh->dev[i].flags & 4621 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4622 } 4623 spin_lock_irq(&sh->stripe_lock); 4624 sh->batch_head = NULL; 4625 spin_unlock_irq(&sh->stripe_lock); 4626 if (handle_flags == 0 || 4627 sh->state & handle_flags) 4628 set_bit(STRIPE_HANDLE, &sh->state); 4629 raid5_release_stripe(sh); 4630 } 4631 spin_lock_irq(&head_sh->stripe_lock); 4632 head_sh->batch_head = NULL; 4633 spin_unlock_irq(&head_sh->stripe_lock); 4634 for (i = 0; i < head_sh->disks; i++) 4635 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4636 do_wakeup = 1; 4637 if (head_sh->state & handle_flags) 4638 set_bit(STRIPE_HANDLE, &head_sh->state); 4639 4640 if (do_wakeup) 4641 wake_up(&head_sh->raid_conf->wait_for_overlap); 4642 } 4643 4644 static void handle_stripe(struct stripe_head *sh) 4645 { 4646 struct stripe_head_state s; 4647 struct r5conf *conf = sh->raid_conf; 4648 int i; 4649 int prexor; 4650 int disks = sh->disks; 4651 struct r5dev *pdev, *qdev; 4652 4653 clear_bit(STRIPE_HANDLE, &sh->state); 4654 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4655 /* already being handled, ensure it gets handled 4656 * again when current action finishes */ 4657 set_bit(STRIPE_HANDLE, &sh->state); 4658 return; 4659 } 4660 4661 if (clear_batch_ready(sh) ) { 4662 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4663 return; 4664 } 4665 4666 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4667 break_stripe_batch_list(sh, 0); 4668 4669 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4670 spin_lock(&sh->stripe_lock); 4671 /* 4672 * Cannot process 'sync' concurrently with 'discard'. 4673 * Flush data in r5cache before 'sync'. 
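 * (STRIPE_SYNCING is therefore only set below once the stripe is neither held in the r5c cache nor in the middle of a discard.)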
4674 */ 4675 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 4676 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && 4677 !test_bit(STRIPE_DISCARD, &sh->state) && 4678 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4679 set_bit(STRIPE_SYNCING, &sh->state); 4680 clear_bit(STRIPE_INSYNC, &sh->state); 4681 clear_bit(STRIPE_REPLACED, &sh->state); 4682 } 4683 spin_unlock(&sh->stripe_lock); 4684 } 4685 clear_bit(STRIPE_DELAYED, &sh->state); 4686 4687 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4688 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 4689 (unsigned long long)sh->sector, sh->state, 4690 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4691 sh->check_state, sh->reconstruct_state); 4692 4693 analyse_stripe(sh, &s); 4694 4695 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4696 goto finish; 4697 4698 if (s.handle_bad_blocks || 4699 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4700 set_bit(STRIPE_HANDLE, &sh->state); 4701 goto finish; 4702 } 4703 4704 if (unlikely(s.blocked_rdev)) { 4705 if (s.syncing || s.expanding || s.expanded || 4706 s.replacing || s.to_write || s.written) { 4707 set_bit(STRIPE_HANDLE, &sh->state); 4708 goto finish; 4709 } 4710 /* There is nothing for the blocked_rdev to block */ 4711 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4712 s.blocked_rdev = NULL; 4713 } 4714 4715 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4716 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4717 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4718 } 4719 4720 pr_debug("locked=%d uptodate=%d to_read=%d" 4721 " to_write=%d failed=%d failed_num=%d,%d\n", 4722 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4723 s.failed_num[0], s.failed_num[1]); 4724 /* 4725 * check if the array has lost more than max_degraded devices and, 4726 * if so, some requests might need to be failed.
4727 * 4728 * When journal device failed (log_failed), we will only process 4729 * the stripe if there is data need write to raid disks 4730 */ 4731 if (s.failed > conf->max_degraded || 4732 (s.log_failed && s.injournal == 0)) { 4733 sh->check_state = 0; 4734 sh->reconstruct_state = 0; 4735 break_stripe_batch_list(sh, 0); 4736 if (s.to_read+s.to_write+s.written) 4737 handle_failed_stripe(conf, sh, &s, disks); 4738 if (s.syncing + s.replacing) 4739 handle_failed_sync(conf, sh, &s); 4740 } 4741 4742 /* Now we check to see if any write operations have recently 4743 * completed 4744 */ 4745 prexor = 0; 4746 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4747 prexor = 1; 4748 if (sh->reconstruct_state == reconstruct_state_drain_result || 4749 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4750 sh->reconstruct_state = reconstruct_state_idle; 4751 4752 /* All the 'written' buffers and the parity block are ready to 4753 * be written back to disk 4754 */ 4755 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4756 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4757 BUG_ON(sh->qd_idx >= 0 && 4758 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4759 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4760 for (i = disks; i--; ) { 4761 struct r5dev *dev = &sh->dev[i]; 4762 if (test_bit(R5_LOCKED, &dev->flags) && 4763 (i == sh->pd_idx || i == sh->qd_idx || 4764 dev->written || test_bit(R5_InJournal, 4765 &dev->flags))) { 4766 pr_debug("Writing block %d\n", i); 4767 set_bit(R5_Wantwrite, &dev->flags); 4768 if (prexor) 4769 continue; 4770 if (s.failed > 1) 4771 continue; 4772 if (!test_bit(R5_Insync, &dev->flags) || 4773 ((i == sh->pd_idx || i == sh->qd_idx) && 4774 s.failed == 0)) 4775 set_bit(STRIPE_INSYNC, &sh->state); 4776 } 4777 } 4778 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4779 s.dec_preread_active = 1; 4780 } 4781 4782 /* 4783 * might be able to return some write requests if the parity blocks 4784 * are safe, or on a failed drive 4785 */ 4786 pdev = &sh->dev[sh->pd_idx]; 4787 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4788 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4789 qdev = &sh->dev[sh->qd_idx]; 4790 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4791 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4792 || conf->level < 6; 4793 4794 if (s.written && 4795 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4796 && !test_bit(R5_LOCKED, &pdev->flags) 4797 && (test_bit(R5_UPTODATE, &pdev->flags) || 4798 test_bit(R5_Discard, &pdev->flags))))) && 4799 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4800 && !test_bit(R5_LOCKED, &qdev->flags) 4801 && (test_bit(R5_UPTODATE, &qdev->flags) || 4802 test_bit(R5_Discard, &qdev->flags)))))) 4803 handle_stripe_clean_event(conf, sh, disks); 4804 4805 if (s.just_cached) 4806 r5c_handle_cached_data_endio(conf, sh, disks); 4807 log_stripe_write_finished(sh); 4808 4809 /* Now we might consider reading some blocks, either to check/generate 4810 * parity, or to satisfy requests 4811 * or to load a block that is being partially written. 
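 * (The condition below covers exactly those cases: pending reads, non-overwrite writes, a RAID6 write with a failed device, a sync that still lacks blocks, a replacement, or an expansion.)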
4812 */ 4813 if (s.to_read || s.non_overwrite 4814 || (conf->level == 6 && s.to_write && s.failed) 4815 || (s.syncing && (s.uptodate + s.compute < disks)) 4816 || s.replacing 4817 || s.expanding) 4818 handle_stripe_fill(sh, &s, disks); 4819 4820 /* 4821 * When the stripe finishes full journal write cycle (write to journal 4822 * and raid disk), this is the clean up procedure so it is ready for 4823 * next operation. 4824 */ 4825 r5c_finish_stripe_write_out(conf, sh, &s); 4826 4827 /* 4828 * Now to consider new write requests, cache write back and what else, 4829 * if anything should be read. We do not handle new writes when: 4830 * 1/ A 'write' operation (copy+xor) is already in flight. 4831 * 2/ A 'check' operation is in flight, as it may clobber the parity 4832 * block. 4833 * 3/ A r5c cache log write is in flight. 4834 */ 4835 4836 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4837 if (!r5c_is_writeback(conf->log)) { 4838 if (s.to_write) 4839 handle_stripe_dirtying(conf, sh, &s, disks); 4840 } else { /* write back cache */ 4841 int ret = 0; 4842 4843 /* First, try handle writes in caching phase */ 4844 if (s.to_write) 4845 ret = r5c_try_caching_write(conf, sh, &s, 4846 disks); 4847 /* 4848 * If caching phase failed: ret == -EAGAIN 4849 * OR 4850 * stripe under reclaim: !caching && injournal 4851 * 4852 * fall back to handle_stripe_dirtying() 4853 */ 4854 if (ret == -EAGAIN || 4855 /* stripe under reclaim: !caching && injournal */ 4856 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4857 s.injournal > 0)) { 4858 ret = handle_stripe_dirtying(conf, sh, &s, 4859 disks); 4860 if (ret == -EAGAIN) 4861 goto finish; 4862 } 4863 } 4864 } 4865 4866 /* maybe we need to check and possibly fix the parity for this stripe 4867 * Any reads will already have been scheduled, so we just see if enough 4868 * data is available. The parity check is held off while parity 4869 * dependent operations are in flight. 
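 * (hence the s.locked == 0 and !STRIPE_COMPUTE_RUN tests below)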
4870 */ 4871 if (sh->check_state || 4872 (s.syncing && s.locked == 0 && 4873 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4874 !test_bit(STRIPE_INSYNC, &sh->state))) { 4875 if (conf->level == 6) 4876 handle_parity_checks6(conf, sh, &s, disks); 4877 else 4878 handle_parity_checks5(conf, sh, &s, disks); 4879 } 4880 4881 if ((s.replacing || s.syncing) && s.locked == 0 4882 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4883 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4884 /* Write out to replacement devices where possible */ 4885 for (i = 0; i < conf->raid_disks; i++) 4886 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4887 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4888 set_bit(R5_WantReplace, &sh->dev[i].flags); 4889 set_bit(R5_LOCKED, &sh->dev[i].flags); 4890 s.locked++; 4891 } 4892 if (s.replacing) 4893 set_bit(STRIPE_INSYNC, &sh->state); 4894 set_bit(STRIPE_REPLACED, &sh->state); 4895 } 4896 if ((s.syncing || s.replacing) && s.locked == 0 && 4897 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4898 test_bit(STRIPE_INSYNC, &sh->state)) { 4899 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4900 clear_bit(STRIPE_SYNCING, &sh->state); 4901 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4902 wake_up(&conf->wait_for_overlap); 4903 } 4904 4905 /* If the failed drives are just a ReadError, then we might need 4906 * to progress the repair/check process 4907 */ 4908 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4909 for (i = 0; i < s.failed; i++) { 4910 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4911 if (test_bit(R5_ReadError, &dev->flags) 4912 && !test_bit(R5_LOCKED, &dev->flags) 4913 && test_bit(R5_UPTODATE, &dev->flags) 4914 ) { 4915 if (!test_bit(R5_ReWrite, &dev->flags)) { 4916 set_bit(R5_Wantwrite, &dev->flags); 4917 set_bit(R5_ReWrite, &dev->flags); 4918 set_bit(R5_LOCKED, &dev->flags); 4919 s.locked++; 4920 } else { 4921 /* let's read it back */ 4922 set_bit(R5_Wantread, &dev->flags); 4923 set_bit(R5_LOCKED, &dev->flags); 4924 s.locked++; 4925 } 4926 } 4927 } 4928 4929 /* Finish reconstruct operations initiated by the expansion process */ 4930 if (sh->reconstruct_state == reconstruct_state_result) { 4931 struct stripe_head *sh_src 4932 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4933 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4934 /* sh cannot be written until sh_src has been read. 
4935 * so arrange for sh to be delayed a little 4936 */ 4937 set_bit(STRIPE_DELAYED, &sh->state); 4938 set_bit(STRIPE_HANDLE, &sh->state); 4939 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4940 &sh_src->state)) 4941 atomic_inc(&conf->preread_active_stripes); 4942 raid5_release_stripe(sh_src); 4943 goto finish; 4944 } 4945 if (sh_src) 4946 raid5_release_stripe(sh_src); 4947 4948 sh->reconstruct_state = reconstruct_state_idle; 4949 clear_bit(STRIPE_EXPANDING, &sh->state); 4950 for (i = conf->raid_disks; i--; ) { 4951 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4952 set_bit(R5_LOCKED, &sh->dev[i].flags); 4953 s.locked++; 4954 } 4955 } 4956 4957 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4958 !sh->reconstruct_state) { 4959 /* Need to write out all blocks after computing parity */ 4960 sh->disks = conf->raid_disks; 4961 stripe_set_idx(sh->sector, conf, 0, sh); 4962 schedule_reconstruction(sh, &s, 1, 1); 4963 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4964 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4965 atomic_dec(&conf->reshape_stripes); 4966 wake_up(&conf->wait_for_overlap); 4967 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4968 } 4969 4970 if (s.expanding && s.locked == 0 && 4971 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4972 handle_stripe_expansion(conf, sh); 4973 4974 finish: 4975 /* wait for this device to become unblocked */ 4976 if (unlikely(s.blocked_rdev)) { 4977 if (conf->mddev->external) 4978 md_wait_for_blocked_rdev(s.blocked_rdev, 4979 conf->mddev); 4980 else 4981 /* Internal metadata will immediately 4982 * be written by raid5d, so we don't 4983 * need to wait here. 4984 */ 4985 rdev_dec_pending(s.blocked_rdev, 4986 conf->mddev); 4987 } 4988 4989 if (s.handle_bad_blocks) 4990 for (i = disks; i--; ) { 4991 struct md_rdev *rdev; 4992 struct r5dev *dev = &sh->dev[i]; 4993 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4994 /* We own a safe reference to the rdev */ 4995 rdev = conf->disks[i].rdev; 4996 if (!rdev_set_badblocks(rdev, sh->sector, 4997 STRIPE_SECTORS, 0)) 4998 md_error(conf->mddev, rdev); 4999 rdev_dec_pending(rdev, conf->mddev); 5000 } 5001 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 5002 rdev = conf->disks[i].rdev; 5003 rdev_clear_badblocks(rdev, sh->sector, 5004 STRIPE_SECTORS, 0); 5005 rdev_dec_pending(rdev, conf->mddev); 5006 } 5007 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 5008 rdev = conf->disks[i].replacement; 5009 if (!rdev) 5010 /* rdev have been moved down */ 5011 rdev = conf->disks[i].rdev; 5012 rdev_clear_badblocks(rdev, sh->sector, 5013 STRIPE_SECTORS, 0); 5014 rdev_dec_pending(rdev, conf->mddev); 5015 } 5016 } 5017 5018 if (s.ops_request) 5019 raid_run_ops(sh, s.ops_request); 5020 5021 ops_run_io(sh, &s); 5022 5023 if (s.dec_preread_active) { 5024 /* We delay this until after ops_run_io so that if make_request 5025 * is waiting on a flush, it won't continue until the writes 5026 * have actually been submitted. 
5027 */ 5028 atomic_dec(&conf->preread_active_stripes); 5029 if (atomic_read(&conf->preread_active_stripes) < 5030 IO_THRESHOLD) 5031 md_wakeup_thread(conf->mddev->thread); 5032 } 5033 5034 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5035 } 5036 5037 static void raid5_activate_delayed(struct r5conf *conf) 5038 { 5039 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5040 while (!list_empty(&conf->delayed_list)) { 5041 struct list_head *l = conf->delayed_list.next; 5042 struct stripe_head *sh; 5043 sh = list_entry(l, struct stripe_head, lru); 5044 list_del_init(l); 5045 clear_bit(STRIPE_DELAYED, &sh->state); 5046 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5047 atomic_inc(&conf->preread_active_stripes); 5048 list_add_tail(&sh->lru, &conf->hold_list); 5049 raid5_wakeup_stripe_thread(sh); 5050 } 5051 } 5052 } 5053 5054 static void activate_bit_delay(struct r5conf *conf, 5055 struct list_head *temp_inactive_list) 5056 { 5057 /* device_lock is held */ 5058 struct list_head head; 5059 list_add(&head, &conf->bitmap_list); 5060 list_del_init(&conf->bitmap_list); 5061 while (!list_empty(&head)) { 5062 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5063 int hash; 5064 list_del_init(&sh->lru); 5065 atomic_inc(&sh->count); 5066 hash = sh->hash_lock_index; 5067 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5068 } 5069 } 5070 5071 static int raid5_congested(struct mddev *mddev, int bits) 5072 { 5073 struct r5conf *conf = mddev->private; 5074 5075 /* No difference between reads and writes. Just check 5076 * how busy the stripe_cache is 5077 */ 5078 5079 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 5080 return 1; 5081 5082 /* Also checks whether there is pressure on r5cache log space */ 5083 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 5084 return 1; 5085 if (conf->quiesce) 5086 return 1; 5087 if (atomic_read(&conf->empty_inactive_list_nr)) 5088 return 1; 5089 5090 return 0; 5091 } 5092 5093 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5094 { 5095 struct r5conf *conf = mddev->private; 5096 sector_t sector = bio->bi_iter.bi_sector; 5097 unsigned int chunk_sectors; 5098 unsigned int bio_sectors = bio_sectors(bio); 5099 5100 WARN_ON_ONCE(bio->bi_partno); 5101 5102 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5103 return chunk_sectors >= 5104 ((sector & (chunk_sectors - 1)) + bio_sectors); 5105 } 5106 5107 /* 5108 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5109 * later sampled by raid5d. 
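 * (entries are pulled back off with remove_bio_from_retry().)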
5110 */ 5111 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5112 { 5113 unsigned long flags; 5114 5115 spin_lock_irqsave(&conf->device_lock, flags); 5116 5117 bi->bi_next = conf->retry_read_aligned_list; 5118 conf->retry_read_aligned_list = bi; 5119 5120 spin_unlock_irqrestore(&conf->device_lock, flags); 5121 md_wakeup_thread(conf->mddev->thread); 5122 } 5123 5124 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5125 unsigned int *offset) 5126 { 5127 struct bio *bi; 5128 5129 bi = conf->retry_read_aligned; 5130 if (bi) { 5131 *offset = conf->retry_read_offset; 5132 conf->retry_read_aligned = NULL; 5133 return bi; 5134 } 5135 bi = conf->retry_read_aligned_list; 5136 if(bi) { 5137 conf->retry_read_aligned_list = bi->bi_next; 5138 bi->bi_next = NULL; 5139 *offset = 0; 5140 } 5141 5142 return bi; 5143 } 5144 5145 /* 5146 * The "raid5_align_endio" should check if the read succeeded and if it 5147 * did, call bio_endio on the original bio (having bio_put the new bio 5148 * first). 5149 * If the read failed.. 5150 */ 5151 static void raid5_align_endio(struct bio *bi) 5152 { 5153 struct bio* raid_bi = bi->bi_private; 5154 struct mddev *mddev; 5155 struct r5conf *conf; 5156 struct md_rdev *rdev; 5157 blk_status_t error = bi->bi_status; 5158 5159 bio_put(bi); 5160 5161 rdev = (void*)raid_bi->bi_next; 5162 raid_bi->bi_next = NULL; 5163 mddev = rdev->mddev; 5164 conf = mddev->private; 5165 5166 rdev_dec_pending(rdev, conf->mddev); 5167 5168 if (!error) { 5169 bio_endio(raid_bi); 5170 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5171 wake_up(&conf->wait_for_quiescent); 5172 return; 5173 } 5174 5175 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5176 5177 add_bio_to_retry(raid_bi, conf); 5178 } 5179 5180 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5181 { 5182 struct r5conf *conf = mddev->private; 5183 int dd_idx; 5184 struct bio* align_bi; 5185 struct md_rdev *rdev; 5186 sector_t end_sector; 5187 5188 if (!in_chunk_boundary(mddev, raid_bio)) { 5189 pr_debug("%s: non aligned\n", __func__); 5190 return 0; 5191 } 5192 /* 5193 * use bio_clone_fast to make a copy of the bio 5194 */ 5195 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); 5196 if (!align_bi) 5197 return 0; 5198 /* 5199 * set bi_end_io to a new function, and set bi_private to the 5200 * original bio. 
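 * bi_private is how raid5_align_endio() finds the original bio again; the target rdev is stashed in the original bio's bi_next just below.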
5201 */ 5202 align_bi->bi_end_io = raid5_align_endio; 5203 align_bi->bi_private = raid_bio; 5204 /* 5205 * compute position 5206 */ 5207 align_bi->bi_iter.bi_sector = 5208 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5209 0, &dd_idx, NULL); 5210 5211 end_sector = bio_end_sector(align_bi); 5212 rcu_read_lock(); 5213 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5214 if (!rdev || test_bit(Faulty, &rdev->flags) || 5215 rdev->recovery_offset < end_sector) { 5216 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5217 if (rdev && 5218 (test_bit(Faulty, &rdev->flags) || 5219 !(test_bit(In_sync, &rdev->flags) || 5220 rdev->recovery_offset >= end_sector))) 5221 rdev = NULL; 5222 } 5223 5224 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5225 rcu_read_unlock(); 5226 bio_put(align_bi); 5227 return 0; 5228 } 5229 5230 if (rdev) { 5231 sector_t first_bad; 5232 int bad_sectors; 5233 5234 atomic_inc(&rdev->nr_pending); 5235 rcu_read_unlock(); 5236 raid_bio->bi_next = (void*)rdev; 5237 bio_set_dev(align_bi, rdev->bdev); 5238 bio_clear_flag(align_bi, BIO_SEG_VALID); 5239 5240 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5241 bio_sectors(align_bi), 5242 &first_bad, &bad_sectors)) { 5243 bio_put(align_bi); 5244 rdev_dec_pending(rdev, mddev); 5245 return 0; 5246 } 5247 5248 /* No reshape active, so we can trust rdev->data_offset */ 5249 align_bi->bi_iter.bi_sector += rdev->data_offset; 5250 5251 spin_lock_irq(&conf->device_lock); 5252 wait_event_lock_irq(conf->wait_for_quiescent, 5253 conf->quiesce == 0, 5254 conf->device_lock); 5255 atomic_inc(&conf->active_aligned_reads); 5256 spin_unlock_irq(&conf->device_lock); 5257 5258 if (mddev->gendisk) 5259 trace_block_bio_remap(align_bi->bi_disk->queue, 5260 align_bi, disk_devt(mddev->gendisk), 5261 raid_bio->bi_iter.bi_sector); 5262 generic_make_request(align_bi); 5263 return 1; 5264 } else { 5265 rcu_read_unlock(); 5266 bio_put(align_bi); 5267 return 0; 5268 } 5269 } 5270 5271 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5272 { 5273 struct bio *split; 5274 sector_t sector = raid_bio->bi_iter.bi_sector; 5275 unsigned chunk_sects = mddev->chunk_sectors; 5276 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5277 5278 if (sectors < bio_sectors(raid_bio)) { 5279 struct r5conf *conf = mddev->private; 5280 split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); 5281 bio_chain(split, raid_bio); 5282 generic_make_request(raid_bio); 5283 raid_bio = split; 5284 } 5285 5286 if (!raid5_read_one_chunk(mddev, raid_bio)) 5287 return raid_bio; 5288 5289 return NULL; 5290 } 5291 5292 /* __get_priority_stripe - get the next stripe to process 5293 * 5294 * Full stripe writes are allowed to pass preread active stripes up until 5295 * the bypass_threshold is exceeded. In general the bypass_count 5296 * increments when the handle_list is handled before the hold_list; however, it 5297 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5298 * stripe with in flight i/o. The bypass_count will be reset when the 5299 * head of the hold_list has changed, i.e. the head was promoted to the 5300 * handle_list. 
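 * (Concretely: when the handle_list is empty, the hold_list head is only skipped while full stripe writes are pending and bypass_count has not yet exceeded bypass_threshold.)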
5301 */ 5302 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5303 { 5304 struct stripe_head *sh, *tmp; 5305 struct list_head *handle_list = NULL; 5306 struct r5worker_group *wg; 5307 bool second_try = !r5c_is_writeback(conf->log) && 5308 !r5l_log_disk_error(conf); 5309 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || 5310 r5l_log_disk_error(conf); 5311 5312 again: 5313 wg = NULL; 5314 sh = NULL; 5315 if (conf->worker_cnt_per_group == 0) { 5316 handle_list = try_loprio ? &conf->loprio_list : 5317 &conf->handle_list; 5318 } else if (group != ANY_GROUP) { 5319 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5320 &conf->worker_groups[group].handle_list; 5321 wg = &conf->worker_groups[group]; 5322 } else { 5323 int i; 5324 for (i = 0; i < conf->group_cnt; i++) { 5325 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5326 &conf->worker_groups[i].handle_list; 5327 wg = &conf->worker_groups[i]; 5328 if (!list_empty(handle_list)) 5329 break; 5330 } 5331 } 5332 5333 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5334 __func__, 5335 list_empty(handle_list) ? "empty" : "busy", 5336 list_empty(&conf->hold_list) ? "empty" : "busy", 5337 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5338 5339 if (!list_empty(handle_list)) { 5340 sh = list_entry(handle_list->next, typeof(*sh), lru); 5341 5342 if (list_empty(&conf->hold_list)) 5343 conf->bypass_count = 0; 5344 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5345 if (conf->hold_list.next == conf->last_hold) 5346 conf->bypass_count++; 5347 else { 5348 conf->last_hold = conf->hold_list.next; 5349 conf->bypass_count -= conf->bypass_threshold; 5350 if (conf->bypass_count < 0) 5351 conf->bypass_count = 0; 5352 } 5353 } 5354 } else if (!list_empty(&conf->hold_list) && 5355 ((conf->bypass_threshold && 5356 conf->bypass_count > conf->bypass_threshold) || 5357 atomic_read(&conf->pending_full_writes) == 0)) { 5358 5359 list_for_each_entry(tmp, &conf->hold_list, lru) { 5360 if (conf->worker_cnt_per_group == 0 || 5361 group == ANY_GROUP || 5362 !cpu_online(tmp->cpu) || 5363 cpu_to_group(tmp->cpu) == group) { 5364 sh = tmp; 5365 break; 5366 } 5367 } 5368 5369 if (sh) { 5370 conf->bypass_count -= conf->bypass_threshold; 5371 if (conf->bypass_count < 0) 5372 conf->bypass_count = 0; 5373 } 5374 wg = NULL; 5375 } 5376 5377 if (!sh) { 5378 if (second_try) 5379 return NULL; 5380 second_try = true; 5381 try_loprio = !try_loprio; 5382 goto again; 5383 } 5384 5385 if (wg) { 5386 wg->stripes_cnt--; 5387 sh->group = NULL; 5388 } 5389 list_del_init(&sh->lru); 5390 BUG_ON(atomic_inc_return(&sh->count) != 1); 5391 return sh; 5392 } 5393 5394 struct raid5_plug_cb { 5395 struct blk_plug_cb cb; 5396 struct list_head list; 5397 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5398 }; 5399 5400 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5401 { 5402 struct raid5_plug_cb *cb = container_of( 5403 blk_cb, struct raid5_plug_cb, cb); 5404 struct stripe_head *sh; 5405 struct mddev *mddev = cb->cb.data; 5406 struct r5conf *conf = mddev->private; 5407 int cnt = 0; 5408 int hash; 5409 5410 if (cb->list.next && !list_empty(&cb->list)) { 5411 spin_lock_irq(&conf->device_lock); 5412 while (!list_empty(&cb->list)) { 5413 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5414 list_del_init(&sh->lru); 5415 /* 5416 * avoid race release_stripe_plug() sees 5417 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5418 * is still in our 
list 5419 */ 5420 smp_mb__before_atomic(); 5421 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5422 /* 5423 * STRIPE_ON_RELEASE_LIST could be set here. In that 5424 * case, the count is always > 1 here 5425 */ 5426 hash = sh->hash_lock_index; 5427 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5428 cnt++; 5429 } 5430 spin_unlock_irq(&conf->device_lock); 5431 } 5432 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5433 NR_STRIPE_HASH_LOCKS); 5434 if (mddev->queue) 5435 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5436 kfree(cb); 5437 } 5438 5439 static void release_stripe_plug(struct mddev *mddev, 5440 struct stripe_head *sh) 5441 { 5442 struct blk_plug_cb *blk_cb = blk_check_plugged( 5443 raid5_unplug, mddev, 5444 sizeof(struct raid5_plug_cb)); 5445 struct raid5_plug_cb *cb; 5446 5447 if (!blk_cb) { 5448 raid5_release_stripe(sh); 5449 return; 5450 } 5451 5452 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5453 5454 if (cb->list.next == NULL) { 5455 int i; 5456 INIT_LIST_HEAD(&cb->list); 5457 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5458 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5459 } 5460 5461 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5462 list_add_tail(&sh->lru, &cb->list); 5463 else 5464 raid5_release_stripe(sh); 5465 } 5466 5467 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5468 { 5469 struct r5conf *conf = mddev->private; 5470 sector_t logical_sector, last_sector; 5471 struct stripe_head *sh; 5472 int stripe_sectors; 5473 5474 if (mddev->reshape_position != MaxSector) 5475 /* Skip discard while reshape is happening */ 5476 return; 5477 5478 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5479 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5480 5481 bi->bi_next = NULL; 5482 5483 stripe_sectors = conf->chunk_sectors * 5484 (conf->raid_disks - conf->max_degraded); 5485 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5486 stripe_sectors); 5487 sector_div(last_sector, stripe_sectors); 5488 5489 logical_sector *= conf->chunk_sectors; 5490 last_sector *= conf->chunk_sectors; 5491 5492 for (; logical_sector < last_sector; 5493 logical_sector += STRIPE_SECTORS) { 5494 DEFINE_WAIT(w); 5495 int d; 5496 again: 5497 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5498 prepare_to_wait(&conf->wait_for_overlap, &w, 5499 TASK_UNINTERRUPTIBLE); 5500 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5501 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5502 raid5_release_stripe(sh); 5503 schedule(); 5504 goto again; 5505 } 5506 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5507 spin_lock_irq(&sh->stripe_lock); 5508 for (d = 0; d < conf->raid_disks; d++) { 5509 if (d == sh->pd_idx || d == sh->qd_idx) 5510 continue; 5511 if (sh->dev[d].towrite || sh->dev[d].toread) { 5512 set_bit(R5_Overlap, &sh->dev[d].flags); 5513 spin_unlock_irq(&sh->stripe_lock); 5514 raid5_release_stripe(sh); 5515 schedule(); 5516 goto again; 5517 } 5518 } 5519 set_bit(STRIPE_DISCARD, &sh->state); 5520 finish_wait(&conf->wait_for_overlap, &w); 5521 sh->overwrite_disks = 0; 5522 for (d = 0; d < conf->raid_disks; d++) { 5523 if (d == sh->pd_idx || d == sh->qd_idx) 5524 continue; 5525 sh->dev[d].towrite = bi; 5526 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5527 bio_inc_remaining(bi); 5528 md_write_inc(mddev, bi); 5529 sh->overwrite_disks++; 5530 } 5531 spin_unlock_irq(&sh->stripe_lock); 5532 if (conf->mddev->bitmap) { 5533 for (d = 0; 5534 d < conf->raid_disks - conf->max_degraded; 5535 d++) 5536 
bitmap_startwrite(mddev->bitmap, 5537 sh->sector, 5538 STRIPE_SECTORS, 5539 0); 5540 sh->bm_seq = conf->seq_flush + 1; 5541 set_bit(STRIPE_BIT_DELAY, &sh->state); 5542 } 5543 5544 set_bit(STRIPE_HANDLE, &sh->state); 5545 clear_bit(STRIPE_DELAYED, &sh->state); 5546 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5547 atomic_inc(&conf->preread_active_stripes); 5548 release_stripe_plug(mddev, sh); 5549 } 5550 5551 bio_endio(bi); 5552 } 5553 5554 static bool raid5_make_request(struct mddev *mddev, struct bio * bi) 5555 { 5556 struct r5conf *conf = mddev->private; 5557 int dd_idx; 5558 sector_t new_sector; 5559 sector_t logical_sector, last_sector; 5560 struct stripe_head *sh; 5561 const int rw = bio_data_dir(bi); 5562 DEFINE_WAIT(w); 5563 bool do_prepare; 5564 bool do_flush = false; 5565 5566 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5567 int ret = log_handle_flush_request(conf, bi); 5568 5569 if (ret == 0) 5570 return true; 5571 if (ret == -ENODEV) { 5572 md_flush_request(mddev, bi); 5573 return true; 5574 } 5575 /* ret == -EAGAIN, fallback */ 5576 /* 5577 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5578 * we need to flush journal device 5579 */ 5580 do_flush = bi->bi_opf & REQ_PREFLUSH; 5581 } 5582 5583 if (!md_write_start(mddev, bi)) 5584 return false; 5585 /* 5586 * If array is degraded, better not do chunk aligned read because 5587 * later we might have to read it again in order to reconstruct 5588 * data on failed drives. 5589 */ 5590 if (rw == READ && mddev->degraded == 0 && 5591 mddev->reshape_position == MaxSector) { 5592 bi = chunk_aligned_read(mddev, bi); 5593 if (!bi) 5594 return true; 5595 } 5596 5597 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5598 make_discard_request(mddev, bi); 5599 md_write_end(mddev); 5600 return true; 5601 } 5602 5603 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5604 last_sector = bio_end_sector(bi); 5605 bi->bi_next = NULL; 5606 5607 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5608 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5609 int previous; 5610 int seq; 5611 5612 do_prepare = false; 5613 retry: 5614 seq = read_seqcount_begin(&conf->gen_lock); 5615 previous = 0; 5616 if (do_prepare) 5617 prepare_to_wait(&conf->wait_for_overlap, &w, 5618 TASK_UNINTERRUPTIBLE); 5619 if (unlikely(conf->reshape_progress != MaxSector)) { 5620 /* spinlock is needed as reshape_progress may be 5621 * 64bit on a 32bit platform, and so it might be 5622 * possible to see a half-updated value 5623 * Of course reshape_progress could change after 5624 * the lock is dropped, so once we get a reference 5625 * to the stripe that we think it is, we will have 5626 * to check again. 5627 */ 5628 spin_lock_irq(&conf->device_lock); 5629 if (mddev->reshape_backwards 5630 ? logical_sector < conf->reshape_progress 5631 : logical_sector >= conf->reshape_progress) { 5632 previous = 1; 5633 } else { 5634 if (mddev->reshape_backwards 5635 ? 
logical_sector < conf->reshape_safe 5636 : logical_sector >= conf->reshape_safe) { 5637 spin_unlock_irq(&conf->device_lock); 5638 schedule(); 5639 do_prepare = true; 5640 goto retry; 5641 } 5642 } 5643 spin_unlock_irq(&conf->device_lock); 5644 } 5645 5646 new_sector = raid5_compute_sector(conf, logical_sector, 5647 previous, 5648 &dd_idx, NULL); 5649 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5650 (unsigned long long)new_sector, 5651 (unsigned long long)logical_sector); 5652 5653 sh = raid5_get_active_stripe(conf, new_sector, previous, 5654 (bi->bi_opf & REQ_RAHEAD), 0); 5655 if (sh) { 5656 if (unlikely(previous)) { 5657 /* expansion might have moved on while waiting for a 5658 * stripe, so we must do the range check again. 5659 * Expansion could still move past after this 5660 * test, but as we are holding a reference to 5661 * 'sh', we know that if that happens, 5662 * STRIPE_EXPANDING will get set and the expansion 5663 * won't proceed until we finish with the stripe. 5664 */ 5665 int must_retry = 0; 5666 spin_lock_irq(&conf->device_lock); 5667 if (mddev->reshape_backwards 5668 ? logical_sector >= conf->reshape_progress 5669 : logical_sector < conf->reshape_progress) 5670 /* mismatch, need to try again */ 5671 must_retry = 1; 5672 spin_unlock_irq(&conf->device_lock); 5673 if (must_retry) { 5674 raid5_release_stripe(sh); 5675 schedule(); 5676 do_prepare = true; 5677 goto retry; 5678 } 5679 } 5680 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5681 /* Might have got the wrong stripe_head 5682 * by accident 5683 */ 5684 raid5_release_stripe(sh); 5685 goto retry; 5686 } 5687 5688 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5689 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5690 /* Stripe is busy expanding or 5691 * add failed due to overlap. Flush everything 5692 * and wait a while 5693 */ 5694 md_wakeup_thread(mddev->thread); 5695 raid5_release_stripe(sh); 5696 schedule(); 5697 do_prepare = true; 5698 goto retry; 5699 } 5700 if (do_flush) { 5701 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5702 /* we only need flush for one stripe */ 5703 do_flush = false; 5704 } 5705 5706 set_bit(STRIPE_HANDLE, &sh->state); 5707 clear_bit(STRIPE_DELAYED, &sh->state); 5708 if ((!sh->batch_head || sh == sh->batch_head) && 5709 (bi->bi_opf & REQ_SYNC) && 5710 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5711 atomic_inc(&conf->preread_active_stripes); 5712 release_stripe_plug(mddev, sh); 5713 } else { 5714 /* cannot get stripe for read-ahead, just give-up */ 5715 bi->bi_status = BLK_STS_IOERR; 5716 break; 5717 } 5718 } 5719 finish_wait(&conf->wait_for_overlap, &w); 5720 5721 if (rw == WRITE) 5722 md_write_end(mddev); 5723 bio_endio(bi); 5724 return true; 5725 } 5726 5727 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5728 5729 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5730 { 5731 /* reshaping is quite different to recovery/resync so it is 5732 * handled quite separately ... here. 5733 * 5734 * On each call to sync_request, we gather one chunk worth of 5735 * destination stripes and flag them as expanding. 5736 * Then we find all the source stripes and request reads. 5737 * As the reads complete, handle_stripe will copy the data 5738 * into the destination stripe and release that stripe. 
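 *
 * Illustrative example, added for clarity (the numbers are not from the
 * original source): when growing a 3-drive RAID5 (2 data disks) to
 * 4 drives (3 data disks) with a 512-sector chunk, each call below
 * grabs destination stripe_heads covering one 512-sector chunk of the
 * new layout, marks them STRIPE_EXPANDING, and then reads the source
 * stripes holding the same data in the old 2-data-disk layout so that
 * handle_stripe can copy it across.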
5739 */ 5740 struct r5conf *conf = mddev->private; 5741 struct stripe_head *sh; 5742 struct md_rdev *rdev; 5743 sector_t first_sector, last_sector; 5744 int raid_disks = conf->previous_raid_disks; 5745 int data_disks = raid_disks - conf->max_degraded; 5746 int new_data_disks = conf->raid_disks - conf->max_degraded; 5747 int i; 5748 int dd_idx; 5749 sector_t writepos, readpos, safepos; 5750 sector_t stripe_addr; 5751 int reshape_sectors; 5752 struct list_head stripes; 5753 sector_t retn; 5754 5755 if (sector_nr == 0) { 5756 /* If restarting in the middle, skip the initial sectors */ 5757 if (mddev->reshape_backwards && 5758 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5759 sector_nr = raid5_size(mddev, 0, 0) 5760 - conf->reshape_progress; 5761 } else if (mddev->reshape_backwards && 5762 conf->reshape_progress == MaxSector) { 5763 /* shouldn't happen, but just in case, finish up.*/ 5764 sector_nr = MaxSector; 5765 } else if (!mddev->reshape_backwards && 5766 conf->reshape_progress > 0) 5767 sector_nr = conf->reshape_progress; 5768 sector_div(sector_nr, new_data_disks); 5769 if (sector_nr) { 5770 mddev->curr_resync_completed = sector_nr; 5771 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5772 *skipped = 1; 5773 retn = sector_nr; 5774 goto finish; 5775 } 5776 } 5777 5778 /* We need to process a full chunk at a time. 5779 * If old and new chunk sizes differ, we need to process the 5780 * largest of these 5781 */ 5782 5783 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5784 5785 /* We update the metadata at least every 10 seconds, or when 5786 * the data about to be copied would over-write the source of 5787 * the data at the front of the range. i.e. one new_stripe 5788 * along from reshape_progress new_maps to after where 5789 * reshape_safe old_maps to 5790 */ 5791 writepos = conf->reshape_progress; 5792 sector_div(writepos, new_data_disks); 5793 readpos = conf->reshape_progress; 5794 sector_div(readpos, data_disks); 5795 safepos = conf->reshape_safe; 5796 sector_div(safepos, data_disks); 5797 if (mddev->reshape_backwards) { 5798 BUG_ON(writepos < reshape_sectors); 5799 writepos -= reshape_sectors; 5800 readpos += reshape_sectors; 5801 safepos += reshape_sectors; 5802 } else { 5803 writepos += reshape_sectors; 5804 /* readpos and safepos are worst-case calculations. 5805 * A negative number is overly pessimistic, and causes 5806 * obvious problems for unsigned storage. So clip to 0. 5807 */ 5808 readpos -= min_t(sector_t, reshape_sectors, readpos); 5809 safepos -= min_t(sector_t, reshape_sectors, safepos); 5810 } 5811 5812 /* Having calculated the 'writepos' possibly use it 5813 * to set 'stripe_addr' which is where we will write to. 5814 */ 5815 if (mddev->reshape_backwards) { 5816 BUG_ON(conf->reshape_progress == 0); 5817 stripe_addr = writepos; 5818 BUG_ON((mddev->dev_sectors & 5819 ~((sector_t)reshape_sectors - 1)) 5820 - reshape_sectors - stripe_addr 5821 != sector_nr); 5822 } else { 5823 BUG_ON(writepos != sector_nr + reshape_sectors); 5824 stripe_addr = sector_nr; 5825 } 5826 5827 /* 'writepos' is the most advanced device address we might write. 5828 * 'readpos' is the least advanced device address we might read. 5829 * 'safepos' is the least address recorded in the metadata as having 5830 * been reshaped. 5831 * If there is a min_offset_diff, these are adjusted either by 5832 * increasing the safepos/readpos if diff is negative, or 5833 * increasing writepos if diff is positive. 
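 * (Clarifying note, added: min_offset_diff is the per-device shift
 * between new_data_offset and data_offset, computed in raid5_run();
 * the adjustment below keeps readpos/safepos comparable with writepos
 * when the old and new layouts do not start at the same device offset.)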
5834 * If 'readpos' is then behind 'writepos', there is no way that we can 5835 * ensure safety in the face of a crash - that must be done by userspace 5836 * making a backup of the data. So in that case there is no particular 5837 * rush to update metadata. 5838 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5839 * update the metadata to advance 'safepos' to match 'readpos' so that 5840 * we can be safe in the event of a crash. 5841 * So we insist on updating metadata if safepos is behind writepos and 5842 * readpos is beyond writepos. 5843 * In any case, update the metadata every 10 seconds. 5844 * Maybe that number should be configurable, but I'm not sure it is 5845 * worth it.... maybe it could be a multiple of safemode_delay??? 5846 */ 5847 if (conf->min_offset_diff < 0) { 5848 safepos += -conf->min_offset_diff; 5849 readpos += -conf->min_offset_diff; 5850 } else 5851 writepos += conf->min_offset_diff; 5852 5853 if ((mddev->reshape_backwards 5854 ? (safepos > writepos && readpos < writepos) 5855 : (safepos < writepos && readpos > writepos)) || 5856 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5857 /* Cannot proceed until we've updated the superblock... */ 5858 wait_event(conf->wait_for_overlap, 5859 atomic_read(&conf->reshape_stripes)==0 5860 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5861 if (atomic_read(&conf->reshape_stripes) != 0) 5862 return 0; 5863 mddev->reshape_position = conf->reshape_progress; 5864 mddev->curr_resync_completed = sector_nr; 5865 if (!mddev->reshape_backwards) 5866 /* Can update recovery_offset */ 5867 rdev_for_each(rdev, mddev) 5868 if (rdev->raid_disk >= 0 && 5869 !test_bit(Journal, &rdev->flags) && 5870 !test_bit(In_sync, &rdev->flags) && 5871 rdev->recovery_offset < sector_nr) 5872 rdev->recovery_offset = sector_nr; 5873 5874 conf->reshape_checkpoint = jiffies; 5875 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5876 md_wakeup_thread(mddev->thread); 5877 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5878 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5879 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5880 return 0; 5881 spin_lock_irq(&conf->device_lock); 5882 conf->reshape_safe = mddev->reshape_position; 5883 spin_unlock_irq(&conf->device_lock); 5884 wake_up(&conf->wait_for_overlap); 5885 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5886 } 5887 5888 INIT_LIST_HEAD(&stripes); 5889 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5890 int j; 5891 int skipped_disk = 0; 5892 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5893 set_bit(STRIPE_EXPANDING, &sh->state); 5894 atomic_inc(&conf->reshape_stripes); 5895 /* If any of this stripe is beyond the end of the old 5896 * array, then we need to zero those blocks 5897 */ 5898 for (j=sh->disks; j--;) { 5899 sector_t s; 5900 if (j == sh->pd_idx) 5901 continue; 5902 if (conf->level == 6 && 5903 j == sh->qd_idx) 5904 continue; 5905 s = raid5_compute_blocknr(sh, j, 0); 5906 if (s < raid5_size(mddev, 0, 0)) { 5907 skipped_disk = 1; 5908 continue; 5909 } 5910 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5911 set_bit(R5_Expanded, &sh->dev[j].flags); 5912 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5913 } 5914 if (!skipped_disk) { 5915 set_bit(STRIPE_EXPAND_READY, &sh->state); 5916 set_bit(STRIPE_HANDLE, &sh->state); 5917 } 5918 list_add(&sh->lru, &stripes); 5919 } 5920 spin_lock_irq(&conf->device_lock); 5921 if (mddev->reshape_backwards) 5922 conf->reshape_progress -= reshape_sectors * new_data_disks; 5923 else 5924 conf->reshape_progress 
+= reshape_sectors * new_data_disks; 5925 spin_unlock_irq(&conf->device_lock); 5926 /* Ok, those stripe are ready. We can start scheduling 5927 * reads on the source stripes. 5928 * The source stripes are determined by mapping the first and last 5929 * block on the destination stripes. 5930 */ 5931 first_sector = 5932 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5933 1, &dd_idx, NULL); 5934 last_sector = 5935 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5936 * new_data_disks - 1), 5937 1, &dd_idx, NULL); 5938 if (last_sector >= mddev->dev_sectors) 5939 last_sector = mddev->dev_sectors - 1; 5940 while (first_sector <= last_sector) { 5941 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5942 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5943 set_bit(STRIPE_HANDLE, &sh->state); 5944 raid5_release_stripe(sh); 5945 first_sector += STRIPE_SECTORS; 5946 } 5947 /* Now that the sources are clearly marked, we can release 5948 * the destination stripes 5949 */ 5950 while (!list_empty(&stripes)) { 5951 sh = list_entry(stripes.next, struct stripe_head, lru); 5952 list_del_init(&sh->lru); 5953 raid5_release_stripe(sh); 5954 } 5955 /* If this takes us to the resync_max point where we have to pause, 5956 * then we need to write out the superblock. 5957 */ 5958 sector_nr += reshape_sectors; 5959 retn = reshape_sectors; 5960 finish: 5961 if (mddev->curr_resync_completed > mddev->resync_max || 5962 (sector_nr - mddev->curr_resync_completed) * 2 5963 >= mddev->resync_max - mddev->curr_resync_completed) { 5964 /* Cannot proceed until we've updated the superblock... */ 5965 wait_event(conf->wait_for_overlap, 5966 atomic_read(&conf->reshape_stripes) == 0 5967 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5968 if (atomic_read(&conf->reshape_stripes) != 0) 5969 goto ret; 5970 mddev->reshape_position = conf->reshape_progress; 5971 mddev->curr_resync_completed = sector_nr; 5972 if (!mddev->reshape_backwards) 5973 /* Can update recovery_offset */ 5974 rdev_for_each(rdev, mddev) 5975 if (rdev->raid_disk >= 0 && 5976 !test_bit(Journal, &rdev->flags) && 5977 !test_bit(In_sync, &rdev->flags) && 5978 rdev->recovery_offset < sector_nr) 5979 rdev->recovery_offset = sector_nr; 5980 conf->reshape_checkpoint = jiffies; 5981 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5982 md_wakeup_thread(mddev->thread); 5983 wait_event(mddev->sb_wait, 5984 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 5985 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5986 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5987 goto ret; 5988 spin_lock_irq(&conf->device_lock); 5989 conf->reshape_safe = mddev->reshape_position; 5990 spin_unlock_irq(&conf->device_lock); 5991 wake_up(&conf->wait_for_overlap); 5992 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5993 } 5994 ret: 5995 return retn; 5996 } 5997 5998 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 5999 int *skipped) 6000 { 6001 struct r5conf *conf = mddev->private; 6002 struct stripe_head *sh; 6003 sector_t max_sector = mddev->dev_sectors; 6004 sector_t sync_blocks; 6005 int still_degraded = 0; 6006 int i; 6007 6008 if (sector_nr >= max_sector) { 6009 /* just being told to finish up .. 
nothing much to do */

		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
			end_reshape(conf);
			return 0;
		}

		if (mddev->curr_resync < max_sector) /* aborted */
			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
					&sync_blocks, 1);
		else /* completed sync */
			conf->fullsync = 0;
		bitmap_close_sync(mddev->bitmap);

		return 0;
	}

	/* Allow raid5_quiesce to complete */
	wait_event(conf->wait_for_overlap, conf->quiesce != 2);

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		return reshape_request(mddev, sector_nr, skipped);

	/* No need to check resync_max as we never do more than one
	 * stripe, and as resync_max will always be on a chunk boundary,
	 * if the check in md_do_sync didn't fire, there is no chance
	 * of overstepping resync_max here
	 */

	/* if there are too many failed drives and we are trying
	 * to resync, then assert that we are finished, because there is
	 * nothing we can do.
	 */
	if (mddev->degraded >= conf->max_degraded &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		sector_t rv = mddev->dev_sectors - sector_nr;
		*skipped = 1;
		return rv;
	}
	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    !conf->fullsync &&
	    !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
	    sync_blocks >= STRIPE_SECTORS) {
		/* we can skip this block, and probably more */
		sync_blocks /= STRIPE_SECTORS;
		*skipped = 1;
		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
	}

	bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);

	sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
	if (sh == NULL) {
		sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
		/* make sure we don't swamp the stripe cache if someone else
		 * is trying to get access
		 */
		schedule_timeout_uninterruptible(1);
	}
	/* Need to check if array will still be degraded after recovery/resync
	 * Note in case of > 1 drive failures it's possible we're rebuilding
	 * one drive while leaving another faulty drive in array.
	 */
	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);

		if (rdev == NULL || test_bit(Faulty, &rdev->flags))
			still_degraded = 1;
	}
	rcu_read_unlock();

	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);

	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
	set_bit(STRIPE_HANDLE, &sh->state);

	raid5_release_stripe(sh);

	return STRIPE_SECTORS;
}

static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
			      unsigned int offset)
{
	/* We may not be able to submit a whole bio at once as there
	 * may not be enough stripe_heads available.
	 * We cannot pre-allocate enough stripe_heads as we may need
	 * more than exist in the cache (if we allow ever larger chunks).
	 * So we do one stripe head at a time and record in
	 * conf->retry_read_offset how many have been done.
	 *
	 * We *know* that this entire raid_bio is in one chunk, so
	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
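	 *
	 * Illustrative note, added (the numbers are made up): if a
	 * 256-sector aligned read stalls after three stripes because no
	 * stripe_head was free, 'offset' comes back as 3 on the retry and
	 * the loop below skips the first three STRIPE_SECTORS-sized pieces
	 * before carrying on.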
6103 */ 6104 struct stripe_head *sh; 6105 int dd_idx; 6106 sector_t sector, logical_sector, last_sector; 6107 int scnt = 0; 6108 int handled = 0; 6109 6110 logical_sector = raid_bio->bi_iter.bi_sector & 6111 ~((sector_t)STRIPE_SECTORS-1); 6112 sector = raid5_compute_sector(conf, logical_sector, 6113 0, &dd_idx, NULL); 6114 last_sector = bio_end_sector(raid_bio); 6115 6116 for (; logical_sector < last_sector; 6117 logical_sector += STRIPE_SECTORS, 6118 sector += STRIPE_SECTORS, 6119 scnt++) { 6120 6121 if (scnt < offset) 6122 /* already done this stripe */ 6123 continue; 6124 6125 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6126 6127 if (!sh) { 6128 /* failed to get a stripe - must wait */ 6129 conf->retry_read_aligned = raid_bio; 6130 conf->retry_read_offset = scnt; 6131 return handled; 6132 } 6133 6134 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6135 raid5_release_stripe(sh); 6136 conf->retry_read_aligned = raid_bio; 6137 conf->retry_read_offset = scnt; 6138 return handled; 6139 } 6140 6141 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6142 handle_stripe(sh); 6143 raid5_release_stripe(sh); 6144 handled++; 6145 } 6146 6147 bio_endio(raid_bio); 6148 6149 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6150 wake_up(&conf->wait_for_quiescent); 6151 return handled; 6152 } 6153 6154 static int handle_active_stripes(struct r5conf *conf, int group, 6155 struct r5worker *worker, 6156 struct list_head *temp_inactive_list) 6157 { 6158 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6159 int i, batch_size = 0, hash; 6160 bool release_inactive = false; 6161 6162 while (batch_size < MAX_STRIPE_BATCH && 6163 (sh = __get_priority_stripe(conf, group)) != NULL) 6164 batch[batch_size++] = sh; 6165 6166 if (batch_size == 0) { 6167 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6168 if (!list_empty(temp_inactive_list + i)) 6169 break; 6170 if (i == NR_STRIPE_HASH_LOCKS) { 6171 spin_unlock_irq(&conf->device_lock); 6172 log_flush_stripe_to_raid(conf); 6173 spin_lock_irq(&conf->device_lock); 6174 return batch_size; 6175 } 6176 release_inactive = true; 6177 } 6178 spin_unlock_irq(&conf->device_lock); 6179 6180 release_inactive_stripe_list(conf, temp_inactive_list, 6181 NR_STRIPE_HASH_LOCKS); 6182 6183 r5l_flush_stripe_to_raid(conf->log); 6184 if (release_inactive) { 6185 spin_lock_irq(&conf->device_lock); 6186 return 0; 6187 } 6188 6189 for (i = 0; i < batch_size; i++) 6190 handle_stripe(batch[i]); 6191 log_write_stripe_run(conf); 6192 6193 cond_resched(); 6194 6195 spin_lock_irq(&conf->device_lock); 6196 for (i = 0; i < batch_size; i++) { 6197 hash = batch[i]->hash_lock_index; 6198 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6199 } 6200 return batch_size; 6201 } 6202 6203 static void raid5_do_work(struct work_struct *work) 6204 { 6205 struct r5worker *worker = container_of(work, struct r5worker, work); 6206 struct r5worker_group *group = worker->group; 6207 struct r5conf *conf = group->conf; 6208 struct mddev *mddev = conf->mddev; 6209 int group_id = group - conf->worker_groups; 6210 int handled; 6211 struct blk_plug plug; 6212 6213 pr_debug("+++ raid5worker active\n"); 6214 6215 blk_start_plug(&plug); 6216 handled = 0; 6217 spin_lock_irq(&conf->device_lock); 6218 while (1) { 6219 int batch_size, released; 6220 6221 released = release_stripe_list(conf, worker->temp_inactive_list); 6222 6223 batch_size = handle_active_stripes(conf, group_id, worker, 6224 worker->temp_inactive_list); 6225 worker->working = false; 6226 if (!batch_size && !released) 6227 break; 6228 handled += 
batch_size; 6229 wait_event_lock_irq(mddev->sb_wait, 6230 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6231 conf->device_lock); 6232 } 6233 pr_debug("%d stripes handled\n", handled); 6234 6235 spin_unlock_irq(&conf->device_lock); 6236 6237 flush_deferred_bios(conf); 6238 6239 r5l_flush_stripe_to_raid(conf->log); 6240 6241 async_tx_issue_pending_all(); 6242 blk_finish_plug(&plug); 6243 6244 pr_debug("--- raid5worker inactive\n"); 6245 } 6246 6247 /* 6248 * This is our raid5 kernel thread. 6249 * 6250 * We scan the hash table for stripes which can be handled now. 6251 * During the scan, completed stripes are saved for us by the interrupt 6252 * handler, so that they will not have to wait for our next wakeup. 6253 */ 6254 static void raid5d(struct md_thread *thread) 6255 { 6256 struct mddev *mddev = thread->mddev; 6257 struct r5conf *conf = mddev->private; 6258 int handled; 6259 struct blk_plug plug; 6260 6261 pr_debug("+++ raid5d active\n"); 6262 6263 md_check_recovery(mddev); 6264 6265 blk_start_plug(&plug); 6266 handled = 0; 6267 spin_lock_irq(&conf->device_lock); 6268 while (1) { 6269 struct bio *bio; 6270 int batch_size, released; 6271 unsigned int offset; 6272 6273 released = release_stripe_list(conf, conf->temp_inactive_list); 6274 if (released) 6275 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6276 6277 if ( 6278 !list_empty(&conf->bitmap_list)) { 6279 /* Now is a good time to flush some bitmap updates */ 6280 conf->seq_flush++; 6281 spin_unlock_irq(&conf->device_lock); 6282 bitmap_unplug(mddev->bitmap); 6283 spin_lock_irq(&conf->device_lock); 6284 conf->seq_write = conf->seq_flush; 6285 activate_bit_delay(conf, conf->temp_inactive_list); 6286 } 6287 raid5_activate_delayed(conf); 6288 6289 while ((bio = remove_bio_from_retry(conf, &offset))) { 6290 int ok; 6291 spin_unlock_irq(&conf->device_lock); 6292 ok = retry_aligned_read(conf, bio, offset); 6293 spin_lock_irq(&conf->device_lock); 6294 if (!ok) 6295 break; 6296 handled++; 6297 } 6298 6299 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6300 conf->temp_inactive_list); 6301 if (!batch_size && !released) 6302 break; 6303 handled += batch_size; 6304 6305 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6306 spin_unlock_irq(&conf->device_lock); 6307 md_check_recovery(mddev); 6308 spin_lock_irq(&conf->device_lock); 6309 } 6310 } 6311 pr_debug("%d stripes handled\n", handled); 6312 6313 spin_unlock_irq(&conf->device_lock); 6314 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6315 mutex_trylock(&conf->cache_size_mutex)) { 6316 grow_one_stripe(conf, __GFP_NOWARN); 6317 /* Set flag even if allocation failed. 
This helps 6318 * slow down allocation requests when mem is short 6319 */ 6320 set_bit(R5_DID_ALLOC, &conf->cache_state); 6321 mutex_unlock(&conf->cache_size_mutex); 6322 } 6323 6324 flush_deferred_bios(conf); 6325 6326 r5l_flush_stripe_to_raid(conf->log); 6327 6328 async_tx_issue_pending_all(); 6329 blk_finish_plug(&plug); 6330 6331 pr_debug("--- raid5d inactive\n"); 6332 } 6333 6334 static ssize_t 6335 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6336 { 6337 struct r5conf *conf; 6338 int ret = 0; 6339 spin_lock(&mddev->lock); 6340 conf = mddev->private; 6341 if (conf) 6342 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6343 spin_unlock(&mddev->lock); 6344 return ret; 6345 } 6346 6347 int 6348 raid5_set_cache_size(struct mddev *mddev, int size) 6349 { 6350 struct r5conf *conf = mddev->private; 6351 6352 if (size <= 16 || size > 32768) 6353 return -EINVAL; 6354 6355 conf->min_nr_stripes = size; 6356 mutex_lock(&conf->cache_size_mutex); 6357 while (size < conf->max_nr_stripes && 6358 drop_one_stripe(conf)) 6359 ; 6360 mutex_unlock(&conf->cache_size_mutex); 6361 6362 md_allow_write(mddev); 6363 6364 mutex_lock(&conf->cache_size_mutex); 6365 while (size > conf->max_nr_stripes) 6366 if (!grow_one_stripe(conf, GFP_KERNEL)) 6367 break; 6368 mutex_unlock(&conf->cache_size_mutex); 6369 6370 return 0; 6371 } 6372 EXPORT_SYMBOL(raid5_set_cache_size); 6373 6374 static ssize_t 6375 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6376 { 6377 struct r5conf *conf; 6378 unsigned long new; 6379 int err; 6380 6381 if (len >= PAGE_SIZE) 6382 return -EINVAL; 6383 if (kstrtoul(page, 10, &new)) 6384 return -EINVAL; 6385 err = mddev_lock(mddev); 6386 if (err) 6387 return err; 6388 conf = mddev->private; 6389 if (!conf) 6390 err = -ENODEV; 6391 else 6392 err = raid5_set_cache_size(mddev, new); 6393 mddev_unlock(mddev); 6394 6395 return err ?: len; 6396 } 6397 6398 static struct md_sysfs_entry 6399 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6400 raid5_show_stripe_cache_size, 6401 raid5_store_stripe_cache_size); 6402 6403 static ssize_t 6404 raid5_show_rmw_level(struct mddev *mddev, char *page) 6405 { 6406 struct r5conf *conf = mddev->private; 6407 if (conf) 6408 return sprintf(page, "%d\n", conf->rmw_level); 6409 else 6410 return 0; 6411 } 6412 6413 static ssize_t 6414 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6415 { 6416 struct r5conf *conf = mddev->private; 6417 unsigned long new; 6418 6419 if (!conf) 6420 return -ENODEV; 6421 6422 if (len >= PAGE_SIZE) 6423 return -EINVAL; 6424 6425 if (kstrtoul(page, 10, &new)) 6426 return -EINVAL; 6427 6428 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6429 return -EINVAL; 6430 6431 if (new != PARITY_DISABLE_RMW && 6432 new != PARITY_ENABLE_RMW && 6433 new != PARITY_PREFER_RMW) 6434 return -EINVAL; 6435 6436 conf->rmw_level = new; 6437 return len; 6438 } 6439 6440 static struct md_sysfs_entry 6441 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6442 raid5_show_rmw_level, 6443 raid5_store_rmw_level); 6444 6445 6446 static ssize_t 6447 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6448 { 6449 struct r5conf *conf; 6450 int ret = 0; 6451 spin_lock(&mddev->lock); 6452 conf = mddev->private; 6453 if (conf) 6454 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6455 spin_unlock(&mddev->lock); 6456 return ret; 6457 } 6458 6459 static ssize_t 6460 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6461 { 
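	/*
	 * Descriptive note, added for clarity: the value stored here is
	 * conf->bypass_threshold, i.e. how many times a held stripe that
	 * needs pre-reading may be passed over by __get_priority_stripe()
	 * in favour of full-stripe writes before it is serviced anyway.
	 * It may not exceed conf->min_nr_stripes (checked below).
	 */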
6462 struct r5conf *conf; 6463 unsigned long new; 6464 int err; 6465 6466 if (len >= PAGE_SIZE) 6467 return -EINVAL; 6468 if (kstrtoul(page, 10, &new)) 6469 return -EINVAL; 6470 6471 err = mddev_lock(mddev); 6472 if (err) 6473 return err; 6474 conf = mddev->private; 6475 if (!conf) 6476 err = -ENODEV; 6477 else if (new > conf->min_nr_stripes) 6478 err = -EINVAL; 6479 else 6480 conf->bypass_threshold = new; 6481 mddev_unlock(mddev); 6482 return err ?: len; 6483 } 6484 6485 static struct md_sysfs_entry 6486 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6487 S_IRUGO | S_IWUSR, 6488 raid5_show_preread_threshold, 6489 raid5_store_preread_threshold); 6490 6491 static ssize_t 6492 raid5_show_skip_copy(struct mddev *mddev, char *page) 6493 { 6494 struct r5conf *conf; 6495 int ret = 0; 6496 spin_lock(&mddev->lock); 6497 conf = mddev->private; 6498 if (conf) 6499 ret = sprintf(page, "%d\n", conf->skip_copy); 6500 spin_unlock(&mddev->lock); 6501 return ret; 6502 } 6503 6504 static ssize_t 6505 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6506 { 6507 struct r5conf *conf; 6508 unsigned long new; 6509 int err; 6510 6511 if (len >= PAGE_SIZE) 6512 return -EINVAL; 6513 if (kstrtoul(page, 10, &new)) 6514 return -EINVAL; 6515 new = !!new; 6516 6517 err = mddev_lock(mddev); 6518 if (err) 6519 return err; 6520 conf = mddev->private; 6521 if (!conf) 6522 err = -ENODEV; 6523 else if (new != conf->skip_copy) { 6524 mddev_suspend(mddev); 6525 conf->skip_copy = new; 6526 if (new) 6527 mddev->queue->backing_dev_info->capabilities |= 6528 BDI_CAP_STABLE_WRITES; 6529 else 6530 mddev->queue->backing_dev_info->capabilities &= 6531 ~BDI_CAP_STABLE_WRITES; 6532 mddev_resume(mddev); 6533 } 6534 mddev_unlock(mddev); 6535 return err ?: len; 6536 } 6537 6538 static struct md_sysfs_entry 6539 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6540 raid5_show_skip_copy, 6541 raid5_store_skip_copy); 6542 6543 static ssize_t 6544 stripe_cache_active_show(struct mddev *mddev, char *page) 6545 { 6546 struct r5conf *conf = mddev->private; 6547 if (conf) 6548 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6549 else 6550 return 0; 6551 } 6552 6553 static struct md_sysfs_entry 6554 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6555 6556 static ssize_t 6557 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6558 { 6559 struct r5conf *conf; 6560 int ret = 0; 6561 spin_lock(&mddev->lock); 6562 conf = mddev->private; 6563 if (conf) 6564 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6565 spin_unlock(&mddev->lock); 6566 return ret; 6567 } 6568 6569 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6570 int *group_cnt, 6571 int *worker_cnt_per_group, 6572 struct r5worker_group **worker_groups); 6573 static ssize_t 6574 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6575 { 6576 struct r5conf *conf; 6577 unsigned int new; 6578 int err; 6579 struct r5worker_group *new_groups, *old_groups; 6580 int group_cnt, worker_cnt_per_group; 6581 6582 if (len >= PAGE_SIZE) 6583 return -EINVAL; 6584 if (kstrtouint(page, 10, &new)) 6585 return -EINVAL; 6586 /* 8192 should be big enough */ 6587 if (new > 8192) 6588 return -EINVAL; 6589 6590 err = mddev_lock(mddev); 6591 if (err) 6592 return err; 6593 conf = mddev->private; 6594 if (!conf) 6595 err = -ENODEV; 6596 else if (new != conf->worker_cnt_per_group) { 6597 mddev_suspend(mddev); 6598 6599 old_groups = conf->worker_groups; 6600 if (old_groups) 6601 
flush_workqueue(raid5_wq); 6602 6603 err = alloc_thread_groups(conf, new, 6604 &group_cnt, &worker_cnt_per_group, 6605 &new_groups); 6606 if (!err) { 6607 spin_lock_irq(&conf->device_lock); 6608 conf->group_cnt = group_cnt; 6609 conf->worker_cnt_per_group = worker_cnt_per_group; 6610 conf->worker_groups = new_groups; 6611 spin_unlock_irq(&conf->device_lock); 6612 6613 if (old_groups) 6614 kfree(old_groups[0].workers); 6615 kfree(old_groups); 6616 } 6617 mddev_resume(mddev); 6618 } 6619 mddev_unlock(mddev); 6620 6621 return err ?: len; 6622 } 6623 6624 static struct md_sysfs_entry 6625 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6626 raid5_show_group_thread_cnt, 6627 raid5_store_group_thread_cnt); 6628 6629 static struct attribute *raid5_attrs[] = { 6630 &raid5_stripecache_size.attr, 6631 &raid5_stripecache_active.attr, 6632 &raid5_preread_bypass_threshold.attr, 6633 &raid5_group_thread_cnt.attr, 6634 &raid5_skip_copy.attr, 6635 &raid5_rmw_level.attr, 6636 &r5c_journal_mode.attr, 6637 NULL, 6638 }; 6639 static struct attribute_group raid5_attrs_group = { 6640 .name = NULL, 6641 .attrs = raid5_attrs, 6642 }; 6643 6644 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6645 int *group_cnt, 6646 int *worker_cnt_per_group, 6647 struct r5worker_group **worker_groups) 6648 { 6649 int i, j, k; 6650 ssize_t size; 6651 struct r5worker *workers; 6652 6653 *worker_cnt_per_group = cnt; 6654 if (cnt == 0) { 6655 *group_cnt = 0; 6656 *worker_groups = NULL; 6657 return 0; 6658 } 6659 *group_cnt = num_possible_nodes(); 6660 size = sizeof(struct r5worker) * cnt; 6661 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6662 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6663 *group_cnt, GFP_NOIO); 6664 if (!*worker_groups || !workers) { 6665 kfree(workers); 6666 kfree(*worker_groups); 6667 return -ENOMEM; 6668 } 6669 6670 for (i = 0; i < *group_cnt; i++) { 6671 struct r5worker_group *group; 6672 6673 group = &(*worker_groups)[i]; 6674 INIT_LIST_HEAD(&group->handle_list); 6675 INIT_LIST_HEAD(&group->loprio_list); 6676 group->conf = conf; 6677 group->workers = workers + i * cnt; 6678 6679 for (j = 0; j < cnt; j++) { 6680 struct r5worker *worker = group->workers + j; 6681 worker->group = group; 6682 INIT_WORK(&worker->work, raid5_do_work); 6683 6684 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6685 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6686 } 6687 } 6688 6689 return 0; 6690 } 6691 6692 static void free_thread_groups(struct r5conf *conf) 6693 { 6694 if (conf->worker_groups) 6695 kfree(conf->worker_groups[0].workers); 6696 kfree(conf->worker_groups); 6697 conf->worker_groups = NULL; 6698 } 6699 6700 static sector_t 6701 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6702 { 6703 struct r5conf *conf = mddev->private; 6704 6705 if (!sectors) 6706 sectors = mddev->dev_sectors; 6707 if (!raid_disks) 6708 /* size is defined by the smallest of previous and new size */ 6709 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6710 6711 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6712 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6713 return sectors * (raid_disks - conf->max_degraded); 6714 } 6715 6716 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6717 { 6718 safe_put_page(percpu->spare_page); 6719 if (percpu->scribble) 6720 flex_array_free(percpu->scribble); 6721 percpu->spare_page = NULL; 6722 percpu->scribble = NULL; 6723 } 6724 6725 static int alloc_scratch_buffer(struct r5conf *conf, struct 
raid5_percpu *percpu) 6726 { 6727 if (conf->level == 6 && !percpu->spare_page) 6728 percpu->spare_page = alloc_page(GFP_KERNEL); 6729 if (!percpu->scribble) 6730 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6731 conf->previous_raid_disks), 6732 max(conf->chunk_sectors, 6733 conf->prev_chunk_sectors) 6734 / STRIPE_SECTORS, 6735 GFP_KERNEL); 6736 6737 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6738 free_scratch_buffer(conf, percpu); 6739 return -ENOMEM; 6740 } 6741 6742 return 0; 6743 } 6744 6745 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6746 { 6747 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6748 6749 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6750 return 0; 6751 } 6752 6753 static void raid5_free_percpu(struct r5conf *conf) 6754 { 6755 if (!conf->percpu) 6756 return; 6757 6758 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6759 free_percpu(conf->percpu); 6760 } 6761 6762 static void free_conf(struct r5conf *conf) 6763 { 6764 int i; 6765 6766 log_exit(conf); 6767 6768 unregister_shrinker(&conf->shrinker); 6769 free_thread_groups(conf); 6770 shrink_stripes(conf); 6771 raid5_free_percpu(conf); 6772 for (i = 0; i < conf->pool_size; i++) 6773 if (conf->disks[i].extra_page) 6774 put_page(conf->disks[i].extra_page); 6775 kfree(conf->disks); 6776 if (conf->bio_split) 6777 bioset_free(conf->bio_split); 6778 kfree(conf->stripe_hashtbl); 6779 kfree(conf->pending_data); 6780 kfree(conf); 6781 } 6782 6783 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6784 { 6785 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6786 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6787 6788 if (alloc_scratch_buffer(conf, percpu)) { 6789 pr_warn("%s: failed memory allocation for cpu%u\n", 6790 __func__, cpu); 6791 return -ENOMEM; 6792 } 6793 return 0; 6794 } 6795 6796 static int raid5_alloc_percpu(struct r5conf *conf) 6797 { 6798 int err = 0; 6799 6800 conf->percpu = alloc_percpu(struct raid5_percpu); 6801 if (!conf->percpu) 6802 return -ENOMEM; 6803 6804 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6805 if (!err) { 6806 conf->scribble_disks = max(conf->raid_disks, 6807 conf->previous_raid_disks); 6808 conf->scribble_sectors = max(conf->chunk_sectors, 6809 conf->prev_chunk_sectors); 6810 } 6811 return err; 6812 } 6813 6814 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6815 struct shrink_control *sc) 6816 { 6817 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6818 unsigned long ret = SHRINK_STOP; 6819 6820 if (mutex_trylock(&conf->cache_size_mutex)) { 6821 ret= 0; 6822 while (ret < sc->nr_to_scan && 6823 conf->max_nr_stripes > conf->min_nr_stripes) { 6824 if (drop_one_stripe(conf) == 0) { 6825 ret = SHRINK_STOP; 6826 break; 6827 } 6828 ret++; 6829 } 6830 mutex_unlock(&conf->cache_size_mutex); 6831 } 6832 return ret; 6833 } 6834 6835 static unsigned long raid5_cache_count(struct shrinker *shrink, 6836 struct shrink_control *sc) 6837 { 6838 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6839 6840 if (conf->max_nr_stripes < conf->min_nr_stripes) 6841 /* unlikely, but not impossible */ 6842 return 0; 6843 return conf->max_nr_stripes - conf->min_nr_stripes; 6844 } 6845 6846 static struct r5conf *setup_conf(struct mddev *mddev) 6847 { 6848 struct r5conf *conf; 6849 int raid_disk, memory, max_disks; 6850 struct md_rdev *rdev; 6851 struct disk_info *disk; 6852 
char pers_name[6]; 6853 int i; 6854 int group_cnt, worker_cnt_per_group; 6855 struct r5worker_group *new_group; 6856 6857 if (mddev->new_level != 5 6858 && mddev->new_level != 4 6859 && mddev->new_level != 6) { 6860 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6861 mdname(mddev), mddev->new_level); 6862 return ERR_PTR(-EIO); 6863 } 6864 if ((mddev->new_level == 5 6865 && !algorithm_valid_raid5(mddev->new_layout)) || 6866 (mddev->new_level == 6 6867 && !algorithm_valid_raid6(mddev->new_layout))) { 6868 pr_warn("md/raid:%s: layout %d not supported\n", 6869 mdname(mddev), mddev->new_layout); 6870 return ERR_PTR(-EIO); 6871 } 6872 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6873 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6874 mdname(mddev), mddev->raid_disks); 6875 return ERR_PTR(-EINVAL); 6876 } 6877 6878 if (!mddev->new_chunk_sectors || 6879 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6880 !is_power_of_2(mddev->new_chunk_sectors)) { 6881 pr_warn("md/raid:%s: invalid chunk size %d\n", 6882 mdname(mddev), mddev->new_chunk_sectors << 9); 6883 return ERR_PTR(-EINVAL); 6884 } 6885 6886 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6887 if (conf == NULL) 6888 goto abort; 6889 INIT_LIST_HEAD(&conf->free_list); 6890 INIT_LIST_HEAD(&conf->pending_list); 6891 conf->pending_data = kzalloc(sizeof(struct r5pending_data) * 6892 PENDING_IO_MAX, GFP_KERNEL); 6893 if (!conf->pending_data) 6894 goto abort; 6895 for (i = 0; i < PENDING_IO_MAX; i++) 6896 list_add(&conf->pending_data[i].sibling, &conf->free_list); 6897 /* Don't enable multi-threading by default*/ 6898 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6899 &new_group)) { 6900 conf->group_cnt = group_cnt; 6901 conf->worker_cnt_per_group = worker_cnt_per_group; 6902 conf->worker_groups = new_group; 6903 } else 6904 goto abort; 6905 spin_lock_init(&conf->device_lock); 6906 seqcount_init(&conf->gen_lock); 6907 mutex_init(&conf->cache_size_mutex); 6908 init_waitqueue_head(&conf->wait_for_quiescent); 6909 init_waitqueue_head(&conf->wait_for_stripe); 6910 init_waitqueue_head(&conf->wait_for_overlap); 6911 INIT_LIST_HEAD(&conf->handle_list); 6912 INIT_LIST_HEAD(&conf->loprio_list); 6913 INIT_LIST_HEAD(&conf->hold_list); 6914 INIT_LIST_HEAD(&conf->delayed_list); 6915 INIT_LIST_HEAD(&conf->bitmap_list); 6916 init_llist_head(&conf->released_stripes); 6917 atomic_set(&conf->active_stripes, 0); 6918 atomic_set(&conf->preread_active_stripes, 0); 6919 atomic_set(&conf->active_aligned_reads, 0); 6920 spin_lock_init(&conf->pending_bios_lock); 6921 conf->batch_bio_dispatch = true; 6922 rdev_for_each(rdev, mddev) { 6923 if (test_bit(Journal, &rdev->flags)) 6924 continue; 6925 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6926 conf->batch_bio_dispatch = false; 6927 break; 6928 } 6929 } 6930 6931 conf->bypass_threshold = BYPASS_THRESHOLD; 6932 conf->recovery_disabled = mddev->recovery_disabled - 1; 6933 6934 conf->raid_disks = mddev->raid_disks; 6935 if (mddev->reshape_position == MaxSector) 6936 conf->previous_raid_disks = mddev->raid_disks; 6937 else 6938 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6939 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6940 6941 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6942 GFP_KERNEL); 6943 6944 if (!conf->disks) 6945 goto abort; 6946 6947 for (i = 0; i < max_disks; i++) { 6948 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6949 if (!conf->disks[i].extra_page) 6950 goto abort; 6951 } 6952 
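	/*
	 * Note added for clarity: bio_split backs the bio splitting done in
	 * chunk_aligned_read(), where read bios that straddle a chunk
	 * boundary are split before being sent to the member devices.
	 */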
6953 conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0); 6954 if (!conf->bio_split) 6955 goto abort; 6956 conf->mddev = mddev; 6957 6958 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6959 goto abort; 6960 6961 /* We init hash_locks[0] separately to that it can be used 6962 * as the reference lock in the spin_lock_nest_lock() call 6963 * in lock_all_device_hash_locks_irq in order to convince 6964 * lockdep that we know what we are doing. 6965 */ 6966 spin_lock_init(conf->hash_locks); 6967 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6968 spin_lock_init(conf->hash_locks + i); 6969 6970 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6971 INIT_LIST_HEAD(conf->inactive_list + i); 6972 6973 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6974 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6975 6976 atomic_set(&conf->r5c_cached_full_stripes, 0); 6977 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6978 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6979 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6980 atomic_set(&conf->r5c_flushing_full_stripes, 0); 6981 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 6982 6983 conf->level = mddev->new_level; 6984 conf->chunk_sectors = mddev->new_chunk_sectors; 6985 if (raid5_alloc_percpu(conf) != 0) 6986 goto abort; 6987 6988 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6989 6990 rdev_for_each(rdev, mddev) { 6991 raid_disk = rdev->raid_disk; 6992 if (raid_disk >= max_disks 6993 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 6994 continue; 6995 disk = conf->disks + raid_disk; 6996 6997 if (test_bit(Replacement, &rdev->flags)) { 6998 if (disk->replacement) 6999 goto abort; 7000 disk->replacement = rdev; 7001 } else { 7002 if (disk->rdev) 7003 goto abort; 7004 disk->rdev = rdev; 7005 } 7006 7007 if (test_bit(In_sync, &rdev->flags)) { 7008 char b[BDEVNAME_SIZE]; 7009 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 7010 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 7011 } else if (rdev->saved_raid_disk != raid_disk) 7012 /* Cannot rely on bitmap to complete recovery */ 7013 conf->fullsync = 1; 7014 } 7015 7016 conf->level = mddev->new_level; 7017 if (conf->level == 6) { 7018 conf->max_degraded = 2; 7019 if (raid6_call.xor_syndrome) 7020 conf->rmw_level = PARITY_ENABLE_RMW; 7021 else 7022 conf->rmw_level = PARITY_DISABLE_RMW; 7023 } else { 7024 conf->max_degraded = 1; 7025 conf->rmw_level = PARITY_ENABLE_RMW; 7026 } 7027 conf->algorithm = mddev->new_layout; 7028 conf->reshape_progress = mddev->reshape_position; 7029 if (conf->reshape_progress != MaxSector) { 7030 conf->prev_chunk_sectors = mddev->chunk_sectors; 7031 conf->prev_algo = mddev->layout; 7032 } else { 7033 conf->prev_chunk_sectors = conf->chunk_sectors; 7034 conf->prev_algo = conf->algorithm; 7035 } 7036 7037 conf->min_nr_stripes = NR_STRIPES; 7038 if (mddev->reshape_position != MaxSector) { 7039 int stripes = max_t(int, 7040 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 7041 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 7042 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7043 if (conf->min_nr_stripes != NR_STRIPES) 7044 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7045 mdname(mddev), conf->min_nr_stripes); 7046 } 7047 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7048 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7049 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7050 if (grow_stripes(conf, conf->min_nr_stripes)) { 7051 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 
7052 mdname(mddev), memory); 7053 goto abort; 7054 } else 7055 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7056 /* 7057 * Losing a stripe head costs more than the time to refill it, 7058 * it reduces the queue depth and so can hurt throughput. 7059 * So set it rather large, scaled by number of devices. 7060 */ 7061 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7062 conf->shrinker.scan_objects = raid5_cache_scan; 7063 conf->shrinker.count_objects = raid5_cache_count; 7064 conf->shrinker.batch = 128; 7065 conf->shrinker.flags = 0; 7066 if (register_shrinker(&conf->shrinker)) { 7067 pr_warn("md/raid:%s: couldn't register shrinker.\n", 7068 mdname(mddev)); 7069 goto abort; 7070 } 7071 7072 sprintf(pers_name, "raid%d", mddev->new_level); 7073 conf->thread = md_register_thread(raid5d, mddev, pers_name); 7074 if (!conf->thread) { 7075 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7076 mdname(mddev)); 7077 goto abort; 7078 } 7079 7080 return conf; 7081 7082 abort: 7083 if (conf) { 7084 free_conf(conf); 7085 return ERR_PTR(-EIO); 7086 } else 7087 return ERR_PTR(-ENOMEM); 7088 } 7089 7090 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7091 { 7092 switch (algo) { 7093 case ALGORITHM_PARITY_0: 7094 if (raid_disk < max_degraded) 7095 return 1; 7096 break; 7097 case ALGORITHM_PARITY_N: 7098 if (raid_disk >= raid_disks - max_degraded) 7099 return 1; 7100 break; 7101 case ALGORITHM_PARITY_0_6: 7102 if (raid_disk == 0 || 7103 raid_disk == raid_disks - 1) 7104 return 1; 7105 break; 7106 case ALGORITHM_LEFT_ASYMMETRIC_6: 7107 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7108 case ALGORITHM_LEFT_SYMMETRIC_6: 7109 case ALGORITHM_RIGHT_SYMMETRIC_6: 7110 if (raid_disk == raid_disks - 1) 7111 return 1; 7112 } 7113 return 0; 7114 } 7115 7116 static int raid5_run(struct mddev *mddev) 7117 { 7118 struct r5conf *conf; 7119 int working_disks = 0; 7120 int dirty_parity_disks = 0; 7121 struct md_rdev *rdev; 7122 struct md_rdev *journal_dev = NULL; 7123 sector_t reshape_offset = 0; 7124 int i; 7125 long long min_offset_diff = 0; 7126 int first = 1; 7127 7128 if (mddev_init_writes_pending(mddev) < 0) 7129 return -ENOMEM; 7130 7131 if (mddev->recovery_cp != MaxSector) 7132 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7133 mdname(mddev)); 7134 7135 rdev_for_each(rdev, mddev) { 7136 long long diff; 7137 7138 if (test_bit(Journal, &rdev->flags)) { 7139 journal_dev = rdev; 7140 continue; 7141 } 7142 if (rdev->raid_disk < 0) 7143 continue; 7144 diff = (rdev->new_data_offset - rdev->data_offset); 7145 if (first) { 7146 min_offset_diff = diff; 7147 first = 0; 7148 } else if (mddev->reshape_backwards && 7149 diff < min_offset_diff) 7150 min_offset_diff = diff; 7151 else if (!mddev->reshape_backwards && 7152 diff > min_offset_diff) 7153 min_offset_diff = diff; 7154 } 7155 7156 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && 7157 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { 7158 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", 7159 mdname(mddev)); 7160 return -EINVAL; 7161 } 7162 7163 if (mddev->reshape_position != MaxSector) { 7164 /* Check that we can continue the reshape. 7165 * Difficulties arise if the stripe we would write to 7166 * next is at or after the stripe we would read from next. 
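 * (Added note: if that ordering were violated, the reshape could
 * overwrite data that has not yet been read and re-written in the
 * new layout, so the checks below refuse to continue in that case.)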
 * For a reshape that changes the number of devices, this
 * is only possible for a very short time, and mdadm makes
 * sure that time appears to have passed before assembling
 * the array. So we fail if that time hasn't passed.
 * For a reshape that keeps the number of devices the same
 * mdadm must be monitoring the reshape and keeping the
 * critical areas read-only and backed up. It will start
 * the array in read-only mode, so we check for that.
 */
	sector_t here_new, here_old;
	int old_disks;
	int max_degraded = (mddev->level == 6 ? 2 : 1);
	int chunk_sectors;
	int new_data_disks;

	if (journal_dev) {
		pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (mddev->new_level != mddev->level) {
		pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
			mdname(mddev));
		return -EINVAL;
	}
	old_disks = mddev->raid_disks - mddev->delta_disks;
	/* reshape_position must be on a new-stripe boundary, and one
	 * further up in new geometry must map after here in old
	 * geometry.
	 * If the chunk sizes are different, then as we perform reshape
	 * in units of the largest of the two, reshape_position needs to
	 * be a multiple of the largest chunk size times new data disks.
	 */
	here_new = mddev->reshape_position;
	chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
	new_data_disks = mddev->raid_disks - max_degraded;
	if (sector_div(here_new, chunk_sectors * new_data_disks)) {
		pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
			mdname(mddev));
		return -EINVAL;
	}
	reshape_offset = here_new * chunk_sectors;
	/* here_new is the stripe we will write to */
	here_old = mddev->reshape_position;
	sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
	/* here_old is the first stripe that we might need to read
	 * from */
	if (mddev->delta_disks == 0) {
		/* We cannot be sure it is safe to start an in-place
		 * reshape. It is only safe if user-space is monitoring
		 * and taking constant backups.
		 * mdadm always starts a situation like this in
		 * readonly mode so it can take control before
		 * allowing any writes. So just check for that.
		 */
		if (abs(min_offset_diff) >= mddev->chunk_sectors &&
		    abs(min_offset_diff) >= mddev->new_chunk_sectors)
			/* not really in-place - so OK */;
		else if (mddev->ro == 0) {
			pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
				mdname(mddev));
			return -EINVAL;
		}
	} else if (mddev->reshape_backwards
		   ?
(here_new * chunk_sectors + min_offset_diff <= 7233 here_old * chunk_sectors) 7234 : (here_new * chunk_sectors >= 7235 here_old * chunk_sectors + (-min_offset_diff))) { 7236 /* Reading from the same stripe as writing to - bad */ 7237 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7238 mdname(mddev)); 7239 return -EINVAL; 7240 } 7241 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7242 /* OK, we should be able to continue; */ 7243 } else { 7244 BUG_ON(mddev->level != mddev->new_level); 7245 BUG_ON(mddev->layout != mddev->new_layout); 7246 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7247 BUG_ON(mddev->delta_disks != 0); 7248 } 7249 7250 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7251 test_bit(MD_HAS_PPL, &mddev->flags)) { 7252 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7253 mdname(mddev)); 7254 clear_bit(MD_HAS_PPL, &mddev->flags); 7255 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); 7256 } 7257 7258 if (mddev->private == NULL) 7259 conf = setup_conf(mddev); 7260 else 7261 conf = mddev->private; 7262 7263 if (IS_ERR(conf)) 7264 return PTR_ERR(conf); 7265 7266 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7267 if (!journal_dev) { 7268 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7269 mdname(mddev)); 7270 mddev->ro = 1; 7271 set_disk_ro(mddev->gendisk, 1); 7272 } else if (mddev->recovery_cp == MaxSector) 7273 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7274 } 7275 7276 conf->min_offset_diff = min_offset_diff; 7277 mddev->thread = conf->thread; 7278 conf->thread = NULL; 7279 mddev->private = conf; 7280 7281 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7282 i++) { 7283 rdev = conf->disks[i].rdev; 7284 if (!rdev && conf->disks[i].replacement) { 7285 /* The replacement is all we have yet */ 7286 rdev = conf->disks[i].replacement; 7287 conf->disks[i].replacement = NULL; 7288 clear_bit(Replacement, &rdev->flags); 7289 conf->disks[i].rdev = rdev; 7290 } 7291 if (!rdev) 7292 continue; 7293 if (conf->disks[i].replacement && 7294 conf->reshape_progress != MaxSector) { 7295 /* replacements and reshape simply do not mix. */ 7296 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7297 goto abort; 7298 } 7299 if (test_bit(In_sync, &rdev->flags)) { 7300 working_disks++; 7301 continue; 7302 } 7303 /* This disc is not fully in-sync. However if it 7304 * just stored parity (beyond the recovery_offset), 7305 * when we don't need to be concerned about the 7306 * array being dirty. 7307 * When reshape goes 'backwards', we never have 7308 * partially completed devices, so we only need 7309 * to worry about reshape going forwards. 7310 */ 7311 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7312 if (mddev->major_version == 0 && 7313 mddev->minor_version > 90) 7314 rdev->recovery_offset = reshape_offset; 7315 7316 if (rdev->recovery_offset < reshape_offset) { 7317 /* We need to check old and new layout */ 7318 if (!only_parity(rdev->raid_disk, 7319 conf->algorithm, 7320 conf->raid_disks, 7321 conf->max_degraded)) 7322 continue; 7323 } 7324 if (!only_parity(rdev->raid_disk, 7325 conf->prev_algo, 7326 conf->previous_raid_disks, 7327 conf->max_degraded)) 7328 continue; 7329 dirty_parity_disks++; 7330 } 7331 7332 /* 7333 * 0 for a fully functional array, 1 or 2 for a degraded array. 
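 * (Note added: 2 is only possible for RAID6, where max_degraded is 2;
 * RAID4/5 have max_degraded of 1.)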
7334 */ 7335 mddev->degraded = raid5_calc_degraded(conf); 7336 7337 if (has_failed(conf)) { 7338 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7339 mdname(mddev), mddev->degraded, conf->raid_disks); 7340 goto abort; 7341 } 7342 7343 /* device size must be a multiple of chunk size */ 7344 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7345 mddev->resync_max_sectors = mddev->dev_sectors; 7346 7347 if (mddev->degraded > dirty_parity_disks && 7348 mddev->recovery_cp != MaxSector) { 7349 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7350 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7351 mdname(mddev)); 7352 else if (mddev->ok_start_degraded) 7353 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7354 mdname(mddev)); 7355 else { 7356 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7357 mdname(mddev)); 7358 goto abort; 7359 } 7360 } 7361 7362 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7363 mdname(mddev), conf->level, 7364 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7365 mddev->new_layout); 7366 7367 print_raid5_conf(conf); 7368 7369 if (conf->reshape_progress != MaxSector) { 7370 conf->reshape_safe = conf->reshape_progress; 7371 atomic_set(&conf->reshape_stripes, 0); 7372 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7373 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7374 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7375 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7376 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7377 "reshape"); 7378 } 7379 7380 /* Ok, everything is just fine now */ 7381 if (mddev->to_remove == &raid5_attrs_group) 7382 mddev->to_remove = NULL; 7383 else if (mddev->kobj.sd && 7384 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7385 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7386 mdname(mddev)); 7387 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7388 7389 if (mddev->queue) { 7390 int chunk_size; 7391 /* read-ahead size must cover two whole stripes, which 7392 * is 2 * (datadisks) * chunksize where 'n' is the 7393 * number of raid devices 7394 */ 7395 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7396 int stripe = data_disks * 7397 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7398 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7399 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7400 7401 chunk_size = mddev->chunk_sectors << 9; 7402 blk_queue_io_min(mddev->queue, chunk_size); 7403 blk_queue_io_opt(mddev->queue, chunk_size * 7404 (conf->raid_disks - conf->max_degraded)); 7405 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7406 /* 7407 * We can only discard a whole stripe. 
It doesn't make sense to 7408 * discard data disk but write parity disk 7409 */ 7410 stripe = stripe * PAGE_SIZE; 7411 /* Round up to power of 2, as discard handling 7412 * currently assumes that */ 7413 while ((stripe-1) & stripe) 7414 stripe = (stripe | (stripe-1)) + 1; 7415 mddev->queue->limits.discard_alignment = stripe; 7416 mddev->queue->limits.discard_granularity = stripe; 7417 7418 blk_queue_max_write_same_sectors(mddev->queue, 0); 7419 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 7420 7421 rdev_for_each(rdev, mddev) { 7422 disk_stack_limits(mddev->gendisk, rdev->bdev, 7423 rdev->data_offset << 9); 7424 disk_stack_limits(mddev->gendisk, rdev->bdev, 7425 rdev->new_data_offset << 9); 7426 } 7427 7428 /* 7429 * zeroing is required, otherwise data 7430 * could be lost. Consider a scenario: discard a stripe 7431 * (the stripe could be inconsistent if 7432 * discard_zeroes_data is 0); write one disk of the 7433 * stripe (the stripe could be inconsistent again 7434 * depending on which disks are used to calculate 7435 * parity); the disk is broken; The stripe data of this 7436 * disk is lost. 7437 * 7438 * We only allow DISCARD if the sysadmin has confirmed that 7439 * only safe devices are in use by setting a module parameter. 7440 * A better idea might be to turn DISCARD into WRITE_ZEROES 7441 * requests, as that is required to be safe. 7442 */ 7443 if (devices_handle_discard_safely && 7444 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7445 mddev->queue->limits.discard_granularity >= stripe) 7446 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7447 mddev->queue); 7448 else 7449 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7450 mddev->queue); 7451 7452 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7453 } 7454 7455 if (log_init(conf, journal_dev, raid5_has_ppl(conf))) 7456 goto abort; 7457 7458 return 0; 7459 abort: 7460 md_unregister_thread(&mddev->thread); 7461 print_raid5_conf(conf); 7462 free_conf(conf); 7463 mddev->private = NULL; 7464 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7465 return -EIO; 7466 } 7467 7468 static void raid5_free(struct mddev *mddev, void *priv) 7469 { 7470 struct r5conf *conf = priv; 7471 7472 free_conf(conf); 7473 mddev->to_remove = &raid5_attrs_group; 7474 } 7475 7476 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7477 { 7478 struct r5conf *conf = mddev->private; 7479 int i; 7480 7481 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7482 conf->chunk_sectors / 2, mddev->layout); 7483 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7484 rcu_read_lock(); 7485 for (i = 0; i < conf->raid_disks; i++) { 7486 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7487 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); 7488 } 7489 rcu_read_unlock(); 7490 seq_printf (seq, "]"); 7491 } 7492 7493 static void print_raid5_conf (struct r5conf *conf) 7494 { 7495 int i; 7496 struct disk_info *tmp; 7497 7498 pr_debug("RAID conf printout:\n"); 7499 if (!conf) { 7500 pr_debug("(conf==NULL)\n"); 7501 return; 7502 } 7503 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7504 conf->raid_disks, 7505 conf->raid_disks - conf->mddev->degraded); 7506 7507 for (i = 0; i < conf->raid_disks; i++) { 7508 char b[BDEVNAME_SIZE]; 7509 tmp = conf->disks + i; 7510 if (tmp->rdev) 7511 pr_debug(" disk %d, o:%d, dev:%s\n", 7512 i, !test_bit(Faulty, &tmp->rdev->flags), 7513 bdevname(tmp->rdev->bdev, b)); 7514 } 7515 } 7516 7517 static int raid5_spare_active(struct mddev *mddev) 7518 { 7519 int i; 7520 struct r5conf *conf = mddev->private; 7521 struct disk_info *tmp; 7522 int count = 0; 7523 unsigned long flags; 7524 7525 for (i = 0; i < conf->raid_disks; i++) { 7526 tmp = conf->disks + i; 7527 if (tmp->replacement 7528 && tmp->replacement->recovery_offset == MaxSector 7529 && !test_bit(Faulty, &tmp->replacement->flags) 7530 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7531 /* Replacement has just become active. */ 7532 if (!tmp->rdev 7533 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7534 count++; 7535 if (tmp->rdev) { 7536 /* Replaced device not technically faulty, 7537 * but we need to be sure it gets removed 7538 * and never re-added. 7539 */ 7540 set_bit(Faulty, &tmp->rdev->flags); 7541 sysfs_notify_dirent_safe( 7542 tmp->rdev->sysfs_state); 7543 } 7544 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7545 } else if (tmp->rdev 7546 && tmp->rdev->recovery_offset == MaxSector 7547 && !test_bit(Faulty, &tmp->rdev->flags) 7548 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7549 count++; 7550 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7551 } 7552 } 7553 spin_lock_irqsave(&conf->device_lock, flags); 7554 mddev->degraded = raid5_calc_degraded(conf); 7555 spin_unlock_irqrestore(&conf->device_lock, flags); 7556 print_raid5_conf(conf); 7557 return count; 7558 } 7559 7560 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7561 { 7562 struct r5conf *conf = mddev->private; 7563 int err = 0; 7564 int number = rdev->raid_disk; 7565 struct md_rdev **rdevp; 7566 struct disk_info *p = conf->disks + number; 7567 7568 print_raid5_conf(conf); 7569 if (test_bit(Journal, &rdev->flags) && conf->log) { 7570 /* 7571 * we can't wait pending write here, as this is called in 7572 * raid5d, wait will deadlock. 7573 * neilb: there is no locking about new writes here, 7574 * so this cannot be safe. 7575 */ 7576 if (atomic_read(&conf->active_stripes) || 7577 atomic_read(&conf->r5c_cached_full_stripes) || 7578 atomic_read(&conf->r5c_cached_partial_stripes)) { 7579 return -EBUSY; 7580 } 7581 log_exit(conf); 7582 return 0; 7583 } 7584 if (rdev == p->rdev) 7585 rdevp = &p->rdev; 7586 else if (rdev == p->replacement) 7587 rdevp = &p->replacement; 7588 else 7589 return 0; 7590 7591 if (number >= conf->raid_disks && 7592 conf->reshape_progress == MaxSector) 7593 clear_bit(In_sync, &rdev->flags); 7594 7595 if (test_bit(In_sync, &rdev->flags) || 7596 atomic_read(&rdev->nr_pending)) { 7597 err = -EBUSY; 7598 goto abort; 7599 } 7600 /* Only remove non-faulty devices if recovery 7601 * isn't possible. 
7602 */ 7603 if (!test_bit(Faulty, &rdev->flags) && 7604 mddev->recovery_disabled != conf->recovery_disabled && 7605 !has_failed(conf) && 7606 (!p->replacement || p->replacement == rdev) && 7607 number < conf->raid_disks) { 7608 err = -EBUSY; 7609 goto abort; 7610 } 7611 *rdevp = NULL; 7612 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7613 synchronize_rcu(); 7614 if (atomic_read(&rdev->nr_pending)) { 7615 /* lost the race, try later */ 7616 err = -EBUSY; 7617 *rdevp = rdev; 7618 } 7619 } 7620 if (!err) { 7621 err = log_modify(conf, rdev, false); 7622 if (err) 7623 goto abort; 7624 } 7625 if (p->replacement) { 7626 /* We must have just cleared 'rdev' */ 7627 p->rdev = p->replacement; 7628 clear_bit(Replacement, &p->replacement->flags); 7629 smp_mb(); /* Make sure other CPUs may see both as identical 7630 * but will never see neither - if they are careful 7631 */ 7632 p->replacement = NULL; 7633 7634 if (!err) 7635 err = log_modify(conf, p->rdev, true); 7636 } 7637 7638 clear_bit(WantReplacement, &rdev->flags); 7639 abort: 7640 7641 print_raid5_conf(conf); 7642 return err; 7643 } 7644 7645 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7646 { 7647 struct r5conf *conf = mddev->private; 7648 int err = -EEXIST; 7649 int disk; 7650 struct disk_info *p; 7651 int first = 0; 7652 int last = conf->raid_disks - 1; 7653 7654 if (test_bit(Journal, &rdev->flags)) { 7655 if (conf->log) 7656 return -EBUSY; 7657 7658 rdev->raid_disk = 0; 7659 /* 7660 * The array is in readonly mode if journal is missing, so no 7661 * write requests running. We should be safe 7662 */ 7663 log_init(conf, rdev, false); 7664 return 0; 7665 } 7666 if (mddev->recovery_disabled == conf->recovery_disabled) 7667 return -EBUSY; 7668 7669 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7670 /* no point adding a device */ 7671 return -EINVAL; 7672 7673 if (rdev->raid_disk >= 0) 7674 first = last = rdev->raid_disk; 7675 7676 /* 7677 * find the disk ... but prefer rdev->saved_raid_disk 7678 * if possible. 7679 */ 7680 if (rdev->saved_raid_disk >= 0 && 7681 rdev->saved_raid_disk >= first && 7682 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7683 first = rdev->saved_raid_disk; 7684 7685 for (disk = first; disk <= last; disk++) { 7686 p = conf->disks + disk; 7687 if (p->rdev == NULL) { 7688 clear_bit(In_sync, &rdev->flags); 7689 rdev->raid_disk = disk; 7690 if (rdev->saved_raid_disk != disk) 7691 conf->fullsync = 1; 7692 rcu_assign_pointer(p->rdev, rdev); 7693 7694 err = log_modify(conf, rdev, true); 7695 7696 goto out; 7697 } 7698 } 7699 for (disk = first; disk <= last; disk++) { 7700 p = conf->disks + disk; 7701 if (test_bit(WantReplacement, &p->rdev->flags) && 7702 p->replacement == NULL) { 7703 clear_bit(In_sync, &rdev->flags); 7704 set_bit(Replacement, &rdev->flags); 7705 rdev->raid_disk = disk; 7706 err = 0; 7707 conf->fullsync = 1; 7708 rcu_assign_pointer(p->replacement, rdev); 7709 break; 7710 } 7711 } 7712 out: 7713 print_raid5_conf(conf); 7714 return err; 7715 } 7716 7717 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7718 { 7719 /* no resync is happening, and there is enough space 7720 * on all devices, so we can resize. 7721 * We need to make sure resync covers any new space. 7722 * If the array is shrinking we should possibly wait until 7723 * any io in the removed space completes, but it hardly seems 7724 * worth it. 
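 * A worked example with made-up numbers: on a 6-device RAID-6 with 512K
 * chunks (1024 sectors), 'sectors' is first rounded down to a multiple of
 * 1024 and the exported capacity becomes sectors * (6 - 2), i.e. four
 * data disks' worth of the new per-device size.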
7725 */ 7726 sector_t newsize; 7727 struct r5conf *conf = mddev->private; 7728 7729 if (conf->log || raid5_has_ppl(conf)) 7730 return -EINVAL; 7731 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7732 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7733 if (mddev->external_size && 7734 mddev->array_sectors > newsize) 7735 return -EINVAL; 7736 if (mddev->bitmap) { 7737 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 7738 if (ret) 7739 return ret; 7740 } 7741 md_set_array_sectors(mddev, newsize); 7742 if (sectors > mddev->dev_sectors && 7743 mddev->recovery_cp > mddev->dev_sectors) { 7744 mddev->recovery_cp = mddev->dev_sectors; 7745 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7746 } 7747 mddev->dev_sectors = sectors; 7748 mddev->resync_max_sectors = sectors; 7749 return 0; 7750 } 7751 7752 static int check_stripe_cache(struct mddev *mddev) 7753 { 7754 /* Can only proceed if there are plenty of stripe_heads. 7755 * We need a minimum of one full stripe, and for sensible progress 7756 * it is best to have about 4 times that. 7757 * If we require 4 times, then the default 256 4K stripe_heads will 7758 * allow for chunk sizes up to 256K, which is probably OK. 7759 * If the chunk size is greater, user-space should request more 7760 * stripe_heads first. 7761 */ 7762 struct r5conf *conf = mddev->private; 7763 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7764 > conf->min_nr_stripes || 7765 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7766 > conf->min_nr_stripes) { 7767 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7768 mdname(mddev), 7769 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7770 / STRIPE_SIZE)*4); 7771 return 0; 7772 } 7773 return 1; 7774 } 7775 7776 static int check_reshape(struct mddev *mddev) 7777 { 7778 struct r5conf *conf = mddev->private; 7779 7780 if (conf->log || raid5_has_ppl(conf)) 7781 return -EINVAL; 7782 if (mddev->delta_disks == 0 && 7783 mddev->new_layout == mddev->layout && 7784 mddev->new_chunk_sectors == mddev->chunk_sectors) 7785 return 0; /* nothing to do */ 7786 if (has_failed(conf)) 7787 return -EINVAL; 7788 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7789 /* We might be able to shrink, but the devices must 7790 * be made bigger first. 7791 * For raid6, 4 is the minimum number of devices.
7792 * Otherwise 2 is the minimum 7793 */ 7794 int min = 2; 7795 if (mddev->level == 6) 7796 min = 4; 7797 if (mddev->raid_disks + mddev->delta_disks < min) 7798 return -EINVAL; 7799 } 7800 7801 if (!check_stripe_cache(mddev)) 7802 return -ENOSPC; 7803 7804 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7805 mddev->delta_disks > 0) 7806 if (resize_chunks(conf, 7807 conf->previous_raid_disks 7808 + max(0, mddev->delta_disks), 7809 max(mddev->new_chunk_sectors, 7810 mddev->chunk_sectors) 7811 ) < 0) 7812 return -ENOMEM; 7813 7814 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) 7815 return 0; /* never bother to shrink */ 7816 return resize_stripes(conf, (conf->previous_raid_disks 7817 + mddev->delta_disks)); 7818 } 7819 7820 static int raid5_start_reshape(struct mddev *mddev) 7821 { 7822 struct r5conf *conf = mddev->private; 7823 struct md_rdev *rdev; 7824 int spares = 0; 7825 unsigned long flags; 7826 7827 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7828 return -EBUSY; 7829 7830 if (!check_stripe_cache(mddev)) 7831 return -ENOSPC; 7832 7833 if (has_failed(conf)) 7834 return -EINVAL; 7835 7836 rdev_for_each(rdev, mddev) { 7837 if (!test_bit(In_sync, &rdev->flags) 7838 && !test_bit(Faulty, &rdev->flags)) 7839 spares++; 7840 } 7841 7842 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7843 /* Not enough devices even to make a degraded array 7844 * of that size 7845 */ 7846 return -EINVAL; 7847 7848 /* Refuse to reduce size of the array. Any reductions in 7849 * array size must be through explicit setting of array_size 7850 * attribute. 7851 */ 7852 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7853 < mddev->array_sectors) { 7854 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 7855 mdname(mddev)); 7856 return -EINVAL; 7857 } 7858 7859 atomic_set(&conf->reshape_stripes, 0); 7860 spin_lock_irq(&conf->device_lock); 7861 write_seqcount_begin(&conf->gen_lock); 7862 conf->previous_raid_disks = conf->raid_disks; 7863 conf->raid_disks += mddev->delta_disks; 7864 conf->prev_chunk_sectors = conf->chunk_sectors; 7865 conf->chunk_sectors = mddev->new_chunk_sectors; 7866 conf->prev_algo = conf->algorithm; 7867 conf->algorithm = mddev->new_layout; 7868 conf->generation++; 7869 /* Code that selects data_offset needs to see the generation update 7870 * if reshape_progress has been set - so a memory barrier needed. 7871 */ 7872 smp_mb(); 7873 if (mddev->reshape_backwards) 7874 conf->reshape_progress = raid5_size(mddev, 0, 0); 7875 else 7876 conf->reshape_progress = 0; 7877 conf->reshape_safe = conf->reshape_progress; 7878 write_seqcount_end(&conf->gen_lock); 7879 spin_unlock_irq(&conf->device_lock); 7880 7881 /* Now make sure any requests that proceeded on the assumption 7882 * the reshape wasn't running - like Discard or Read - have 7883 * completed. 7884 */ 7885 mddev_suspend(mddev); 7886 mddev_resume(mddev); 7887 7888 /* Add some new drives, as many as will fit. 7889 * We know there are enough to make the newly sized array work. 7890 * Don't add devices if we are reducing the number of 7891 * devices in the array. This is because it is not possible 7892 * to correctly record the "partially reconstructed" state of 7893 * such devices during the reshape and confusion could result. 
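 * For the additions below: a drive that lands in a slot beyond the old
 * geometry will be written entirely by the reshape itself, so it can be
 * flagged In_sync straight away; a drive filling a pre-existing slot
 * still needs a full recovery, so its recovery_offset is reset to 0.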
7894 */ 7895 if (mddev->delta_disks >= 0) { 7896 rdev_for_each(rdev, mddev) 7897 if (rdev->raid_disk < 0 && 7898 !test_bit(Faulty, &rdev->flags)) { 7899 if (raid5_add_disk(mddev, rdev) == 0) { 7900 if (rdev->raid_disk 7901 >= conf->previous_raid_disks) 7902 set_bit(In_sync, &rdev->flags); 7903 else 7904 rdev->recovery_offset = 0; 7905 7906 if (sysfs_link_rdev(mddev, rdev)) 7907 /* Failure here is OK */; 7908 } 7909 } else if (rdev->raid_disk >= conf->previous_raid_disks 7910 && !test_bit(Faulty, &rdev->flags)) { 7911 /* This is a spare that was manually added */ 7912 set_bit(In_sync, &rdev->flags); 7913 } 7914 7915 /* When a reshape changes the number of devices, 7916 * ->degraded is measured against the larger of the 7917 * pre and post number of devices. 7918 */ 7919 spin_lock_irqsave(&conf->device_lock, flags); 7920 mddev->degraded = raid5_calc_degraded(conf); 7921 spin_unlock_irqrestore(&conf->device_lock, flags); 7922 } 7923 mddev->raid_disks = conf->raid_disks; 7924 mddev->reshape_position = conf->reshape_progress; 7925 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7926 7927 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7928 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7929 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7930 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7931 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7932 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7933 "reshape"); 7934 if (!mddev->sync_thread) { 7935 mddev->recovery = 0; 7936 spin_lock_irq(&conf->device_lock); 7937 write_seqcount_begin(&conf->gen_lock); 7938 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 7939 mddev->new_chunk_sectors = 7940 conf->chunk_sectors = conf->prev_chunk_sectors; 7941 mddev->new_layout = conf->algorithm = conf->prev_algo; 7942 rdev_for_each(rdev, mddev) 7943 rdev->new_data_offset = rdev->data_offset; 7944 smp_wmb(); 7945 conf->generation --; 7946 conf->reshape_progress = MaxSector; 7947 mddev->reshape_position = MaxSector; 7948 write_seqcount_end(&conf->gen_lock); 7949 spin_unlock_irq(&conf->device_lock); 7950 return -EAGAIN; 7951 } 7952 conf->reshape_checkpoint = jiffies; 7953 md_wakeup_thread(mddev->sync_thread); 7954 md_new_event(mddev); 7955 return 0; 7956 } 7957 7958 /* This is called from the reshape thread and should make any 7959 * changes needed in 'conf' 7960 */ 7961 static void end_reshape(struct r5conf *conf) 7962 { 7963 7964 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 7965 struct md_rdev *rdev; 7966 7967 spin_lock_irq(&conf->device_lock); 7968 conf->previous_raid_disks = conf->raid_disks; 7969 md_finish_reshape(conf->mddev); 7970 smp_wmb(); 7971 conf->reshape_progress = MaxSector; 7972 conf->mddev->reshape_position = MaxSector; 7973 rdev_for_each(rdev, conf->mddev) 7974 if (rdev->raid_disk >= 0 && 7975 !test_bit(Journal, &rdev->flags) && 7976 !test_bit(In_sync, &rdev->flags)) 7977 rdev->recovery_offset = MaxSector; 7978 spin_unlock_irq(&conf->device_lock); 7979 wake_up(&conf->wait_for_overlap); 7980 7981 /* read-ahead size must cover two whole stripes, which is 7982 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 7983 */ 7984 if (conf->mddev->queue) { 7985 int data_disks = conf->raid_disks - conf->max_degraded; 7986 int stripe = data_disks * ((conf->chunk_sectors << 9) 7987 / PAGE_SIZE); 7988 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7989 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7990 } 7991 } 7992 } 7993 7994 /* This is called from the raid5d thread with 
mddev_lock held. 7995 * It makes config changes to the device. 7996 */ 7997 static void raid5_finish_reshape(struct mddev *mddev) 7998 { 7999 struct r5conf *conf = mddev->private; 8000 8001 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8002 8003 if (mddev->delta_disks <= 0) { 8004 int d; 8005 spin_lock_irq(&conf->device_lock); 8006 mddev->degraded = raid5_calc_degraded(conf); 8007 spin_unlock_irq(&conf->device_lock); 8008 for (d = conf->raid_disks ; 8009 d < conf->raid_disks - mddev->delta_disks; 8010 d++) { 8011 struct md_rdev *rdev = conf->disks[d].rdev; 8012 if (rdev) 8013 clear_bit(In_sync, &rdev->flags); 8014 rdev = conf->disks[d].replacement; 8015 if (rdev) 8016 clear_bit(In_sync, &rdev->flags); 8017 } 8018 } 8019 mddev->layout = conf->algorithm; 8020 mddev->chunk_sectors = conf->chunk_sectors; 8021 mddev->reshape_position = MaxSector; 8022 mddev->delta_disks = 0; 8023 mddev->reshape_backwards = 0; 8024 } 8025 } 8026 8027 static void raid5_quiesce(struct mddev *mddev, int quiesce) 8028 { 8029 struct r5conf *conf = mddev->private; 8030 8031 if (quiesce) { 8032 /* stop all writes */ 8033 lock_all_device_hash_locks_irq(conf); 8034 /* '2' tells resync/reshape to pause so that all 8035 * active stripes can drain 8036 */ 8037 r5c_flush_cache(conf, INT_MAX); 8038 conf->quiesce = 2; 8039 wait_event_cmd(conf->wait_for_quiescent, 8040 atomic_read(&conf->active_stripes) == 0 && 8041 atomic_read(&conf->active_aligned_reads) == 0, 8042 unlock_all_device_hash_locks_irq(conf), 8043 lock_all_device_hash_locks_irq(conf)); 8044 conf->quiesce = 1; 8045 unlock_all_device_hash_locks_irq(conf); 8046 /* allow reshape to continue */ 8047 wake_up(&conf->wait_for_overlap); 8048 } else { 8049 /* re-enable writes */ 8050 lock_all_device_hash_locks_irq(conf); 8051 conf->quiesce = 0; 8052 wake_up(&conf->wait_for_quiescent); 8053 wake_up(&conf->wait_for_overlap); 8054 unlock_all_device_hash_locks_irq(conf); 8055 } 8056 log_quiesce(conf, quiesce); 8057 } 8058 8059 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8060 { 8061 struct r0conf *raid0_conf = mddev->private; 8062 sector_t sectors; 8063 8064 /* for raid0 takeover only one zone is supported */ 8065 if (raid0_conf->nr_strip_zones > 1) { 8066 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8067 mdname(mddev)); 8068 return ERR_PTR(-EINVAL); 8069 } 8070 8071 sectors = raid0_conf->strip_zone[0].zone_end; 8072 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8073 mddev->dev_sectors = sectors; 8074 mddev->new_level = level; 8075 mddev->new_layout = ALGORITHM_PARITY_N; 8076 mddev->new_chunk_sectors = mddev->chunk_sectors; 8077 mddev->raid_disks += 1; 8078 mddev->delta_disks = 1; 8079 /* make sure it will be not marked as dirty */ 8080 mddev->recovery_cp = MaxSector; 8081 8082 return setup_conf(mddev); 8083 } 8084 8085 static void *raid5_takeover_raid1(struct mddev *mddev) 8086 { 8087 int chunksect; 8088 void *ret; 8089 8090 if (mddev->raid_disks != 2 || 8091 mddev->degraded > 1) 8092 return ERR_PTR(-EINVAL); 8093 8094 /* Should check if there are write-behind devices? 
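 * To illustrate the chunk selection below with a made-up size: the
 * default 128-sector (64K) chunk is halved until it divides
 * mddev->array_sectors, so an array of 600 sectors would end up with an
 * 8-sector (4K) chunk.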
*/ 8095 8096 chunksect = 64*2; /* 64K by default */ 8097 8098 /* The array must be an exact multiple of chunksize */ 8099 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8100 chunksect >>= 1; 8101 8102 if ((chunksect<<9) < STRIPE_SIZE) 8103 /* array size does not allow a suitable chunk size */ 8104 return ERR_PTR(-EINVAL); 8105 8106 mddev->new_level = 5; 8107 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8108 mddev->new_chunk_sectors = chunksect; 8109 8110 ret = setup_conf(mddev); 8111 if (!IS_ERR(ret)) 8112 mddev_clear_unsupported_flags(mddev, 8113 UNSUPPORTED_MDDEV_FLAGS); 8114 return ret; 8115 } 8116 8117 static void *raid5_takeover_raid6(struct mddev *mddev) 8118 { 8119 int new_layout; 8120 8121 switch (mddev->layout) { 8122 case ALGORITHM_LEFT_ASYMMETRIC_6: 8123 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8124 break; 8125 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8126 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8127 break; 8128 case ALGORITHM_LEFT_SYMMETRIC_6: 8129 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8130 break; 8131 case ALGORITHM_RIGHT_SYMMETRIC_6: 8132 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8133 break; 8134 case ALGORITHM_PARITY_0_6: 8135 new_layout = ALGORITHM_PARITY_0; 8136 break; 8137 case ALGORITHM_PARITY_N: 8138 new_layout = ALGORITHM_PARITY_N; 8139 break; 8140 default: 8141 return ERR_PTR(-EINVAL); 8142 } 8143 mddev->new_level = 5; 8144 mddev->new_layout = new_layout; 8145 mddev->delta_disks = -1; 8146 mddev->raid_disks -= 1; 8147 return setup_conf(mddev); 8148 } 8149 8150 static int raid5_check_reshape(struct mddev *mddev) 8151 { 8152 /* For a 2-drive array, the layout and chunk size can be changed 8153 * immediately as not restriping is needed. 8154 * For larger arrays we record the new value - after validation 8155 * to be used by a reshape pass. 8156 */ 8157 struct r5conf *conf = mddev->private; 8158 int new_chunk = mddev->new_chunk_sectors; 8159 8160 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8161 return -EINVAL; 8162 if (new_chunk > 0) { 8163 if (!is_power_of_2(new_chunk)) 8164 return -EINVAL; 8165 if (new_chunk < (PAGE_SIZE>>9)) 8166 return -EINVAL; 8167 if (mddev->array_sectors & (new_chunk-1)) 8168 /* not factor of array size */ 8169 return -EINVAL; 8170 } 8171 8172 /* They look valid */ 8173 8174 if (mddev->raid_disks == 2) { 8175 /* can make the change immediately */ 8176 if (mddev->new_layout >= 0) { 8177 conf->algorithm = mddev->new_layout; 8178 mddev->layout = mddev->new_layout; 8179 } 8180 if (new_chunk > 0) { 8181 conf->chunk_sectors = new_chunk ; 8182 mddev->chunk_sectors = new_chunk; 8183 } 8184 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8185 md_wakeup_thread(mddev->thread); 8186 } 8187 return check_reshape(mddev); 8188 } 8189 8190 static int raid6_check_reshape(struct mddev *mddev) 8191 { 8192 int new_chunk = mddev->new_chunk_sectors; 8193 8194 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8195 return -EINVAL; 8196 if (new_chunk > 0) { 8197 if (!is_power_of_2(new_chunk)) 8198 return -EINVAL; 8199 if (new_chunk < (PAGE_SIZE >> 9)) 8200 return -EINVAL; 8201 if (mddev->array_sectors & (new_chunk-1)) 8202 /* not factor of array size */ 8203 return -EINVAL; 8204 } 8205 8206 /* They look valid */ 8207 return check_reshape(mddev); 8208 } 8209 8210 static void *raid5_takeover(struct mddev *mddev) 8211 { 8212 /* raid5 can take over: 8213 * raid0 - if there is only one strip zone - make it a raid4 layout 8214 * raid1 - if there are two drives. 
We need to know the chunk size 8215 * raid4 - trivial - just use a raid4 layout. 8216 * raid6 - Providing it is a *_6 layout 8217 */ 8218 if (mddev->level == 0) 8219 return raid45_takeover_raid0(mddev, 5); 8220 if (mddev->level == 1) 8221 return raid5_takeover_raid1(mddev); 8222 if (mddev->level == 4) { 8223 mddev->new_layout = ALGORITHM_PARITY_N; 8224 mddev->new_level = 5; 8225 return setup_conf(mddev); 8226 } 8227 if (mddev->level == 6) 8228 return raid5_takeover_raid6(mddev); 8229 8230 return ERR_PTR(-EINVAL); 8231 } 8232 8233 static void *raid4_takeover(struct mddev *mddev) 8234 { 8235 /* raid4 can take over: 8236 * raid0 - if there is only one strip zone 8237 * raid5 - if layout is right 8238 */ 8239 if (mddev->level == 0) 8240 return raid45_takeover_raid0(mddev, 4); 8241 if (mddev->level == 5 && 8242 mddev->layout == ALGORITHM_PARITY_N) { 8243 mddev->new_layout = 0; 8244 mddev->new_level = 4; 8245 return setup_conf(mddev); 8246 } 8247 return ERR_PTR(-EINVAL); 8248 } 8249 8250 static struct md_personality raid5_personality; 8251 8252 static void *raid6_takeover(struct mddev *mddev) 8253 { 8254 /* Currently can only take over a raid5. We map the 8255 * personality to an equivalent raid6 personality 8256 * with the Q block at the end. 8257 */ 8258 int new_layout; 8259 8260 if (mddev->pers != &raid5_personality) 8261 return ERR_PTR(-EINVAL); 8262 if (mddev->degraded > 1) 8263 return ERR_PTR(-EINVAL); 8264 if (mddev->raid_disks > 253) 8265 return ERR_PTR(-EINVAL); 8266 if (mddev->raid_disks < 3) 8267 return ERR_PTR(-EINVAL); 8268 8269 switch (mddev->layout) { 8270 case ALGORITHM_LEFT_ASYMMETRIC: 8271 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8272 break; 8273 case ALGORITHM_RIGHT_ASYMMETRIC: 8274 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8275 break; 8276 case ALGORITHM_LEFT_SYMMETRIC: 8277 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8278 break; 8279 case ALGORITHM_RIGHT_SYMMETRIC: 8280 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8281 break; 8282 case ALGORITHM_PARITY_0: 8283 new_layout = ALGORITHM_PARITY_0_6; 8284 break; 8285 case ALGORITHM_PARITY_N: 8286 new_layout = ALGORITHM_PARITY_N; 8287 break; 8288 default: 8289 return ERR_PTR(-EINVAL); 8290 } 8291 mddev->new_level = 6; 8292 mddev->new_layout = new_layout; 8293 mddev->delta_disks = 1; 8294 mddev->raid_disks += 1; 8295 return setup_conf(mddev); 8296 } 8297 8298 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8299 { 8300 struct r5conf *conf; 8301 int err; 8302 8303 err = mddev_lock(mddev); 8304 if (err) 8305 return err; 8306 conf = mddev->private; 8307 if (!conf) { 8308 mddev_unlock(mddev); 8309 return -ENODEV; 8310 } 8311 8312 if (strncmp(buf, "ppl", 3) == 0) { 8313 /* ppl only works with RAID 5 */ 8314 if (!raid5_has_ppl(conf) && conf->level == 5) { 8315 err = log_init(conf, NULL, true); 8316 if (!err) { 8317 err = resize_stripes(conf, conf->pool_size); 8318 if (err) 8319 log_exit(conf); 8320 } 8321 } else 8322 err = -EINVAL; 8323 } else if (strncmp(buf, "resync", 6) == 0) { 8324 if (raid5_has_ppl(conf)) { 8325 mddev_suspend(mddev); 8326 log_exit(conf); 8327 mddev_resume(mddev); 8328 err = resize_stripes(conf, conf->pool_size); 8329 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 8330 r5l_log_disk_error(conf)) { 8331 bool journal_dev_exists = false; 8332 struct md_rdev *rdev; 8333 8334 rdev_for_each(rdev, mddev) 8335 if (test_bit(Journal, &rdev->flags)) { 8336 journal_dev_exists = true; 8337 break; 8338 } 8339 8340 if (!journal_dev_exists) { 8341 mddev_suspend(mddev); 8342 
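/* The journal flag is set but the log device has failed and no journal
 * member remains, so it is safe to drop the flag; the array is suspended
 * around the change so in-flight requests never observe a half-updated
 * policy.
 */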
clear_bit(MD_HAS_JOURNAL, &mddev->flags); 8343 mddev_resume(mddev); 8344 } else /* need remove journal device first */ 8345 err = -EBUSY; 8346 } else 8347 err = -EINVAL; 8348 } else { 8349 err = -EINVAL; 8350 } 8351 8352 if (!err) 8353 md_update_sb(mddev, 1); 8354 8355 mddev_unlock(mddev); 8356 8357 return err; 8358 } 8359 8360 static int raid5_start(struct mddev *mddev) 8361 { 8362 struct r5conf *conf = mddev->private; 8363 8364 return r5l_start(conf->log); 8365 } 8366 8367 static struct md_personality raid6_personality = 8368 { 8369 .name = "raid6", 8370 .level = 6, 8371 .owner = THIS_MODULE, 8372 .make_request = raid5_make_request, 8373 .run = raid5_run, 8374 .start = raid5_start, 8375 .free = raid5_free, 8376 .status = raid5_status, 8377 .error_handler = raid5_error, 8378 .hot_add_disk = raid5_add_disk, 8379 .hot_remove_disk= raid5_remove_disk, 8380 .spare_active = raid5_spare_active, 8381 .sync_request = raid5_sync_request, 8382 .resize = raid5_resize, 8383 .size = raid5_size, 8384 .check_reshape = raid6_check_reshape, 8385 .start_reshape = raid5_start_reshape, 8386 .finish_reshape = raid5_finish_reshape, 8387 .quiesce = raid5_quiesce, 8388 .takeover = raid6_takeover, 8389 .congested = raid5_congested, 8390 .change_consistency_policy = raid5_change_consistency_policy, 8391 }; 8392 static struct md_personality raid5_personality = 8393 { 8394 .name = "raid5", 8395 .level = 5, 8396 .owner = THIS_MODULE, 8397 .make_request = raid5_make_request, 8398 .run = raid5_run, 8399 .start = raid5_start, 8400 .free = raid5_free, 8401 .status = raid5_status, 8402 .error_handler = raid5_error, 8403 .hot_add_disk = raid5_add_disk, 8404 .hot_remove_disk= raid5_remove_disk, 8405 .spare_active = raid5_spare_active, 8406 .sync_request = raid5_sync_request, 8407 .resize = raid5_resize, 8408 .size = raid5_size, 8409 .check_reshape = raid5_check_reshape, 8410 .start_reshape = raid5_start_reshape, 8411 .finish_reshape = raid5_finish_reshape, 8412 .quiesce = raid5_quiesce, 8413 .takeover = raid5_takeover, 8414 .congested = raid5_congested, 8415 .change_consistency_policy = raid5_change_consistency_policy, 8416 }; 8417 8418 static struct md_personality raid4_personality = 8419 { 8420 .name = "raid4", 8421 .level = 4, 8422 .owner = THIS_MODULE, 8423 .make_request = raid5_make_request, 8424 .run = raid5_run, 8425 .start = raid5_start, 8426 .free = raid5_free, 8427 .status = raid5_status, 8428 .error_handler = raid5_error, 8429 .hot_add_disk = raid5_add_disk, 8430 .hot_remove_disk= raid5_remove_disk, 8431 .spare_active = raid5_spare_active, 8432 .sync_request = raid5_sync_request, 8433 .resize = raid5_resize, 8434 .size = raid5_size, 8435 .check_reshape = raid5_check_reshape, 8436 .start_reshape = raid5_start_reshape, 8437 .finish_reshape = raid5_finish_reshape, 8438 .quiesce = raid5_quiesce, 8439 .takeover = raid4_takeover, 8440 .congested = raid5_congested, 8441 .change_consistency_policy = raid5_change_consistency_policy, 8442 }; 8443 8444 static int __init raid5_init(void) 8445 { 8446 int ret; 8447 8448 raid5_wq = alloc_workqueue("raid5wq", 8449 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 8450 if (!raid5_wq) 8451 return -ENOMEM; 8452 8453 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 8454 "md/raid5:prepare", 8455 raid456_cpu_up_prepare, 8456 raid456_cpu_dead); 8457 if (ret) { 8458 destroy_workqueue(raid5_wq); 8459 return ret; 8460 } 8461 register_md_personality(&raid6_personality); 8462 register_md_personality(&raid5_personality); 8463 register_md_personality(&raid4_personality); 8464 
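/* raid4, raid5 and raid6 all share this implementation; the three
 * personalities registered above differ mainly in their takeover and
 * check_reshape hooks.
 */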
return 0; 8465 } 8466 8467 static void raid5_exit(void) 8468 { 8469 unregister_md_personality(&raid6_personality); 8470 unregister_md_personality(&raid5_personality); 8471 unregister_md_personality(&raid4_personality); 8472 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 8473 destroy_workqueue(raid5_wq); 8474 } 8475 8476 module_init(raid5_init); 8477 module_exit(raid5_exit); 8478 MODULE_LICENSE("GPL"); 8479 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 8480 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 8481 MODULE_ALIAS("md-raid5"); 8482 MODULE_ALIAS("md-raid4"); 8483 MODULE_ALIAS("md-level-5"); 8484 MODULE_ALIAS("md-level-4"); 8485 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 8486 MODULE_ALIAS("md-raid6"); 8487 MODULE_ALIAS("md-level-6"); 8488 8489 /* This used to be two separate modules, they were: */ 8490 MODULE_ALIAS("raid5"); 8491 MODULE_ALIAS("raid6"); 8492