1 /* 2 * raid5.c : Multiple Devices driver for Linux 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 4 * Copyright (C) 1999, 2000 Ingo Molnar 5 * Copyright (C) 2002, 2003 H. Peter Anvin 6 * 7 * RAID-4/5/6 management functions. 8 * Thanks to Penguin Computing for making the RAID-6 development possible 9 * by donating a test server! 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of the GNU General Public License as published by 13 * the Free Software Foundation; either version 2, or (at your option) 14 * any later version. 15 * 16 * You should have received a copy of the GNU General Public License 17 * (for example /usr/src/linux/COPYING); if not, write to the Free 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 */ 20 21 /* 22 * BITMAP UNPLUGGING: 23 * 24 * The sequencing for updating the bitmap reliably is a little 25 * subtle (and I got it wrong the first time) so it deserves some 26 * explanation. 27 * 28 * We group bitmap updates into batches. Each batch has a number. 29 * We may write out several batches at once, but that isn't very important. 30 * conf->seq_write is the number of the last batch successfully written. 31 * conf->seq_flush is the number of the last batch that was closed to 32 * new additions. 33 * When we discover that we will need to write to any block in a stripe 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 35 * the number of the batch it will be in. This is seq_flush+1. 36 * When we are ready to do a write, if that batch hasn't been written yet, 37 * we plug the array and queue the stripe for later. 38 * When an unplug happens, we increment bm_flush, thus closing the current 39 * batch. 40 * When we notice that bm_flush > bm_write, we write out all pending updates 41 * to the bitmap, and advance bm_write to where bm_flush was. 42 * This may occasionally write a bit out twice, but is sure never to 43 * miss any bits. 
44 */ 45 46 #include <linux/blkdev.h> 47 #include <linux/kthread.h> 48 #include <linux/raid/pq.h> 49 #include <linux/async_tx.h> 50 #include <linux/module.h> 51 #include <linux/async.h> 52 #include <linux/seq_file.h> 53 #include <linux/cpu.h> 54 #include <linux/slab.h> 55 #include <linux/ratelimit.h> 56 #include <linux/nodemask.h> 57 58 #include <trace/events/block.h> 59 #include <linux/list_sort.h> 60 61 #include "md.h" 62 #include "raid5.h" 63 #include "raid0.h" 64 #include "md-bitmap.h" 65 #include "raid5-log.h" 66 67 #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) 68 69 #define cpu_to_group(cpu) cpu_to_node(cpu) 70 #define ANY_GROUP NUMA_NO_NODE 71 72 static bool devices_handle_discard_safely = false; 73 module_param(devices_handle_discard_safely, bool, 0644); 74 MODULE_PARM_DESC(devices_handle_discard_safely, 75 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); 76 static struct workqueue_struct *raid5_wq; 77 78 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) 79 { 80 int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; 81 return &conf->stripe_hashtbl[hash]; 82 } 83 84 static inline int stripe_hash_locks_hash(sector_t sect) 85 { 86 return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; 87 } 88 89 static inline void lock_device_hash_lock(struct r5conf *conf, int hash) 90 { 91 spin_lock_irq(conf->hash_locks + hash); 92 spin_lock(&conf->device_lock); 93 } 94 95 static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) 96 { 97 spin_unlock(&conf->device_lock); 98 spin_unlock_irq(conf->hash_locks + hash); 99 } 100 101 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) 102 { 103 int i; 104 spin_lock_irq(conf->hash_locks); 105 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 106 spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); 107 spin_lock(&conf->device_lock); 108 } 109 110 static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) 111 { 112 int i; 113 spin_unlock(&conf->device_lock); 114 for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) 115 spin_unlock(conf->hash_locks + i); 116 spin_unlock_irq(conf->hash_locks); 117 } 118 119 /* Find first data disk in a raid6 stripe */ 120 static inline int raid6_d0(struct stripe_head *sh) 121 { 122 if (sh->ddf_layout) 123 /* ddf always start from first device */ 124 return 0; 125 /* md starts just after Q block */ 126 if (sh->qd_idx == sh->disks - 1) 127 return 0; 128 else 129 return sh->qd_idx + 1; 130 } 131 static inline int raid6_next_disk(int disk, int raid_disks) 132 { 133 disk++; 134 return (disk < raid_disks) ? disk : 0; 135 } 136 137 /* When walking through the disks in a raid5, starting at raid6_d0, 138 * We need to map each disk to a 'slot', where the data disks are slot 139 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 140 * is raid_disks-1. This help does that mapping. 
141 */ 142 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 143 int *count, int syndrome_disks) 144 { 145 int slot = *count; 146 147 if (sh->ddf_layout) 148 (*count)++; 149 if (idx == sh->pd_idx) 150 return syndrome_disks; 151 if (idx == sh->qd_idx) 152 return syndrome_disks + 1; 153 if (!sh->ddf_layout) 154 (*count)++; 155 return slot; 156 } 157 158 static void print_raid5_conf (struct r5conf *conf); 159 160 static int stripe_operations_active(struct stripe_head *sh) 161 { 162 return sh->check_state || sh->reconstruct_state || 163 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 164 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 165 } 166 167 static bool stripe_is_lowprio(struct stripe_head *sh) 168 { 169 return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || 170 test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && 171 !test_bit(STRIPE_R5C_CACHING, &sh->state); 172 } 173 174 static void raid5_wakeup_stripe_thread(struct stripe_head *sh) 175 { 176 struct r5conf *conf = sh->raid_conf; 177 struct r5worker_group *group; 178 int thread_cnt; 179 int i, cpu = sh->cpu; 180 181 if (!cpu_online(cpu)) { 182 cpu = cpumask_any(cpu_online_mask); 183 sh->cpu = cpu; 184 } 185 186 if (list_empty(&sh->lru)) { 187 struct r5worker_group *group; 188 group = conf->worker_groups + cpu_to_group(cpu); 189 if (stripe_is_lowprio(sh)) 190 list_add_tail(&sh->lru, &group->loprio_list); 191 else 192 list_add_tail(&sh->lru, &group->handle_list); 193 group->stripes_cnt++; 194 sh->group = group; 195 } 196 197 if (conf->worker_cnt_per_group == 0) { 198 md_wakeup_thread(conf->mddev->thread); 199 return; 200 } 201 202 group = conf->worker_groups + cpu_to_group(sh->cpu); 203 204 group->workers[0].working = true; 205 /* at least one worker should run to avoid race */ 206 queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); 207 208 thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; 209 /* wakeup more workers */ 210 for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { 211 if (group->workers[i].working == false) { 212 group->workers[i].working = true; 213 queue_work_on(sh->cpu, raid5_wq, 214 &group->workers[i].work); 215 thread_cnt--; 216 } 217 } 218 } 219 220 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, 221 struct list_head *temp_inactive_list) 222 { 223 int i; 224 int injournal = 0; /* number of date pages with R5_InJournal */ 225 226 BUG_ON(!list_empty(&sh->lru)); 227 BUG_ON(atomic_read(&conf->active_stripes)==0); 228 229 if (r5c_is_writeback(conf->log)) 230 for (i = sh->disks; i--; ) 231 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 232 injournal++; 233 /* 234 * In the following cases, the stripe cannot be released to cached 235 * lists. Therefore, we make the stripe write out and set 236 * STRIPE_HANDLE: 237 * 1. when quiesce in r5c write back; 238 * 2. when resync is requested fot the stripe. 
239 */ 240 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) || 241 (conf->quiesce && r5c_is_writeback(conf->log) && 242 !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) { 243 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 244 r5c_make_stripe_write_out(sh); 245 set_bit(STRIPE_HANDLE, &sh->state); 246 } 247 248 if (test_bit(STRIPE_HANDLE, &sh->state)) { 249 if (test_bit(STRIPE_DELAYED, &sh->state) && 250 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 251 list_add_tail(&sh->lru, &conf->delayed_list); 252 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 253 sh->bm_seq - conf->seq_write > 0) 254 list_add_tail(&sh->lru, &conf->bitmap_list); 255 else { 256 clear_bit(STRIPE_DELAYED, &sh->state); 257 clear_bit(STRIPE_BIT_DELAY, &sh->state); 258 if (conf->worker_cnt_per_group == 0) { 259 if (stripe_is_lowprio(sh)) 260 list_add_tail(&sh->lru, 261 &conf->loprio_list); 262 else 263 list_add_tail(&sh->lru, 264 &conf->handle_list); 265 } else { 266 raid5_wakeup_stripe_thread(sh); 267 return; 268 } 269 } 270 md_wakeup_thread(conf->mddev->thread); 271 } else { 272 BUG_ON(stripe_operations_active(sh)); 273 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 274 if (atomic_dec_return(&conf->preread_active_stripes) 275 < IO_THRESHOLD) 276 md_wakeup_thread(conf->mddev->thread); 277 atomic_dec(&conf->active_stripes); 278 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 279 if (!r5c_is_writeback(conf->log)) 280 list_add_tail(&sh->lru, temp_inactive_list); 281 else { 282 WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); 283 if (injournal == 0) 284 list_add_tail(&sh->lru, temp_inactive_list); 285 else if (injournal == conf->raid_disks - conf->max_degraded) { 286 /* full stripe */ 287 if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) 288 atomic_inc(&conf->r5c_cached_full_stripes); 289 if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 290 atomic_dec(&conf->r5c_cached_partial_stripes); 291 list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); 292 r5c_check_cached_full_stripe(conf); 293 } else 294 /* 295 * STRIPE_R5C_PARTIAL_STRIPE is set in 296 * r5c_try_caching_write(). No need to 297 * set it again. 298 */ 299 list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); 300 } 301 } 302 } 303 } 304 305 static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, 306 struct list_head *temp_inactive_list) 307 { 308 if (atomic_dec_and_test(&sh->count)) 309 do_release_stripe(conf, sh, temp_inactive_list); 310 } 311 312 /* 313 * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list 314 * 315 * Be careful: Only one task can add/delete stripes from temp_inactive_list at 316 * given time. Adding stripes only takes device lock, while deleting stripes 317 * only takes hash lock. 
318 */ 319 static void release_inactive_stripe_list(struct r5conf *conf, 320 struct list_head *temp_inactive_list, 321 int hash) 322 { 323 int size; 324 bool do_wakeup = false; 325 unsigned long flags; 326 327 if (hash == NR_STRIPE_HASH_LOCKS) { 328 size = NR_STRIPE_HASH_LOCKS; 329 hash = NR_STRIPE_HASH_LOCKS - 1; 330 } else 331 size = 1; 332 while (size) { 333 struct list_head *list = &temp_inactive_list[size - 1]; 334 335 /* 336 * We don't hold any lock here yet, raid5_get_active_stripe() might 337 * remove stripes from the list 338 */ 339 if (!list_empty_careful(list)) { 340 spin_lock_irqsave(conf->hash_locks + hash, flags); 341 if (list_empty(conf->inactive_list + hash) && 342 !list_empty(list)) 343 atomic_dec(&conf->empty_inactive_list_nr); 344 list_splice_tail_init(list, conf->inactive_list + hash); 345 do_wakeup = true; 346 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 347 } 348 size--; 349 hash--; 350 } 351 352 if (do_wakeup) { 353 wake_up(&conf->wait_for_stripe); 354 if (atomic_read(&conf->active_stripes) == 0) 355 wake_up(&conf->wait_for_quiescent); 356 if (conf->retry_read_aligned) 357 md_wakeup_thread(conf->mddev->thread); 358 } 359 } 360 361 /* should hold conf->device_lock already */ 362 static int release_stripe_list(struct r5conf *conf, 363 struct list_head *temp_inactive_list) 364 { 365 struct stripe_head *sh, *t; 366 int count = 0; 367 struct llist_node *head; 368 369 head = llist_del_all(&conf->released_stripes); 370 head = llist_reverse_order(head); 371 llist_for_each_entry_safe(sh, t, head, release_list) { 372 int hash; 373 374 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 375 smp_mb(); 376 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 377 /* 378 * Don't worry the bit is set here, because if the bit is set 379 * again, the count is always > 1. This is true for 380 * STRIPE_ON_UNPLUG_LIST bit too. 381 */ 382 hash = sh->hash_lock_index; 383 __release_stripe(conf, sh, &temp_inactive_list[hash]); 384 count++; 385 } 386 387 return count; 388 } 389 390 void raid5_release_stripe(struct stripe_head *sh) 391 { 392 struct r5conf *conf = sh->raid_conf; 393 unsigned long flags; 394 struct list_head list; 395 int hash; 396 bool wakeup; 397 398 /* Avoid release_list until the last reference. 399 */ 400 if (atomic_add_unless(&sh->count, -1, 1)) 401 return; 402 403 if (unlikely(!conf->mddev->thread) || 404 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 405 goto slow_path; 406 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 407 if (wakeup) 408 md_wakeup_thread(conf->mddev->thread); 409 return; 410 slow_path: 411 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 412 if (atomic_dec_and_lock_irqsave(&sh->count, &conf->device_lock, flags)) { 413 INIT_LIST_HEAD(&list); 414 hash = sh->hash_lock_index; 415 do_release_stripe(conf, sh, &list); 416 spin_unlock_irqrestore(&conf->device_lock, flags); 417 release_inactive_stripe_list(conf, &list, hash); 418 } 419 } 420 421 static inline void remove_hash(struct stripe_head *sh) 422 { 423 pr_debug("remove_hash(), stripe %llu\n", 424 (unsigned long long)sh->sector); 425 426 hlist_del_init(&sh->hash); 427 } 428 429 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 430 { 431 struct hlist_head *hp = stripe_hash(conf, sh->sector); 432 433 pr_debug("insert_hash(), stripe %llu\n", 434 (unsigned long long)sh->sector); 435 436 hlist_add_head(&sh->hash, hp); 437 } 438 439 /* find an idle stripe, make sure it is unhashed, and return it. 
*/ 440 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 441 { 442 struct stripe_head *sh = NULL; 443 struct list_head *first; 444 445 if (list_empty(conf->inactive_list + hash)) 446 goto out; 447 first = (conf->inactive_list + hash)->next; 448 sh = list_entry(first, struct stripe_head, lru); 449 list_del_init(first); 450 remove_hash(sh); 451 atomic_inc(&conf->active_stripes); 452 BUG_ON(hash != sh->hash_lock_index); 453 if (list_empty(conf->inactive_list + hash)) 454 atomic_inc(&conf->empty_inactive_list_nr); 455 out: 456 return sh; 457 } 458 459 static void shrink_buffers(struct stripe_head *sh) 460 { 461 struct page *p; 462 int i; 463 int num = sh->raid_conf->pool_size; 464 465 for (i = 0; i < num ; i++) { 466 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 467 p = sh->dev[i].page; 468 if (!p) 469 continue; 470 sh->dev[i].page = NULL; 471 put_page(p); 472 } 473 } 474 475 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 476 { 477 int i; 478 int num = sh->raid_conf->pool_size; 479 480 for (i = 0; i < num; i++) { 481 struct page *page; 482 483 if (!(page = alloc_page(gfp))) { 484 return 1; 485 } 486 sh->dev[i].page = page; 487 sh->dev[i].orig_page = page; 488 } 489 490 return 0; 491 } 492 493 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 494 struct stripe_head *sh); 495 496 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 497 { 498 struct r5conf *conf = sh->raid_conf; 499 int i, seq; 500 501 BUG_ON(atomic_read(&sh->count) != 0); 502 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 503 BUG_ON(stripe_operations_active(sh)); 504 BUG_ON(sh->batch_head); 505 506 pr_debug("init_stripe called, stripe %llu\n", 507 (unsigned long long)sector); 508 retry: 509 seq = read_seqcount_begin(&conf->gen_lock); 510 sh->generation = conf->generation - previous; 511 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 512 sh->sector = sector; 513 stripe_set_idx(sector, conf, previous, sh); 514 sh->state = 0; 515 516 for (i = sh->disks; i--; ) { 517 struct r5dev *dev = &sh->dev[i]; 518 519 if (dev->toread || dev->read || dev->towrite || dev->written || 520 test_bit(R5_LOCKED, &dev->flags)) { 521 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 522 (unsigned long long)sh->sector, i, dev->toread, 523 dev->read, dev->towrite, dev->written, 524 test_bit(R5_LOCKED, &dev->flags)); 525 WARN_ON(1); 526 } 527 dev->flags = 0; 528 dev->sector = raid5_compute_blocknr(sh, i, previous); 529 } 530 if (read_seqcount_retry(&conf->gen_lock, seq)) 531 goto retry; 532 sh->overwrite_disks = 0; 533 insert_hash(conf, sh); 534 sh->cpu = smp_processor_id(); 535 set_bit(STRIPE_BATCH_READY, &sh->state); 536 } 537 538 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 539 short generation) 540 { 541 struct stripe_head *sh; 542 543 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 544 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 545 if (sh->sector == sector && sh->generation == generation) 546 return sh; 547 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 548 return NULL; 549 } 550 551 /* 552 * Need to check if array has failed when deciding whether to: 553 * - start an array 554 * - remove non-faulty devices 555 * - add a spare 556 * - allow a reshape 557 * This determination is simple when no reshape is happening. 558 * However if there is a reshape, we need to carefully check 559 * both the before and after sections. 
560 * This is because some failed devices may only affect one 561 * of the two sections, and some non-in_sync devices may 562 * be insync in the section most affected by failed devices. 563 */ 564 int raid5_calc_degraded(struct r5conf *conf) 565 { 566 int degraded, degraded2; 567 int i; 568 569 rcu_read_lock(); 570 degraded = 0; 571 for (i = 0; i < conf->previous_raid_disks; i++) { 572 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 573 if (rdev && test_bit(Faulty, &rdev->flags)) 574 rdev = rcu_dereference(conf->disks[i].replacement); 575 if (!rdev || test_bit(Faulty, &rdev->flags)) 576 degraded++; 577 else if (test_bit(In_sync, &rdev->flags)) 578 ; 579 else 580 /* not in-sync or faulty. 581 * If the reshape increases the number of devices, 582 * this is being recovered by the reshape, so 583 * this 'previous' section is not in_sync. 584 * If the number of devices is being reduced however, 585 * the device can only be part of the array if 586 * we are reverting a reshape, so this section will 587 * be in-sync. 588 */ 589 if (conf->raid_disks >= conf->previous_raid_disks) 590 degraded++; 591 } 592 rcu_read_unlock(); 593 if (conf->raid_disks == conf->previous_raid_disks) 594 return degraded; 595 rcu_read_lock(); 596 degraded2 = 0; 597 for (i = 0; i < conf->raid_disks; i++) { 598 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 599 if (rdev && test_bit(Faulty, &rdev->flags)) 600 rdev = rcu_dereference(conf->disks[i].replacement); 601 if (!rdev || test_bit(Faulty, &rdev->flags)) 602 degraded2++; 603 else if (test_bit(In_sync, &rdev->flags)) 604 ; 605 else 606 /* not in-sync or faulty. 607 * If reshape increases the number of devices, this 608 * section has already been recovered, else it 609 * almost certainly hasn't. 610 */ 611 if (conf->raid_disks <= conf->previous_raid_disks) 612 degraded2++; 613 } 614 rcu_read_unlock(); 615 if (degraded2 > degraded) 616 return degraded2; 617 return degraded; 618 } 619 620 static int has_failed(struct r5conf *conf) 621 { 622 int degraded; 623 624 if (conf->mddev->reshape_position == MaxSector) 625 return conf->mddev->degraded > conf->max_degraded; 626 627 degraded = raid5_calc_degraded(conf); 628 if (degraded > conf->max_degraded) 629 return 1; 630 return 0; 631 } 632 633 struct stripe_head * 634 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 635 int previous, int noblock, int noquiesce) 636 { 637 struct stripe_head *sh; 638 int hash = stripe_hash_locks_hash(sector); 639 int inc_empty_inactive_list_flag; 640 641 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 642 643 spin_lock_irq(conf->hash_locks + hash); 644 645 do { 646 wait_event_lock_irq(conf->wait_for_quiescent, 647 conf->quiesce == 0 || noquiesce, 648 *(conf->hash_locks + hash)); 649 sh = __find_stripe(conf, sector, conf->generation - previous); 650 if (!sh) { 651 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 652 sh = get_free_stripe(conf, hash); 653 if (!sh && !test_bit(R5_DID_ALLOC, 654 &conf->cache_state)) 655 set_bit(R5_ALLOC_MORE, 656 &conf->cache_state); 657 } 658 if (noblock && sh == NULL) 659 break; 660 661 r5c_check_stripe_cache_usage(conf); 662 if (!sh) { 663 set_bit(R5_INACTIVE_BLOCKED, 664 &conf->cache_state); 665 r5l_wake_reclaim(conf->log, 0); 666 wait_event_lock_irq( 667 conf->wait_for_stripe, 668 !list_empty(conf->inactive_list + hash) && 669 (atomic_read(&conf->active_stripes) 670 < (conf->max_nr_stripes * 3 / 4) 671 || !test_bit(R5_INACTIVE_BLOCKED, 672 &conf->cache_state)), 673 *(conf->hash_locks + hash)); 674 
clear_bit(R5_INACTIVE_BLOCKED, 675 &conf->cache_state); 676 } else { 677 init_stripe(sh, sector, previous); 678 atomic_inc(&sh->count); 679 } 680 } else if (!atomic_inc_not_zero(&sh->count)) { 681 spin_lock(&conf->device_lock); 682 if (!atomic_read(&sh->count)) { 683 if (!test_bit(STRIPE_HANDLE, &sh->state)) 684 atomic_inc(&conf->active_stripes); 685 BUG_ON(list_empty(&sh->lru) && 686 !test_bit(STRIPE_EXPANDING, &sh->state)); 687 inc_empty_inactive_list_flag = 0; 688 if (!list_empty(conf->inactive_list + hash)) 689 inc_empty_inactive_list_flag = 1; 690 list_del_init(&sh->lru); 691 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 692 atomic_inc(&conf->empty_inactive_list_nr); 693 if (sh->group) { 694 sh->group->stripes_cnt--; 695 sh->group = NULL; 696 } 697 } 698 atomic_inc(&sh->count); 699 spin_unlock(&conf->device_lock); 700 } 701 } while (sh == NULL); 702 703 spin_unlock_irq(conf->hash_locks + hash); 704 return sh; 705 } 706 707 static bool is_full_stripe_write(struct stripe_head *sh) 708 { 709 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 710 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 711 } 712 713 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 714 __acquires(&sh1->stripe_lock) 715 __acquires(&sh2->stripe_lock) 716 { 717 if (sh1 > sh2) { 718 spin_lock_irq(&sh2->stripe_lock); 719 spin_lock_nested(&sh1->stripe_lock, 1); 720 } else { 721 spin_lock_irq(&sh1->stripe_lock); 722 spin_lock_nested(&sh2->stripe_lock, 1); 723 } 724 } 725 726 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 727 __releases(&sh1->stripe_lock) 728 __releases(&sh2->stripe_lock) 729 { 730 spin_unlock(&sh1->stripe_lock); 731 spin_unlock_irq(&sh2->stripe_lock); 732 } 733 734 /* Only freshly new full stripe normal write stripe can be added to a batch list */ 735 static bool stripe_can_batch(struct stripe_head *sh) 736 { 737 struct r5conf *conf = sh->raid_conf; 738 739 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 740 return false; 741 return test_bit(STRIPE_BATCH_READY, &sh->state) && 742 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 743 is_full_stripe_write(sh); 744 } 745 746 /* we only do back search */ 747 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 748 { 749 struct stripe_head *head; 750 sector_t head_sector, tmp_sec; 751 int hash; 752 int dd_idx; 753 int inc_empty_inactive_list_flag; 754 755 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 756 tmp_sec = sh->sector; 757 if (!sector_div(tmp_sec, conf->chunk_sectors)) 758 return; 759 head_sector = sh->sector - STRIPE_SECTORS; 760 761 hash = stripe_hash_locks_hash(head_sector); 762 spin_lock_irq(conf->hash_locks + hash); 763 head = __find_stripe(conf, head_sector, conf->generation); 764 if (head && !atomic_inc_not_zero(&head->count)) { 765 spin_lock(&conf->device_lock); 766 if (!atomic_read(&head->count)) { 767 if (!test_bit(STRIPE_HANDLE, &head->state)) 768 atomic_inc(&conf->active_stripes); 769 BUG_ON(list_empty(&head->lru) && 770 !test_bit(STRIPE_EXPANDING, &head->state)); 771 inc_empty_inactive_list_flag = 0; 772 if (!list_empty(conf->inactive_list + hash)) 773 inc_empty_inactive_list_flag = 1; 774 list_del_init(&head->lru); 775 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 776 atomic_inc(&conf->empty_inactive_list_nr); 777 if (head->group) { 778 head->group->stripes_cnt--; 779 head->group = NULL; 780 } 781 } 782 atomic_inc(&head->count); 
783 spin_unlock(&conf->device_lock); 784 } 785 spin_unlock_irq(conf->hash_locks + hash); 786 787 if (!head) 788 return; 789 if (!stripe_can_batch(head)) 790 goto out; 791 792 lock_two_stripes(head, sh); 793 /* clear_batch_ready clear the flag */ 794 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 795 goto unlock_out; 796 797 if (sh->batch_head) 798 goto unlock_out; 799 800 dd_idx = 0; 801 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 802 dd_idx++; 803 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 804 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 805 goto unlock_out; 806 807 if (head->batch_head) { 808 spin_lock(&head->batch_head->batch_lock); 809 /* This batch list is already running */ 810 if (!stripe_can_batch(head)) { 811 spin_unlock(&head->batch_head->batch_lock); 812 goto unlock_out; 813 } 814 /* 815 * We must assign batch_head of this stripe within the 816 * batch_lock, otherwise clear_batch_ready of batch head 817 * stripe could clear BATCH_READY bit of this stripe and 818 * this stripe->batch_head doesn't get assigned, which 819 * could confuse clear_batch_ready for this stripe 820 */ 821 sh->batch_head = head->batch_head; 822 823 /* 824 * at this point, head's BATCH_READY could be cleared, but we 825 * can still add the stripe to batch list 826 */ 827 list_add(&sh->batch_list, &head->batch_list); 828 spin_unlock(&head->batch_head->batch_lock); 829 } else { 830 head->batch_head = head; 831 sh->batch_head = head->batch_head; 832 spin_lock(&head->batch_lock); 833 list_add_tail(&sh->batch_list, &head->batch_list); 834 spin_unlock(&head->batch_lock); 835 } 836 837 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 838 if (atomic_dec_return(&conf->preread_active_stripes) 839 < IO_THRESHOLD) 840 md_wakeup_thread(conf->mddev->thread); 841 842 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 843 int seq = sh->bm_seq; 844 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 845 sh->batch_head->bm_seq > seq) 846 seq = sh->batch_head->bm_seq; 847 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 848 sh->batch_head->bm_seq = seq; 849 } 850 851 atomic_inc(&sh->count); 852 unlock_out: 853 unlock_two_stripes(head, sh); 854 out: 855 raid5_release_stripe(head); 856 } 857 858 /* Determine if 'data_offset' or 'new_data_offset' should be used 859 * in this stripe_head. 860 */ 861 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 862 { 863 sector_t progress = conf->reshape_progress; 864 /* Need a memory barrier to make sure we see the value 865 * of conf->generation, or ->data_offset that was set before 866 * reshape_progress was updated. 867 */ 868 smp_rmb(); 869 if (progress == MaxSector) 870 return 0; 871 if (sh->generation == conf->generation - 1) 872 return 0; 873 /* We are in a reshape, and this is a new-generation stripe, 874 * so use new_data_offset. 
875 */ 876 return 1; 877 } 878 879 static void dispatch_bio_list(struct bio_list *tmp) 880 { 881 struct bio *bio; 882 883 while ((bio = bio_list_pop(tmp))) 884 generic_make_request(bio); 885 } 886 887 static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) 888 { 889 const struct r5pending_data *da = list_entry(a, 890 struct r5pending_data, sibling); 891 const struct r5pending_data *db = list_entry(b, 892 struct r5pending_data, sibling); 893 if (da->sector > db->sector) 894 return 1; 895 if (da->sector < db->sector) 896 return -1; 897 return 0; 898 } 899 900 static void dispatch_defer_bios(struct r5conf *conf, int target, 901 struct bio_list *list) 902 { 903 struct r5pending_data *data; 904 struct list_head *first, *next = NULL; 905 int cnt = 0; 906 907 if (conf->pending_data_cnt == 0) 908 return; 909 910 list_sort(NULL, &conf->pending_list, cmp_stripe); 911 912 first = conf->pending_list.next; 913 914 /* temporarily move the head */ 915 if (conf->next_pending_data) 916 list_move_tail(&conf->pending_list, 917 &conf->next_pending_data->sibling); 918 919 while (!list_empty(&conf->pending_list)) { 920 data = list_first_entry(&conf->pending_list, 921 struct r5pending_data, sibling); 922 if (&data->sibling == first) 923 first = data->sibling.next; 924 next = data->sibling.next; 925 926 bio_list_merge(list, &data->bios); 927 list_move(&data->sibling, &conf->free_list); 928 cnt++; 929 if (cnt >= target) 930 break; 931 } 932 conf->pending_data_cnt -= cnt; 933 BUG_ON(conf->pending_data_cnt < 0 || cnt < target); 934 935 if (next != &conf->pending_list) 936 conf->next_pending_data = list_entry(next, 937 struct r5pending_data, sibling); 938 else 939 conf->next_pending_data = NULL; 940 /* list isn't empty */ 941 if (first != &conf->pending_list) 942 list_move_tail(&conf->pending_list, first); 943 } 944 945 static void flush_deferred_bios(struct r5conf *conf) 946 { 947 struct bio_list tmp = BIO_EMPTY_LIST; 948 949 if (conf->pending_data_cnt == 0) 950 return; 951 952 spin_lock(&conf->pending_bios_lock); 953 dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); 954 BUG_ON(conf->pending_data_cnt != 0); 955 spin_unlock(&conf->pending_bios_lock); 956 957 dispatch_bio_list(&tmp); 958 } 959 960 static void defer_issue_bios(struct r5conf *conf, sector_t sector, 961 struct bio_list *bios) 962 { 963 struct bio_list tmp = BIO_EMPTY_LIST; 964 struct r5pending_data *ent; 965 966 spin_lock(&conf->pending_bios_lock); 967 ent = list_first_entry(&conf->free_list, struct r5pending_data, 968 sibling); 969 list_move_tail(&ent->sibling, &conf->pending_list); 970 ent->sector = sector; 971 bio_list_init(&ent->bios); 972 bio_list_merge(&ent->bios, bios); 973 conf->pending_data_cnt++; 974 if (conf->pending_data_cnt >= PENDING_IO_MAX) 975 dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); 976 977 spin_unlock(&conf->pending_bios_lock); 978 979 dispatch_bio_list(&tmp); 980 } 981 982 static void 983 raid5_end_read_request(struct bio *bi); 984 static void 985 raid5_end_write_request(struct bio *bi); 986 987 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 988 { 989 struct r5conf *conf = sh->raid_conf; 990 int i, disks = sh->disks; 991 struct stripe_head *head_sh = sh; 992 struct bio_list pending_bios = BIO_EMPTY_LIST; 993 bool should_defer; 994 995 might_sleep(); 996 997 if (log_stripe(sh, s) == 0) 998 return; 999 1000 should_defer = conf->batch_bio_dispatch && conf->group_cnt; 1001 1002 for (i = disks; i--; ) { 1003 int op, op_flags = 0; 1004 int replace_only = 0; 1005 struct bio 
*bi, *rbi; 1006 struct md_rdev *rdev, *rrdev = NULL; 1007 1008 sh = head_sh; 1009 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 1010 op = REQ_OP_WRITE; 1011 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 1012 op_flags = REQ_FUA; 1013 if (test_bit(R5_Discard, &sh->dev[i].flags)) 1014 op = REQ_OP_DISCARD; 1015 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 1016 op = REQ_OP_READ; 1017 else if (test_and_clear_bit(R5_WantReplace, 1018 &sh->dev[i].flags)) { 1019 op = REQ_OP_WRITE; 1020 replace_only = 1; 1021 } else 1022 continue; 1023 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 1024 op_flags |= REQ_SYNC; 1025 1026 again: 1027 bi = &sh->dev[i].req; 1028 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 1029 1030 rcu_read_lock(); 1031 rrdev = rcu_dereference(conf->disks[i].replacement); 1032 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 1033 rdev = rcu_dereference(conf->disks[i].rdev); 1034 if (!rdev) { 1035 rdev = rrdev; 1036 rrdev = NULL; 1037 } 1038 if (op_is_write(op)) { 1039 if (replace_only) 1040 rdev = NULL; 1041 if (rdev == rrdev) 1042 /* We raced and saw duplicates */ 1043 rrdev = NULL; 1044 } else { 1045 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 1046 rdev = rrdev; 1047 rrdev = NULL; 1048 } 1049 1050 if (rdev && test_bit(Faulty, &rdev->flags)) 1051 rdev = NULL; 1052 if (rdev) 1053 atomic_inc(&rdev->nr_pending); 1054 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1055 rrdev = NULL; 1056 if (rrdev) 1057 atomic_inc(&rrdev->nr_pending); 1058 rcu_read_unlock(); 1059 1060 /* We have already checked bad blocks for reads. Now 1061 * need to check for writes. We never accept write errors 1062 * on the replacement, so we don't to check rrdev. 1063 */ 1064 while (op_is_write(op) && rdev && 1065 test_bit(WriteErrorSeen, &rdev->flags)) { 1066 sector_t first_bad; 1067 int bad_sectors; 1068 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 1069 &first_bad, &bad_sectors); 1070 if (!bad) 1071 break; 1072 1073 if (bad < 0) { 1074 set_bit(BlockedBadBlocks, &rdev->flags); 1075 if (!conf->mddev->external && 1076 conf->mddev->sb_flags) { 1077 /* It is very unlikely, but we might 1078 * still need to write out the 1079 * bad block log - better give it 1080 * a chance*/ 1081 md_check_recovery(conf->mddev); 1082 } 1083 /* 1084 * Because md_wait_for_blocked_rdev 1085 * will dec nr_pending, we must 1086 * increment it first. 1087 */ 1088 atomic_inc(&rdev->nr_pending); 1089 md_wait_for_blocked_rdev(rdev, conf->mddev); 1090 } else { 1091 /* Acknowledged bad block - skip the write */ 1092 rdev_dec_pending(rdev, conf->mddev); 1093 rdev = NULL; 1094 } 1095 } 1096 1097 if (rdev) { 1098 if (s->syncing || s->expanding || s->expanded 1099 || s->replacing) 1100 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1101 1102 set_bit(STRIPE_IO_STARTED, &sh->state); 1103 1104 bio_set_dev(bi, rdev->bdev); 1105 bio_set_op_attrs(bi, op, op_flags); 1106 bi->bi_end_io = op_is_write(op) 1107 ? 
raid5_end_write_request 1108 : raid5_end_read_request; 1109 bi->bi_private = sh; 1110 1111 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1112 __func__, (unsigned long long)sh->sector, 1113 bi->bi_opf, i); 1114 atomic_inc(&sh->count); 1115 if (sh != head_sh) 1116 atomic_inc(&head_sh->count); 1117 if (use_new_offset(conf, sh)) 1118 bi->bi_iter.bi_sector = (sh->sector 1119 + rdev->new_data_offset); 1120 else 1121 bi->bi_iter.bi_sector = (sh->sector 1122 + rdev->data_offset); 1123 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1124 bi->bi_opf |= REQ_NOMERGE; 1125 1126 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1127 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1128 1129 if (!op_is_write(op) && 1130 test_bit(R5_InJournal, &sh->dev[i].flags)) 1131 /* 1132 * issuing read for a page in journal, this 1133 * must be preparing for prexor in rmw; read 1134 * the data into orig_page 1135 */ 1136 sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1137 else 1138 sh->dev[i].vec.bv_page = sh->dev[i].page; 1139 bi->bi_vcnt = 1; 1140 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1141 bi->bi_io_vec[0].bv_offset = 0; 1142 bi->bi_iter.bi_size = STRIPE_SIZE; 1143 bi->bi_write_hint = sh->dev[i].write_hint; 1144 if (!rrdev) 1145 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; 1146 /* 1147 * If this is discard request, set bi_vcnt 0. We don't 1148 * want to confuse SCSI because SCSI will replace payload 1149 */ 1150 if (op == REQ_OP_DISCARD) 1151 bi->bi_vcnt = 0; 1152 if (rrdev) 1153 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1154 1155 if (conf->mddev->gendisk) 1156 trace_block_bio_remap(bi->bi_disk->queue, 1157 bi, disk_devt(conf->mddev->gendisk), 1158 sh->dev[i].sector); 1159 if (should_defer && op_is_write(op)) 1160 bio_list_add(&pending_bios, bi); 1161 else 1162 generic_make_request(bi); 1163 } 1164 if (rrdev) { 1165 if (s->syncing || s->expanding || s->expanded 1166 || s->replacing) 1167 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1168 1169 set_bit(STRIPE_IO_STARTED, &sh->state); 1170 1171 bio_set_dev(rbi, rrdev->bdev); 1172 bio_set_op_attrs(rbi, op, op_flags); 1173 BUG_ON(!op_is_write(op)); 1174 rbi->bi_end_io = raid5_end_write_request; 1175 rbi->bi_private = sh; 1176 1177 pr_debug("%s: for %llu schedule op %d on " 1178 "replacement disc %d\n", 1179 __func__, (unsigned long long)sh->sector, 1180 rbi->bi_opf, i); 1181 atomic_inc(&sh->count); 1182 if (sh != head_sh) 1183 atomic_inc(&head_sh->count); 1184 if (use_new_offset(conf, sh)) 1185 rbi->bi_iter.bi_sector = (sh->sector 1186 + rrdev->new_data_offset); 1187 else 1188 rbi->bi_iter.bi_sector = (sh->sector 1189 + rrdev->data_offset); 1190 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1191 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1192 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1193 rbi->bi_vcnt = 1; 1194 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1195 rbi->bi_io_vec[0].bv_offset = 0; 1196 rbi->bi_iter.bi_size = STRIPE_SIZE; 1197 rbi->bi_write_hint = sh->dev[i].write_hint; 1198 sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; 1199 /* 1200 * If this is discard request, set bi_vcnt 0. 
We don't 1201 * want to confuse SCSI because SCSI will replace payload 1202 */ 1203 if (op == REQ_OP_DISCARD) 1204 rbi->bi_vcnt = 0; 1205 if (conf->mddev->gendisk) 1206 trace_block_bio_remap(rbi->bi_disk->queue, 1207 rbi, disk_devt(conf->mddev->gendisk), 1208 sh->dev[i].sector); 1209 if (should_defer && op_is_write(op)) 1210 bio_list_add(&pending_bios, rbi); 1211 else 1212 generic_make_request(rbi); 1213 } 1214 if (!rdev && !rrdev) { 1215 if (op_is_write(op)) 1216 set_bit(STRIPE_DEGRADED, &sh->state); 1217 pr_debug("skip op %d on disc %d for sector %llu\n", 1218 bi->bi_opf, i, (unsigned long long)sh->sector); 1219 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1220 set_bit(STRIPE_HANDLE, &sh->state); 1221 } 1222 1223 if (!head_sh->batch_head) 1224 continue; 1225 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1226 batch_list); 1227 if (sh != head_sh) 1228 goto again; 1229 } 1230 1231 if (should_defer && !bio_list_empty(&pending_bios)) 1232 defer_issue_bios(conf, head_sh->sector, &pending_bios); 1233 } 1234 1235 static struct dma_async_tx_descriptor * 1236 async_copy_data(int frombio, struct bio *bio, struct page **page, 1237 sector_t sector, struct dma_async_tx_descriptor *tx, 1238 struct stripe_head *sh, int no_skipcopy) 1239 { 1240 struct bio_vec bvl; 1241 struct bvec_iter iter; 1242 struct page *bio_page; 1243 int page_offset; 1244 struct async_submit_ctl submit; 1245 enum async_tx_flags flags = 0; 1246 1247 if (bio->bi_iter.bi_sector >= sector) 1248 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1249 else 1250 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1251 1252 if (frombio) 1253 flags |= ASYNC_TX_FENCE; 1254 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1255 1256 bio_for_each_segment(bvl, bio, iter) { 1257 int len = bvl.bv_len; 1258 int clen; 1259 int b_offset = 0; 1260 1261 if (page_offset < 0) { 1262 b_offset = -page_offset; 1263 page_offset += b_offset; 1264 len -= b_offset; 1265 } 1266 1267 if (len > 0 && page_offset + len > STRIPE_SIZE) 1268 clen = STRIPE_SIZE - page_offset; 1269 else 1270 clen = len; 1271 1272 if (clen > 0) { 1273 b_offset += bvl.bv_offset; 1274 bio_page = bvl.bv_page; 1275 if (frombio) { 1276 if (sh->raid_conf->skip_copy && 1277 b_offset == 0 && page_offset == 0 && 1278 clen == STRIPE_SIZE && 1279 !no_skipcopy) 1280 *page = bio_page; 1281 else 1282 tx = async_memcpy(*page, bio_page, page_offset, 1283 b_offset, clen, &submit); 1284 } else 1285 tx = async_memcpy(bio_page, *page, b_offset, 1286 page_offset, clen, &submit); 1287 } 1288 /* chain the operations */ 1289 submit.depend_tx = tx; 1290 1291 if (clen < len) /* hit end of page */ 1292 break; 1293 page_offset += len; 1294 } 1295 1296 return tx; 1297 } 1298 1299 static void ops_complete_biofill(void *stripe_head_ref) 1300 { 1301 struct stripe_head *sh = stripe_head_ref; 1302 int i; 1303 1304 pr_debug("%s: stripe %llu\n", __func__, 1305 (unsigned long long)sh->sector); 1306 1307 /* clear completed biofills */ 1308 for (i = sh->disks; i--; ) { 1309 struct r5dev *dev = &sh->dev[i]; 1310 1311 /* acknowledge completion of a biofill operation */ 1312 /* and check if we need to reply to a read request, 1313 * new R5_Wantfill requests are held off until 1314 * !STRIPE_BIOFILL_RUN 1315 */ 1316 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1317 struct bio *rbi, *rbi2; 1318 1319 BUG_ON(!dev->read); 1320 rbi = dev->read; 1321 dev->read = NULL; 1322 while (rbi && rbi->bi_iter.bi_sector < 1323 dev->sector + STRIPE_SECTORS) { 1324 rbi2 = r5_next_bio(rbi, 
dev->sector); 1325 bio_endio(rbi); 1326 rbi = rbi2; 1327 } 1328 } 1329 } 1330 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1331 1332 set_bit(STRIPE_HANDLE, &sh->state); 1333 raid5_release_stripe(sh); 1334 } 1335 1336 static void ops_run_biofill(struct stripe_head *sh) 1337 { 1338 struct dma_async_tx_descriptor *tx = NULL; 1339 struct async_submit_ctl submit; 1340 int i; 1341 1342 BUG_ON(sh->batch_head); 1343 pr_debug("%s: stripe %llu\n", __func__, 1344 (unsigned long long)sh->sector); 1345 1346 for (i = sh->disks; i--; ) { 1347 struct r5dev *dev = &sh->dev[i]; 1348 if (test_bit(R5_Wantfill, &dev->flags)) { 1349 struct bio *rbi; 1350 spin_lock_irq(&sh->stripe_lock); 1351 dev->read = rbi = dev->toread; 1352 dev->toread = NULL; 1353 spin_unlock_irq(&sh->stripe_lock); 1354 while (rbi && rbi->bi_iter.bi_sector < 1355 dev->sector + STRIPE_SECTORS) { 1356 tx = async_copy_data(0, rbi, &dev->page, 1357 dev->sector, tx, sh, 0); 1358 rbi = r5_next_bio(rbi, dev->sector); 1359 } 1360 } 1361 } 1362 1363 atomic_inc(&sh->count); 1364 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1365 async_trigger_callback(&submit); 1366 } 1367 1368 static void mark_target_uptodate(struct stripe_head *sh, int target) 1369 { 1370 struct r5dev *tgt; 1371 1372 if (target < 0) 1373 return; 1374 1375 tgt = &sh->dev[target]; 1376 set_bit(R5_UPTODATE, &tgt->flags); 1377 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1378 clear_bit(R5_Wantcompute, &tgt->flags); 1379 } 1380 1381 static void ops_complete_compute(void *stripe_head_ref) 1382 { 1383 struct stripe_head *sh = stripe_head_ref; 1384 1385 pr_debug("%s: stripe %llu\n", __func__, 1386 (unsigned long long)sh->sector); 1387 1388 /* mark the computed target(s) as uptodate */ 1389 mark_target_uptodate(sh, sh->ops.target); 1390 mark_target_uptodate(sh, sh->ops.target2); 1391 1392 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1393 if (sh->check_state == check_state_compute_run) 1394 sh->check_state = check_state_compute_result; 1395 set_bit(STRIPE_HANDLE, &sh->state); 1396 raid5_release_stripe(sh); 1397 } 1398 1399 /* return a pointer to the address conversion region of the scribble buffer */ 1400 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1401 { 1402 return percpu->scribble + i * percpu->scribble_obj_size; 1403 } 1404 1405 /* return a pointer to the address conversion region of the scribble buffer */ 1406 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1407 struct raid5_percpu *percpu, int i) 1408 { 1409 return (void *) (to_addr_page(percpu, i) + sh->disks + 2); 1410 } 1411 1412 static struct dma_async_tx_descriptor * 1413 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1414 { 1415 int disks = sh->disks; 1416 struct page **xor_srcs = to_addr_page(percpu, 0); 1417 int target = sh->ops.target; 1418 struct r5dev *tgt = &sh->dev[target]; 1419 struct page *xor_dest = tgt->page; 1420 int count = 0; 1421 struct dma_async_tx_descriptor *tx; 1422 struct async_submit_ctl submit; 1423 int i; 1424 1425 BUG_ON(sh->batch_head); 1426 1427 pr_debug("%s: stripe %llu block: %d\n", 1428 __func__, (unsigned long long)sh->sector, target); 1429 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1430 1431 for (i = disks; i--; ) 1432 if (i != target) 1433 xor_srcs[count++] = sh->dev[i].page; 1434 1435 atomic_inc(&sh->count); 1436 1437 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1438 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1439 if (unlikely(count == 1)) 1440 tx = 
async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1441 else 1442 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1443 1444 return tx; 1445 } 1446 1447 /* set_syndrome_sources - populate source buffers for gen_syndrome 1448 * @srcs - (struct page *) array of size sh->disks 1449 * @sh - stripe_head to parse 1450 * 1451 * Populates srcs in proper layout order for the stripe and returns the 1452 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1453 * destination buffer is recorded in srcs[count] and the Q destination 1454 * is recorded in srcs[count+1]]. 1455 */ 1456 static int set_syndrome_sources(struct page **srcs, 1457 struct stripe_head *sh, 1458 int srctype) 1459 { 1460 int disks = sh->disks; 1461 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 1462 int d0_idx = raid6_d0(sh); 1463 int count; 1464 int i; 1465 1466 for (i = 0; i < disks; i++) 1467 srcs[i] = NULL; 1468 1469 count = 0; 1470 i = d0_idx; 1471 do { 1472 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1473 struct r5dev *dev = &sh->dev[i]; 1474 1475 if (i == sh->qd_idx || i == sh->pd_idx || 1476 (srctype == SYNDROME_SRC_ALL) || 1477 (srctype == SYNDROME_SRC_WANT_DRAIN && 1478 (test_bit(R5_Wantdrain, &dev->flags) || 1479 test_bit(R5_InJournal, &dev->flags))) || 1480 (srctype == SYNDROME_SRC_WRITTEN && 1481 (dev->written || 1482 test_bit(R5_InJournal, &dev->flags)))) { 1483 if (test_bit(R5_InJournal, &dev->flags)) 1484 srcs[slot] = sh->dev[i].orig_page; 1485 else 1486 srcs[slot] = sh->dev[i].page; 1487 } 1488 i = raid6_next_disk(i, disks); 1489 } while (i != d0_idx); 1490 1491 return syndrome_disks; 1492 } 1493 1494 static struct dma_async_tx_descriptor * 1495 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1496 { 1497 int disks = sh->disks; 1498 struct page **blocks = to_addr_page(percpu, 0); 1499 int target; 1500 int qd_idx = sh->qd_idx; 1501 struct dma_async_tx_descriptor *tx; 1502 struct async_submit_ctl submit; 1503 struct r5dev *tgt; 1504 struct page *dest; 1505 int i; 1506 int count; 1507 1508 BUG_ON(sh->batch_head); 1509 if (sh->ops.target < 0) 1510 target = sh->ops.target2; 1511 else if (sh->ops.target2 < 0) 1512 target = sh->ops.target; 1513 else 1514 /* we should only have one valid target */ 1515 BUG(); 1516 BUG_ON(target < 0); 1517 pr_debug("%s: stripe %llu block: %d\n", 1518 __func__, (unsigned long long)sh->sector, target); 1519 1520 tgt = &sh->dev[target]; 1521 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1522 dest = tgt->page; 1523 1524 atomic_inc(&sh->count); 1525 1526 if (target == qd_idx) { 1527 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1528 blocks[count] = NULL; /* regenerating p is not necessary */ 1529 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1530 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1531 ops_complete_compute, sh, 1532 to_addr_conv(sh, percpu, 0)); 1533 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1534 } else { 1535 /* Compute any data- or p-drive using XOR */ 1536 count = 0; 1537 for (i = disks; i-- ; ) { 1538 if (i == target || i == qd_idx) 1539 continue; 1540 blocks[count++] = sh->dev[i].page; 1541 } 1542 1543 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1544 NULL, ops_complete_compute, sh, 1545 to_addr_conv(sh, percpu, 0)); 1546 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1547 } 1548 1549 return tx; 1550 } 1551 1552 static struct dma_async_tx_descriptor * 1553 ops_run_compute6_2(struct 
stripe_head *sh, struct raid5_percpu *percpu) 1554 { 1555 int i, count, disks = sh->disks; 1556 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 1557 int d0_idx = raid6_d0(sh); 1558 int faila = -1, failb = -1; 1559 int target = sh->ops.target; 1560 int target2 = sh->ops.target2; 1561 struct r5dev *tgt = &sh->dev[target]; 1562 struct r5dev *tgt2 = &sh->dev[target2]; 1563 struct dma_async_tx_descriptor *tx; 1564 struct page **blocks = to_addr_page(percpu, 0); 1565 struct async_submit_ctl submit; 1566 1567 BUG_ON(sh->batch_head); 1568 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1569 __func__, (unsigned long long)sh->sector, target, target2); 1570 BUG_ON(target < 0 || target2 < 0); 1571 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1572 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1573 1574 /* we need to open-code set_syndrome_sources to handle the 1575 * slot number conversion for 'faila' and 'failb' 1576 */ 1577 for (i = 0; i < disks ; i++) 1578 blocks[i] = NULL; 1579 count = 0; 1580 i = d0_idx; 1581 do { 1582 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1583 1584 blocks[slot] = sh->dev[i].page; 1585 1586 if (i == target) 1587 faila = slot; 1588 if (i == target2) 1589 failb = slot; 1590 i = raid6_next_disk(i, disks); 1591 } while (i != d0_idx); 1592 1593 BUG_ON(faila == failb); 1594 if (failb < faila) 1595 swap(faila, failb); 1596 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1597 __func__, (unsigned long long)sh->sector, faila, failb); 1598 1599 atomic_inc(&sh->count); 1600 1601 if (failb == syndrome_disks+1) { 1602 /* Q disk is one of the missing disks */ 1603 if (faila == syndrome_disks) { 1604 /* Missing P+Q, just recompute */ 1605 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1606 ops_complete_compute, sh, 1607 to_addr_conv(sh, percpu, 0)); 1608 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1609 STRIPE_SIZE, &submit); 1610 } else { 1611 struct page *dest; 1612 int data_target; 1613 int qd_idx = sh->qd_idx; 1614 1615 /* Missing D+Q: recompute D from P, then recompute Q */ 1616 if (target == qd_idx) 1617 data_target = target2; 1618 else 1619 data_target = target; 1620 1621 count = 0; 1622 for (i = disks; i-- ; ) { 1623 if (i == data_target || i == qd_idx) 1624 continue; 1625 blocks[count++] = sh->dev[i].page; 1626 } 1627 dest = sh->dev[data_target].page; 1628 init_async_submit(&submit, 1629 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1630 NULL, NULL, NULL, 1631 to_addr_conv(sh, percpu, 0)); 1632 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1633 &submit); 1634 1635 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1636 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1637 ops_complete_compute, sh, 1638 to_addr_conv(sh, percpu, 0)); 1639 return async_gen_syndrome(blocks, 0, count+2, 1640 STRIPE_SIZE, &submit); 1641 } 1642 } else { 1643 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1644 ops_complete_compute, sh, 1645 to_addr_conv(sh, percpu, 0)); 1646 if (failb == syndrome_disks) { 1647 /* We're missing D+P. */ 1648 return async_raid6_datap_recov(syndrome_disks+2, 1649 STRIPE_SIZE, faila, 1650 blocks, &submit); 1651 } else { 1652 /* We're missing D+D. 
*/ 1653 return async_raid6_2data_recov(syndrome_disks+2, 1654 STRIPE_SIZE, faila, failb, 1655 blocks, &submit); 1656 } 1657 } 1658 } 1659 1660 static void ops_complete_prexor(void *stripe_head_ref) 1661 { 1662 struct stripe_head *sh = stripe_head_ref; 1663 1664 pr_debug("%s: stripe %llu\n", __func__, 1665 (unsigned long long)sh->sector); 1666 1667 if (r5c_is_writeback(sh->raid_conf->log)) 1668 /* 1669 * raid5-cache write back uses orig_page during prexor. 1670 * After prexor, it is time to free orig_page 1671 */ 1672 r5c_release_extra_page(sh); 1673 } 1674 1675 static struct dma_async_tx_descriptor * 1676 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1677 struct dma_async_tx_descriptor *tx) 1678 { 1679 int disks = sh->disks; 1680 struct page **xor_srcs = to_addr_page(percpu, 0); 1681 int count = 0, pd_idx = sh->pd_idx, i; 1682 struct async_submit_ctl submit; 1683 1684 /* existing parity data subtracted */ 1685 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1686 1687 BUG_ON(sh->batch_head); 1688 pr_debug("%s: stripe %llu\n", __func__, 1689 (unsigned long long)sh->sector); 1690 1691 for (i = disks; i--; ) { 1692 struct r5dev *dev = &sh->dev[i]; 1693 /* Only process blocks that are known to be uptodate */ 1694 if (test_bit(R5_InJournal, &dev->flags)) 1695 xor_srcs[count++] = dev->orig_page; 1696 else if (test_bit(R5_Wantdrain, &dev->flags)) 1697 xor_srcs[count++] = dev->page; 1698 } 1699 1700 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1701 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1702 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1703 1704 return tx; 1705 } 1706 1707 static struct dma_async_tx_descriptor * 1708 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1709 struct dma_async_tx_descriptor *tx) 1710 { 1711 struct page **blocks = to_addr_page(percpu, 0); 1712 int count; 1713 struct async_submit_ctl submit; 1714 1715 pr_debug("%s: stripe %llu\n", __func__, 1716 (unsigned long long)sh->sector); 1717 1718 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1719 1720 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1721 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1722 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1723 1724 return tx; 1725 } 1726 1727 static struct dma_async_tx_descriptor * 1728 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1729 { 1730 struct r5conf *conf = sh->raid_conf; 1731 int disks = sh->disks; 1732 int i; 1733 struct stripe_head *head_sh = sh; 1734 1735 pr_debug("%s: stripe %llu\n", __func__, 1736 (unsigned long long)sh->sector); 1737 1738 for (i = disks; i--; ) { 1739 struct r5dev *dev; 1740 struct bio *chosen; 1741 1742 sh = head_sh; 1743 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1744 struct bio *wbi; 1745 1746 again: 1747 dev = &sh->dev[i]; 1748 /* 1749 * clear R5_InJournal, so when rewriting a page in 1750 * journal, it is not skipped by r5l_log_stripe() 1751 */ 1752 clear_bit(R5_InJournal, &dev->flags); 1753 spin_lock_irq(&sh->stripe_lock); 1754 chosen = dev->towrite; 1755 dev->towrite = NULL; 1756 sh->overwrite_disks = 0; 1757 BUG_ON(dev->written); 1758 wbi = dev->written = chosen; 1759 spin_unlock_irq(&sh->stripe_lock); 1760 WARN_ON(dev->page != dev->orig_page); 1761 1762 while (wbi && wbi->bi_iter.bi_sector < 1763 dev->sector + STRIPE_SECTORS) { 1764 if (wbi->bi_opf & REQ_FUA) 1765 set_bit(R5_WantFUA, &dev->flags); 1766 if 
(wbi->bi_opf & REQ_SYNC) 1767 set_bit(R5_SyncIO, &dev->flags); 1768 if (bio_op(wbi) == REQ_OP_DISCARD) 1769 set_bit(R5_Discard, &dev->flags); 1770 else { 1771 tx = async_copy_data(1, wbi, &dev->page, 1772 dev->sector, tx, sh, 1773 r5c_is_writeback(conf->log)); 1774 if (dev->page != dev->orig_page && 1775 !r5c_is_writeback(conf->log)) { 1776 set_bit(R5_SkipCopy, &dev->flags); 1777 clear_bit(R5_UPTODATE, &dev->flags); 1778 clear_bit(R5_OVERWRITE, &dev->flags); 1779 } 1780 } 1781 wbi = r5_next_bio(wbi, dev->sector); 1782 } 1783 1784 if (head_sh->batch_head) { 1785 sh = list_first_entry(&sh->batch_list, 1786 struct stripe_head, 1787 batch_list); 1788 if (sh == head_sh) 1789 continue; 1790 goto again; 1791 } 1792 } 1793 } 1794 1795 return tx; 1796 } 1797 1798 static void ops_complete_reconstruct(void *stripe_head_ref) 1799 { 1800 struct stripe_head *sh = stripe_head_ref; 1801 int disks = sh->disks; 1802 int pd_idx = sh->pd_idx; 1803 int qd_idx = sh->qd_idx; 1804 int i; 1805 bool fua = false, sync = false, discard = false; 1806 1807 pr_debug("%s: stripe %llu\n", __func__, 1808 (unsigned long long)sh->sector); 1809 1810 for (i = disks; i--; ) { 1811 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1812 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1813 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1814 } 1815 1816 for (i = disks; i--; ) { 1817 struct r5dev *dev = &sh->dev[i]; 1818 1819 if (dev->written || i == pd_idx || i == qd_idx) { 1820 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { 1821 set_bit(R5_UPTODATE, &dev->flags); 1822 if (test_bit(STRIPE_EXPAND_READY, &sh->state)) 1823 set_bit(R5_Expanded, &dev->flags); 1824 } 1825 if (fua) 1826 set_bit(R5_WantFUA, &dev->flags); 1827 if (sync) 1828 set_bit(R5_SyncIO, &dev->flags); 1829 } 1830 } 1831 1832 if (sh->reconstruct_state == reconstruct_state_drain_run) 1833 sh->reconstruct_state = reconstruct_state_drain_result; 1834 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1835 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1836 else { 1837 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1838 sh->reconstruct_state = reconstruct_state_result; 1839 } 1840 1841 set_bit(STRIPE_HANDLE, &sh->state); 1842 raid5_release_stripe(sh); 1843 } 1844 1845 static void 1846 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1847 struct dma_async_tx_descriptor *tx) 1848 { 1849 int disks = sh->disks; 1850 struct page **xor_srcs; 1851 struct async_submit_ctl submit; 1852 int count, pd_idx = sh->pd_idx, i; 1853 struct page *xor_dest; 1854 int prexor = 0; 1855 unsigned long flags; 1856 int j = 0; 1857 struct stripe_head *head_sh = sh; 1858 int last_stripe; 1859 1860 pr_debug("%s: stripe %llu\n", __func__, 1861 (unsigned long long)sh->sector); 1862 1863 for (i = 0; i < sh->disks; i++) { 1864 if (pd_idx == i) 1865 continue; 1866 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1867 break; 1868 } 1869 if (i >= sh->disks) { 1870 atomic_inc(&sh->count); 1871 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1872 ops_complete_reconstruct(sh); 1873 return; 1874 } 1875 again: 1876 count = 0; 1877 xor_srcs = to_addr_page(percpu, j); 1878 /* check if prexor is active which means only process blocks 1879 * that are part of a read-modify-write (written) 1880 */ 1881 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1882 prexor = 1; 1883 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1884 for (i = disks; i--; ) { 1885 struct r5dev *dev = &sh->dev[i]; 1886 if (head_sh->dev[i].written 
|| 1887 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1888 xor_srcs[count++] = dev->page; 1889 } 1890 } else { 1891 xor_dest = sh->dev[pd_idx].page; 1892 for (i = disks; i--; ) { 1893 struct r5dev *dev = &sh->dev[i]; 1894 if (i != pd_idx) 1895 xor_srcs[count++] = dev->page; 1896 } 1897 } 1898 1899 /* 1/ if we prexor'd then the dest is reused as a source 1900 * 2/ if we did not prexor then we are redoing the parity 1901 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1902 * for the synchronous xor case 1903 */ 1904 last_stripe = !head_sh->batch_head || 1905 list_first_entry(&sh->batch_list, 1906 struct stripe_head, batch_list) == head_sh; 1907 if (last_stripe) { 1908 flags = ASYNC_TX_ACK | 1909 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1910 1911 atomic_inc(&head_sh->count); 1912 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1913 to_addr_conv(sh, percpu, j)); 1914 } else { 1915 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1916 init_async_submit(&submit, flags, tx, NULL, NULL, 1917 to_addr_conv(sh, percpu, j)); 1918 } 1919 1920 if (unlikely(count == 1)) 1921 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1922 else 1923 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1924 if (!last_stripe) { 1925 j++; 1926 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1927 batch_list); 1928 goto again; 1929 } 1930 } 1931 1932 static void 1933 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1934 struct dma_async_tx_descriptor *tx) 1935 { 1936 struct async_submit_ctl submit; 1937 struct page **blocks; 1938 int count, i, j = 0; 1939 struct stripe_head *head_sh = sh; 1940 int last_stripe; 1941 int synflags; 1942 unsigned long txflags; 1943 1944 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1945 1946 for (i = 0; i < sh->disks; i++) { 1947 if (sh->pd_idx == i || sh->qd_idx == i) 1948 continue; 1949 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1950 break; 1951 } 1952 if (i >= sh->disks) { 1953 atomic_inc(&sh->count); 1954 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1955 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1956 ops_complete_reconstruct(sh); 1957 return; 1958 } 1959 1960 again: 1961 blocks = to_addr_page(percpu, j); 1962 1963 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1964 synflags = SYNDROME_SRC_WRITTEN; 1965 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1966 } else { 1967 synflags = SYNDROME_SRC_ALL; 1968 txflags = ASYNC_TX_ACK; 1969 } 1970 1971 count = set_syndrome_sources(blocks, sh, synflags); 1972 last_stripe = !head_sh->batch_head || 1973 list_first_entry(&sh->batch_list, 1974 struct stripe_head, batch_list) == head_sh; 1975 1976 if (last_stripe) { 1977 atomic_inc(&head_sh->count); 1978 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1979 head_sh, to_addr_conv(sh, percpu, j)); 1980 } else 1981 init_async_submit(&submit, 0, tx, NULL, NULL, 1982 to_addr_conv(sh, percpu, j)); 1983 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1984 if (!last_stripe) { 1985 j++; 1986 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1987 batch_list); 1988 goto again; 1989 } 1990 } 1991 1992 static void ops_complete_check(void *stripe_head_ref) 1993 { 1994 struct stripe_head *sh = stripe_head_ref; 1995 1996 pr_debug("%s: stripe %llu\n", __func__, 1997 (unsigned long long)sh->sector); 1998 1999 sh->check_state = check_state_check_result; 2000 set_bit(STRIPE_HANDLE, 
&sh->state); 2001 raid5_release_stripe(sh); 2002 } 2003 2004 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 2005 { 2006 int disks = sh->disks; 2007 int pd_idx = sh->pd_idx; 2008 int qd_idx = sh->qd_idx; 2009 struct page *xor_dest; 2010 struct page **xor_srcs = to_addr_page(percpu, 0); 2011 struct dma_async_tx_descriptor *tx; 2012 struct async_submit_ctl submit; 2013 int count; 2014 int i; 2015 2016 pr_debug("%s: stripe %llu\n", __func__, 2017 (unsigned long long)sh->sector); 2018 2019 BUG_ON(sh->batch_head); 2020 count = 0; 2021 xor_dest = sh->dev[pd_idx].page; 2022 xor_srcs[count++] = xor_dest; 2023 for (i = disks; i--; ) { 2024 if (i == pd_idx || i == qd_idx) 2025 continue; 2026 xor_srcs[count++] = sh->dev[i].page; 2027 } 2028 2029 init_async_submit(&submit, 0, NULL, NULL, NULL, 2030 to_addr_conv(sh, percpu, 0)); 2031 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 2032 &sh->ops.zero_sum_result, &submit); 2033 2034 atomic_inc(&sh->count); 2035 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 2036 tx = async_trigger_callback(&submit); 2037 } 2038 2039 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 2040 { 2041 struct page **srcs = to_addr_page(percpu, 0); 2042 struct async_submit_ctl submit; 2043 int count; 2044 2045 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 2046 (unsigned long long)sh->sector, checkp); 2047 2048 BUG_ON(sh->batch_head); 2049 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 2050 if (!checkp) 2051 srcs[count] = NULL; 2052 2053 atomic_inc(&sh->count); 2054 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 2055 sh, to_addr_conv(sh, percpu, 0)); 2056 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 2057 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 2058 } 2059 2060 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 2061 { 2062 int overlap_clear = 0, i, disks = sh->disks; 2063 struct dma_async_tx_descriptor *tx = NULL; 2064 struct r5conf *conf = sh->raid_conf; 2065 int level = conf->level; 2066 struct raid5_percpu *percpu; 2067 unsigned long cpu; 2068 2069 cpu = get_cpu(); 2070 percpu = per_cpu_ptr(conf->percpu, cpu); 2071 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 2072 ops_run_biofill(sh); 2073 overlap_clear++; 2074 } 2075 2076 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 2077 if (level < 6) 2078 tx = ops_run_compute5(sh, percpu); 2079 else { 2080 if (sh->ops.target2 < 0 || sh->ops.target < 0) 2081 tx = ops_run_compute6_1(sh, percpu); 2082 else 2083 tx = ops_run_compute6_2(sh, percpu); 2084 } 2085 /* terminate the chain if reconstruct is not set to be run */ 2086 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 2087 async_tx_ack(tx); 2088 } 2089 2090 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 2091 if (level < 6) 2092 tx = ops_run_prexor5(sh, percpu, tx); 2093 else 2094 tx = ops_run_prexor6(sh, percpu, tx); 2095 } 2096 2097 if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) 2098 tx = ops_run_partial_parity(sh, percpu, tx); 2099 2100 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 2101 tx = ops_run_biodrain(sh, tx); 2102 overlap_clear++; 2103 } 2104 2105 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 2106 if (level < 6) 2107 ops_run_reconstruct5(sh, percpu, tx); 2108 else 2109 ops_run_reconstruct6(sh, percpu, tx); 2110 } 2111 2112 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 2113 if (sh->check_state == check_state_run) 2114 ops_run_check_p(sh, 
percpu); 2115 else if (sh->check_state == check_state_run_q) 2116 ops_run_check_pq(sh, percpu, 0); 2117 else if (sh->check_state == check_state_run_pq) 2118 ops_run_check_pq(sh, percpu, 1); 2119 else 2120 BUG(); 2121 } 2122 2123 if (overlap_clear && !sh->batch_head) 2124 for (i = disks; i--; ) { 2125 struct r5dev *dev = &sh->dev[i]; 2126 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2127 wake_up(&sh->raid_conf->wait_for_overlap); 2128 } 2129 put_cpu(); 2130 } 2131 2132 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2133 { 2134 if (sh->ppl_page) 2135 __free_page(sh->ppl_page); 2136 kmem_cache_free(sc, sh); 2137 } 2138 2139 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2140 int disks, struct r5conf *conf) 2141 { 2142 struct stripe_head *sh; 2143 int i; 2144 2145 sh = kmem_cache_zalloc(sc, gfp); 2146 if (sh) { 2147 spin_lock_init(&sh->stripe_lock); 2148 spin_lock_init(&sh->batch_lock); 2149 INIT_LIST_HEAD(&sh->batch_list); 2150 INIT_LIST_HEAD(&sh->lru); 2151 INIT_LIST_HEAD(&sh->r5c); 2152 INIT_LIST_HEAD(&sh->log_list); 2153 atomic_set(&sh->count, 1); 2154 sh->raid_conf = conf; 2155 sh->log_start = MaxSector; 2156 for (i = 0; i < disks; i++) { 2157 struct r5dev *dev = &sh->dev[i]; 2158 2159 bio_init(&dev->req, &dev->vec, 1); 2160 bio_init(&dev->rreq, &dev->rvec, 1); 2161 } 2162 2163 if (raid5_has_ppl(conf)) { 2164 sh->ppl_page = alloc_page(gfp); 2165 if (!sh->ppl_page) { 2166 free_stripe(sc, sh); 2167 sh = NULL; 2168 } 2169 } 2170 } 2171 return sh; 2172 } 2173 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2174 { 2175 struct stripe_head *sh; 2176 2177 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); 2178 if (!sh) 2179 return 0; 2180 2181 if (grow_buffers(sh, gfp)) { 2182 shrink_buffers(sh); 2183 free_stripe(conf->slab_cache, sh); 2184 return 0; 2185 } 2186 sh->hash_lock_index = 2187 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2188 /* we just created an active stripe so... */ 2189 atomic_inc(&conf->active_stripes); 2190 2191 raid5_release_stripe(sh); 2192 conf->max_nr_stripes++; 2193 return 1; 2194 } 2195 2196 static int grow_stripes(struct r5conf *conf, int num) 2197 { 2198 struct kmem_cache *sc; 2199 size_t namelen = sizeof(conf->cache_name[0]); 2200 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2201 2202 if (conf->mddev->gendisk) 2203 snprintf(conf->cache_name[0], namelen, 2204 "raid%d-%s", conf->level, mdname(conf->mddev)); 2205 else 2206 snprintf(conf->cache_name[0], namelen, 2207 "raid%d-%p", conf->level, conf->mddev); 2208 snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); 2209 2210 conf->active_name = 0; 2211 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2212 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2213 0, 0, NULL); 2214 if (!sc) 2215 return 1; 2216 conf->slab_cache = sc; 2217 conf->pool_size = devs; 2218 while (num--) 2219 if (!grow_one_stripe(conf, GFP_KERNEL)) 2220 return 1; 2221 2222 return 0; 2223 } 2224 2225 /** 2226 * scribble_len - return the required size of the scribble region 2227 * @num - total number of disks in the array 2228 * 2229 * The size must be enough to contain: 2230 * 1/ a struct page pointer for each device in the array +2 2231 * 2/ room to convert each entry in (1) to its corresponding dma 2232 * (dma_map_page()) or page (page_address()) address. 
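 *
 * For example (illustrative numbers only): with num == 6 devices, each of
 * the cnt scribble objects allocated below holds 6 + 2 = 8 page pointers
 * plus 8 matching addr_conv_t slots.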
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static int scribble_alloc(struct raid5_percpu *percpu,
			  int num, int cnt, gfp_t flags)
{
	size_t obj_size =
		sizeof(struct page *) * (num+2) +
		sizeof(addr_conv_t) * (num+2);
	void *scribble;

	scribble = kvmalloc_array(cnt, obj_size, flags);
	if (!scribble)
		return -ENOMEM;

	kvfree(percpu->scribble);

	percpu->scribble = scribble;
	percpu->scribble_obj_size = obj_size;
	return 0;
}

static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
{
	unsigned long cpu;
	int err = 0;

	/*
	 * Never shrink. And mddev_suspend() could deadlock if this is called
	 * from raid5d. In that case, scribble_disks and scribble_sectors
	 * should be equal to new_disks and new_sectors.
	 */
	if (conf->scribble_disks >= new_disks &&
	    conf->scribble_sectors >= new_sectors)
		return 0;
	mddev_suspend(conf->mddev);
	get_online_cpus();

	for_each_present_cpu(cpu) {
		struct raid5_percpu *percpu;

		percpu = per_cpu_ptr(conf->percpu, cpu);
		err = scribble_alloc(percpu, new_disks,
				     new_sectors / STRIPE_SECTORS,
				     GFP_NOIO);
		if (err)
			break;
	}

	put_online_cpus();
	mddev_resume(conf->mddev);
	if (!err) {
		conf->scribble_disks = new_disks;
		conf->scribble_sectors = new_sectors;
	}
	return err;
}

static int resize_stripes(struct r5conf *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step 2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
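	 *
	 * (GFP_NOIO matters here: an allocation that was allowed to start
	 * I/O for reclaim could end up waiting on a write to this very
	 * array, which cannot make progress while we hold every stripe_head.)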
2317 */ 2318 struct stripe_head *osh, *nsh; 2319 LIST_HEAD(newstripes); 2320 struct disk_info *ndisks; 2321 int err = 0; 2322 struct kmem_cache *sc; 2323 int i; 2324 int hash, cnt; 2325 2326 md_allow_write(conf->mddev); 2327 2328 /* Step 1 */ 2329 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2330 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2331 0, 0, NULL); 2332 if (!sc) 2333 return -ENOMEM; 2334 2335 /* Need to ensure auto-resizing doesn't interfere */ 2336 mutex_lock(&conf->cache_size_mutex); 2337 2338 for (i = conf->max_nr_stripes; i; i--) { 2339 nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); 2340 if (!nsh) 2341 break; 2342 2343 list_add(&nsh->lru, &newstripes); 2344 } 2345 if (i) { 2346 /* didn't get enough, give up */ 2347 while (!list_empty(&newstripes)) { 2348 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2349 list_del(&nsh->lru); 2350 free_stripe(sc, nsh); 2351 } 2352 kmem_cache_destroy(sc); 2353 mutex_unlock(&conf->cache_size_mutex); 2354 return -ENOMEM; 2355 } 2356 /* Step 2 - Must use GFP_NOIO now. 2357 * OK, we have enough stripes, start collecting inactive 2358 * stripes and copying them over 2359 */ 2360 hash = 0; 2361 cnt = 0; 2362 list_for_each_entry(nsh, &newstripes, lru) { 2363 lock_device_hash_lock(conf, hash); 2364 wait_event_cmd(conf->wait_for_stripe, 2365 !list_empty(conf->inactive_list + hash), 2366 unlock_device_hash_lock(conf, hash), 2367 lock_device_hash_lock(conf, hash)); 2368 osh = get_free_stripe(conf, hash); 2369 unlock_device_hash_lock(conf, hash); 2370 2371 for(i=0; i<conf->pool_size; i++) { 2372 nsh->dev[i].page = osh->dev[i].page; 2373 nsh->dev[i].orig_page = osh->dev[i].page; 2374 } 2375 nsh->hash_lock_index = hash; 2376 free_stripe(conf->slab_cache, osh); 2377 cnt++; 2378 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2379 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2380 hash++; 2381 cnt = 0; 2382 } 2383 } 2384 kmem_cache_destroy(conf->slab_cache); 2385 2386 /* Step 3. 
2387 * At this point, we are holding all the stripes so the array 2388 * is completely stalled, so now is a good time to resize 2389 * conf->disks and the scribble region 2390 */ 2391 ndisks = kcalloc(newsize, sizeof(struct disk_info), GFP_NOIO); 2392 if (ndisks) { 2393 for (i = 0; i < conf->pool_size; i++) 2394 ndisks[i] = conf->disks[i]; 2395 2396 for (i = conf->pool_size; i < newsize; i++) { 2397 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2398 if (!ndisks[i].extra_page) 2399 err = -ENOMEM; 2400 } 2401 2402 if (err) { 2403 for (i = conf->pool_size; i < newsize; i++) 2404 if (ndisks[i].extra_page) 2405 put_page(ndisks[i].extra_page); 2406 kfree(ndisks); 2407 } else { 2408 kfree(conf->disks); 2409 conf->disks = ndisks; 2410 } 2411 } else 2412 err = -ENOMEM; 2413 2414 mutex_unlock(&conf->cache_size_mutex); 2415 2416 conf->slab_cache = sc; 2417 conf->active_name = 1-conf->active_name; 2418 2419 /* Step 4, return new stripes to service */ 2420 while(!list_empty(&newstripes)) { 2421 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2422 list_del_init(&nsh->lru); 2423 2424 for (i=conf->raid_disks; i < newsize; i++) 2425 if (nsh->dev[i].page == NULL) { 2426 struct page *p = alloc_page(GFP_NOIO); 2427 nsh->dev[i].page = p; 2428 nsh->dev[i].orig_page = p; 2429 if (!p) 2430 err = -ENOMEM; 2431 } 2432 raid5_release_stripe(nsh); 2433 } 2434 /* critical section pass, GFP_NOIO no longer needed */ 2435 2436 if (!err) 2437 conf->pool_size = newsize; 2438 return err; 2439 } 2440 2441 static int drop_one_stripe(struct r5conf *conf) 2442 { 2443 struct stripe_head *sh; 2444 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2445 2446 spin_lock_irq(conf->hash_locks + hash); 2447 sh = get_free_stripe(conf, hash); 2448 spin_unlock_irq(conf->hash_locks + hash); 2449 if (!sh) 2450 return 0; 2451 BUG_ON(atomic_read(&sh->count)); 2452 shrink_buffers(sh); 2453 free_stripe(conf->slab_cache, sh); 2454 atomic_dec(&conf->active_stripes); 2455 conf->max_nr_stripes--; 2456 return 1; 2457 } 2458 2459 static void shrink_stripes(struct r5conf *conf) 2460 { 2461 while (conf->max_nr_stripes && 2462 drop_one_stripe(conf)) 2463 ; 2464 2465 kmem_cache_destroy(conf->slab_cache); 2466 conf->slab_cache = NULL; 2467 } 2468 2469 static void raid5_end_read_request(struct bio * bi) 2470 { 2471 struct stripe_head *sh = bi->bi_private; 2472 struct r5conf *conf = sh->raid_conf; 2473 int disks = sh->disks, i; 2474 char b[BDEVNAME_SIZE]; 2475 struct md_rdev *rdev = NULL; 2476 sector_t s; 2477 2478 for (i=0 ; i<disks; i++) 2479 if (bi == &sh->dev[i].req) 2480 break; 2481 2482 pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2483 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2484 bi->bi_status); 2485 if (i == disks) { 2486 bio_reset(bi); 2487 BUG(); 2488 return; 2489 } 2490 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2491 /* If replacement finished while this request was outstanding, 2492 * 'replacement' might be NULL already. 2493 * In that case it moved down to 'rdev'. 2494 * rdev is not removed until all requests are finished. 2495 */ 2496 rdev = conf->disks[i].replacement; 2497 if (!rdev) 2498 rdev = conf->disks[i].rdev; 2499 2500 if (use_new_offset(conf, sh)) 2501 s = sh->sector + rdev->new_data_offset; 2502 else 2503 s = sh->sector + rdev->data_offset; 2504 if (!bi->bi_status) { 2505 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2506 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2507 /* Note that this cannot happen on a 2508 * replacement device. 
We just fail those on 2509 * any error 2510 */ 2511 pr_info_ratelimited( 2512 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2513 mdname(conf->mddev), STRIPE_SECTORS, 2514 (unsigned long long)s, 2515 bdevname(rdev->bdev, b)); 2516 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2517 clear_bit(R5_ReadError, &sh->dev[i].flags); 2518 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2519 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2520 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2521 2522 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2523 /* 2524 * end read for a page in journal, this 2525 * must be preparing for prexor in rmw 2526 */ 2527 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2528 2529 if (atomic_read(&rdev->read_errors)) 2530 atomic_set(&rdev->read_errors, 0); 2531 } else { 2532 const char *bdn = bdevname(rdev->bdev, b); 2533 int retry = 0; 2534 int set_bad = 0; 2535 2536 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2537 atomic_inc(&rdev->read_errors); 2538 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2539 pr_warn_ratelimited( 2540 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2541 mdname(conf->mddev), 2542 (unsigned long long)s, 2543 bdn); 2544 else if (conf->mddev->degraded >= conf->max_degraded) { 2545 set_bad = 1; 2546 pr_warn_ratelimited( 2547 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2548 mdname(conf->mddev), 2549 (unsigned long long)s, 2550 bdn); 2551 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2552 /* Oh, no!!! */ 2553 set_bad = 1; 2554 pr_warn_ratelimited( 2555 "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", 2556 mdname(conf->mddev), 2557 (unsigned long long)s, 2558 bdn); 2559 } else if (atomic_read(&rdev->read_errors) 2560 > conf->max_nr_stripes) 2561 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2562 mdname(conf->mddev), bdn); 2563 else 2564 retry = 1; 2565 if (set_bad && test_bit(In_sync, &rdev->flags) 2566 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2567 retry = 1; 2568 if (retry) 2569 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2570 set_bit(R5_ReadError, &sh->dev[i].flags); 2571 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2572 } else 2573 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2574 else { 2575 clear_bit(R5_ReadError, &sh->dev[i].flags); 2576 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2577 if (!(set_bad 2578 && test_bit(In_sync, &rdev->flags) 2579 && rdev_set_badblocks( 2580 rdev, sh->sector, STRIPE_SECTORS, 0))) 2581 md_error(conf->mddev, rdev); 2582 } 2583 } 2584 rdev_dec_pending(rdev, conf->mddev); 2585 bio_reset(bi); 2586 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2587 set_bit(STRIPE_HANDLE, &sh->state); 2588 raid5_release_stripe(sh); 2589 } 2590 2591 static void raid5_end_write_request(struct bio *bi) 2592 { 2593 struct stripe_head *sh = bi->bi_private; 2594 struct r5conf *conf = sh->raid_conf; 2595 int disks = sh->disks, i; 2596 struct md_rdev *uninitialized_var(rdev); 2597 sector_t first_bad; 2598 int bad_sectors; 2599 int replacement = 0; 2600 2601 for (i = 0 ; i < disks; i++) { 2602 if (bi == &sh->dev[i].req) { 2603 rdev = conf->disks[i].rdev; 2604 break; 2605 } 2606 if (bi == &sh->dev[i].rreq) { 2607 rdev = conf->disks[i].replacement; 2608 if (rdev) 2609 replacement = 1; 2610 else 2611 /* rdev was removed and 'replacement' 2612 * replaced it. rdev is not removed 2613 * until all requests are finished. 
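				 * (Removal waits for rdev->nr_pending to drop
				 * to zero; the rdev_dec_pending() at the end
				 * of this handler releases this request's
				 * reference.)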
2614 */ 2615 rdev = conf->disks[i].rdev; 2616 break; 2617 } 2618 } 2619 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2620 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2621 bi->bi_status); 2622 if (i == disks) { 2623 bio_reset(bi); 2624 BUG(); 2625 return; 2626 } 2627 2628 if (replacement) { 2629 if (bi->bi_status) 2630 md_error(conf->mddev, rdev); 2631 else if (is_badblock(rdev, sh->sector, 2632 STRIPE_SECTORS, 2633 &first_bad, &bad_sectors)) 2634 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2635 } else { 2636 if (bi->bi_status) { 2637 set_bit(STRIPE_DEGRADED, &sh->state); 2638 set_bit(WriteErrorSeen, &rdev->flags); 2639 set_bit(R5_WriteError, &sh->dev[i].flags); 2640 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2641 set_bit(MD_RECOVERY_NEEDED, 2642 &rdev->mddev->recovery); 2643 } else if (is_badblock(rdev, sh->sector, 2644 STRIPE_SECTORS, 2645 &first_bad, &bad_sectors)) { 2646 set_bit(R5_MadeGood, &sh->dev[i].flags); 2647 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2648 /* That was a successful write so make 2649 * sure it looks like we already did 2650 * a re-write. 2651 */ 2652 set_bit(R5_ReWrite, &sh->dev[i].flags); 2653 } 2654 } 2655 rdev_dec_pending(rdev, conf->mddev); 2656 2657 if (sh->batch_head && bi->bi_status && !replacement) 2658 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2659 2660 bio_reset(bi); 2661 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2662 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2663 set_bit(STRIPE_HANDLE, &sh->state); 2664 raid5_release_stripe(sh); 2665 2666 if (sh->batch_head && sh != sh->batch_head) 2667 raid5_release_stripe(sh->batch_head); 2668 } 2669 2670 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2671 { 2672 char b[BDEVNAME_SIZE]; 2673 struct r5conf *conf = mddev->private; 2674 unsigned long flags; 2675 pr_debug("raid456: error called\n"); 2676 2677 spin_lock_irqsave(&conf->device_lock, flags); 2678 2679 if (test_bit(In_sync, &rdev->flags) && 2680 mddev->degraded == conf->max_degraded) { 2681 /* 2682 * Don't allow to achieve failed state 2683 * Don't try to recover this device 2684 */ 2685 conf->recovery_disabled = mddev->recovery_disabled; 2686 spin_unlock_irqrestore(&conf->device_lock, flags); 2687 return; 2688 } 2689 2690 set_bit(Faulty, &rdev->flags); 2691 clear_bit(In_sync, &rdev->flags); 2692 mddev->degraded = raid5_calc_degraded(conf); 2693 spin_unlock_irqrestore(&conf->device_lock, flags); 2694 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2695 2696 set_bit(Blocked, &rdev->flags); 2697 set_mask_bits(&mddev->sb_flags, 0, 2698 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2699 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2700 "md/raid:%s: Operation continuing on %d devices.\n", 2701 mdname(mddev), 2702 bdevname(rdev->bdev, b), 2703 mdname(mddev), 2704 conf->raid_disks - mddev->degraded); 2705 r5c_update_on_rdev_error(mddev, rdev); 2706 } 2707 2708 /* 2709 * Input: a 'big' sector number, 2710 * Output: index of the data and parity disk, and the sector # in them. 2711 */ 2712 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2713 int previous, int *dd_idx, 2714 struct stripe_head *sh) 2715 { 2716 sector_t stripe, stripe2; 2717 sector_t chunk_number; 2718 unsigned int chunk_offset; 2719 int pd_idx, qd_idx; 2720 int ddf_layout = 0; 2721 sector_t new_sector; 2722 int algorithm = previous ? conf->prev_algo 2723 : conf->algorithm; 2724 int sectors_per_chunk = previous ? 
conf->prev_chunk_sectors 2725 : conf->chunk_sectors; 2726 int raid_disks = previous ? conf->previous_raid_disks 2727 : conf->raid_disks; 2728 int data_disks = raid_disks - conf->max_degraded; 2729 2730 /* First compute the information on this sector */ 2731 2732 /* 2733 * Compute the chunk number and the sector offset inside the chunk 2734 */ 2735 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2736 chunk_number = r_sector; 2737 2738 /* 2739 * Compute the stripe number 2740 */ 2741 stripe = chunk_number; 2742 *dd_idx = sector_div(stripe, data_disks); 2743 stripe2 = stripe; 2744 /* 2745 * Select the parity disk based on the user selected algorithm. 2746 */ 2747 pd_idx = qd_idx = -1; 2748 switch(conf->level) { 2749 case 4: 2750 pd_idx = data_disks; 2751 break; 2752 case 5: 2753 switch (algorithm) { 2754 case ALGORITHM_LEFT_ASYMMETRIC: 2755 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2756 if (*dd_idx >= pd_idx) 2757 (*dd_idx)++; 2758 break; 2759 case ALGORITHM_RIGHT_ASYMMETRIC: 2760 pd_idx = sector_div(stripe2, raid_disks); 2761 if (*dd_idx >= pd_idx) 2762 (*dd_idx)++; 2763 break; 2764 case ALGORITHM_LEFT_SYMMETRIC: 2765 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2766 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2767 break; 2768 case ALGORITHM_RIGHT_SYMMETRIC: 2769 pd_idx = sector_div(stripe2, raid_disks); 2770 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2771 break; 2772 case ALGORITHM_PARITY_0: 2773 pd_idx = 0; 2774 (*dd_idx)++; 2775 break; 2776 case ALGORITHM_PARITY_N: 2777 pd_idx = data_disks; 2778 break; 2779 default: 2780 BUG(); 2781 } 2782 break; 2783 case 6: 2784 2785 switch (algorithm) { 2786 case ALGORITHM_LEFT_ASYMMETRIC: 2787 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2788 qd_idx = pd_idx + 1; 2789 if (pd_idx == raid_disks-1) { 2790 (*dd_idx)++; /* Q D D D P */ 2791 qd_idx = 0; 2792 } else if (*dd_idx >= pd_idx) 2793 (*dd_idx) += 2; /* D D P Q D */ 2794 break; 2795 case ALGORITHM_RIGHT_ASYMMETRIC: 2796 pd_idx = sector_div(stripe2, raid_disks); 2797 qd_idx = pd_idx + 1; 2798 if (pd_idx == raid_disks-1) { 2799 (*dd_idx)++; /* Q D D D P */ 2800 qd_idx = 0; 2801 } else if (*dd_idx >= pd_idx) 2802 (*dd_idx) += 2; /* D D P Q D */ 2803 break; 2804 case ALGORITHM_LEFT_SYMMETRIC: 2805 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2806 qd_idx = (pd_idx + 1) % raid_disks; 2807 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2808 break; 2809 case ALGORITHM_RIGHT_SYMMETRIC: 2810 pd_idx = sector_div(stripe2, raid_disks); 2811 qd_idx = (pd_idx + 1) % raid_disks; 2812 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2813 break; 2814 2815 case ALGORITHM_PARITY_0: 2816 pd_idx = 0; 2817 qd_idx = 1; 2818 (*dd_idx) += 2; 2819 break; 2820 case ALGORITHM_PARITY_N: 2821 pd_idx = data_disks; 2822 qd_idx = data_disks + 1; 2823 break; 2824 2825 case ALGORITHM_ROTATING_ZERO_RESTART: 2826 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2827 * of blocks for computing Q is different. 
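			 * (Concretely: ddf_layout is set below, so raid6_d0()
			 * starts the Q/syndrome walk at device 0, per the DDF
			 * convention, instead of just after the Q block.)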
2828 */ 2829 pd_idx = sector_div(stripe2, raid_disks); 2830 qd_idx = pd_idx + 1; 2831 if (pd_idx == raid_disks-1) { 2832 (*dd_idx)++; /* Q D D D P */ 2833 qd_idx = 0; 2834 } else if (*dd_idx >= pd_idx) 2835 (*dd_idx) += 2; /* D D P Q D */ 2836 ddf_layout = 1; 2837 break; 2838 2839 case ALGORITHM_ROTATING_N_RESTART: 2840 /* Same a left_asymmetric, by first stripe is 2841 * D D D P Q rather than 2842 * Q D D D P 2843 */ 2844 stripe2 += 1; 2845 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2846 qd_idx = pd_idx + 1; 2847 if (pd_idx == raid_disks-1) { 2848 (*dd_idx)++; /* Q D D D P */ 2849 qd_idx = 0; 2850 } else if (*dd_idx >= pd_idx) 2851 (*dd_idx) += 2; /* D D P Q D */ 2852 ddf_layout = 1; 2853 break; 2854 2855 case ALGORITHM_ROTATING_N_CONTINUE: 2856 /* Same as left_symmetric but Q is before P */ 2857 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2858 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2859 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2860 ddf_layout = 1; 2861 break; 2862 2863 case ALGORITHM_LEFT_ASYMMETRIC_6: 2864 /* RAID5 left_asymmetric, with Q on last device */ 2865 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2866 if (*dd_idx >= pd_idx) 2867 (*dd_idx)++; 2868 qd_idx = raid_disks - 1; 2869 break; 2870 2871 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2872 pd_idx = sector_div(stripe2, raid_disks-1); 2873 if (*dd_idx >= pd_idx) 2874 (*dd_idx)++; 2875 qd_idx = raid_disks - 1; 2876 break; 2877 2878 case ALGORITHM_LEFT_SYMMETRIC_6: 2879 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2880 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2881 qd_idx = raid_disks - 1; 2882 break; 2883 2884 case ALGORITHM_RIGHT_SYMMETRIC_6: 2885 pd_idx = sector_div(stripe2, raid_disks-1); 2886 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2887 qd_idx = raid_disks - 1; 2888 break; 2889 2890 case ALGORITHM_PARITY_0_6: 2891 pd_idx = 0; 2892 (*dd_idx)++; 2893 qd_idx = raid_disks - 1; 2894 break; 2895 2896 default: 2897 BUG(); 2898 } 2899 break; 2900 } 2901 2902 if (sh) { 2903 sh->pd_idx = pd_idx; 2904 sh->qd_idx = qd_idx; 2905 sh->ddf_layout = ddf_layout; 2906 } 2907 /* 2908 * Finally, compute the new sector number 2909 */ 2910 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2911 return new_sector; 2912 } 2913 2914 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2915 { 2916 struct r5conf *conf = sh->raid_conf; 2917 int raid_disks = sh->disks; 2918 int data_disks = raid_disks - conf->max_degraded; 2919 sector_t new_sector = sh->sector, check; 2920 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2921 : conf->chunk_sectors; 2922 int algorithm = previous ? 
conf->prev_algo 2923 : conf->algorithm; 2924 sector_t stripe; 2925 int chunk_offset; 2926 sector_t chunk_number; 2927 int dummy1, dd_idx = i; 2928 sector_t r_sector; 2929 struct stripe_head sh2; 2930 2931 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2932 stripe = new_sector; 2933 2934 if (i == sh->pd_idx) 2935 return 0; 2936 switch(conf->level) { 2937 case 4: break; 2938 case 5: 2939 switch (algorithm) { 2940 case ALGORITHM_LEFT_ASYMMETRIC: 2941 case ALGORITHM_RIGHT_ASYMMETRIC: 2942 if (i > sh->pd_idx) 2943 i--; 2944 break; 2945 case ALGORITHM_LEFT_SYMMETRIC: 2946 case ALGORITHM_RIGHT_SYMMETRIC: 2947 if (i < sh->pd_idx) 2948 i += raid_disks; 2949 i -= (sh->pd_idx + 1); 2950 break; 2951 case ALGORITHM_PARITY_0: 2952 i -= 1; 2953 break; 2954 case ALGORITHM_PARITY_N: 2955 break; 2956 default: 2957 BUG(); 2958 } 2959 break; 2960 case 6: 2961 if (i == sh->qd_idx) 2962 return 0; /* It is the Q disk */ 2963 switch (algorithm) { 2964 case ALGORITHM_LEFT_ASYMMETRIC: 2965 case ALGORITHM_RIGHT_ASYMMETRIC: 2966 case ALGORITHM_ROTATING_ZERO_RESTART: 2967 case ALGORITHM_ROTATING_N_RESTART: 2968 if (sh->pd_idx == raid_disks-1) 2969 i--; /* Q D D D P */ 2970 else if (i > sh->pd_idx) 2971 i -= 2; /* D D P Q D */ 2972 break; 2973 case ALGORITHM_LEFT_SYMMETRIC: 2974 case ALGORITHM_RIGHT_SYMMETRIC: 2975 if (sh->pd_idx == raid_disks-1) 2976 i--; /* Q D D D P */ 2977 else { 2978 /* D D P Q D */ 2979 if (i < sh->pd_idx) 2980 i += raid_disks; 2981 i -= (sh->pd_idx + 2); 2982 } 2983 break; 2984 case ALGORITHM_PARITY_0: 2985 i -= 2; 2986 break; 2987 case ALGORITHM_PARITY_N: 2988 break; 2989 case ALGORITHM_ROTATING_N_CONTINUE: 2990 /* Like left_symmetric, but P is before Q */ 2991 if (sh->pd_idx == 0) 2992 i--; /* P D D D Q */ 2993 else { 2994 /* D D Q P D */ 2995 if (i < sh->pd_idx) 2996 i += raid_disks; 2997 i -= (sh->pd_idx + 1); 2998 } 2999 break; 3000 case ALGORITHM_LEFT_ASYMMETRIC_6: 3001 case ALGORITHM_RIGHT_ASYMMETRIC_6: 3002 if (i > sh->pd_idx) 3003 i--; 3004 break; 3005 case ALGORITHM_LEFT_SYMMETRIC_6: 3006 case ALGORITHM_RIGHT_SYMMETRIC_6: 3007 if (i < sh->pd_idx) 3008 i += data_disks + 1; 3009 i -= (sh->pd_idx + 1); 3010 break; 3011 case ALGORITHM_PARITY_0_6: 3012 i -= 1; 3013 break; 3014 default: 3015 BUG(); 3016 } 3017 break; 3018 } 3019 3020 chunk_number = stripe * data_disks + i; 3021 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 3022 3023 check = raid5_compute_sector(conf, r_sector, 3024 previous, &dummy1, &sh2); 3025 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 3026 || sh2.qd_idx != sh->qd_idx) { 3027 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 3028 mdname(conf->mddev)); 3029 return 0; 3030 } 3031 return r_sector; 3032 } 3033 3034 /* 3035 * There are cases where we want handle_stripe_dirtying() and 3036 * schedule_reconstruction() to delay towrite to some dev of a stripe. 3037 * 3038 * This function checks whether we want to delay the towrite. Specifically, 3039 * we delay the towrite when: 3040 * 3041 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 3042 * stripe has data in journal (for other devices). 3043 * 3044 * In this case, when reading data for the non-overwrite dev, it is 3045 * necessary to handle complex rmw of write back cache (prexor with 3046 * orig_page, and xor with page). To keep read path simple, we would 3047 * like to flush data in journal to RAID disks first, so complex rmw 3048 * is handled in the write patch (handle_stripe_dirtying). 3049 * 3050 * 2. 
when journal space is critical (R5C_LOG_CRITICAL=1)
 *
 *    It is important to be able to flush all stripes in raid5-cache.
 *    Therefore, we need to reserve some space on the journal device for
 *    these flushes. If the flush operation includes pending writes to the
 *    stripe, we need to reserve (conf->raid_disks + 1) pages per stripe
 *    for the flush out. If we exclude these pending writes from the flush
 *    operation, we only need (conf->max_degraded + 1) pages per stripe.
 *    Therefore, excluding pending writes in these cases enables more
 *    efficient use of the journal device.
 *
 *    Note: To make sure the stripe makes progress, we only delay
 *    towrite for stripes with data already in journal (injournal > 0).
 *    When LOG_CRITICAL, stripes with injournal == 0 will be sent to
 *    the no_space_stripes list.
 *
 * 3. during journal failure
 *    On journal failure, we try to flush all cached data to raid disks
 *    based on data in the stripe cache. The array is read-only to upper
 *    layers, so we would skip all pending writes.
 *
 */
static inline bool delay_towrite(struct r5conf *conf,
				 struct r5dev *dev,
				 struct stripe_head_state *s)
{
	/* case 1 above */
	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
	    !test_bit(R5_Insync, &dev->flags) && s->injournal)
		return true;
	/* case 2 above */
	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    s->injournal > 0)
		return true;
	/* case 3 above */
	if (s->log_failed && s->injournal)
		return true;
	return false;
}

static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			 int rcw, int expand)
{
	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;

	if (rcw) {
		/*
		 * In some cases, handle_stripe_dirtying initially decided to
		 * run rmw and allocated an extra page for prexor. However,
		 * rcw is cheaper later on. We need to free the extra page
		 * now, because we won't be able to do that in
		 * ops_complete_prexor().
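		 * (The extra page is the dev->orig_page that
		 * handle_stripe_dirtying() set up so prexor could keep the
		 * old data of pages already in the write-back journal;
		 * r5c_release_extra_page() below hands it back.)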
3104 */ 3105 r5c_release_extra_page(sh); 3106 3107 for (i = disks; i--; ) { 3108 struct r5dev *dev = &sh->dev[i]; 3109 3110 if (dev->towrite && !delay_towrite(conf, dev, s)) { 3111 set_bit(R5_LOCKED, &dev->flags); 3112 set_bit(R5_Wantdrain, &dev->flags); 3113 if (!expand) 3114 clear_bit(R5_UPTODATE, &dev->flags); 3115 s->locked++; 3116 } else if (test_bit(R5_InJournal, &dev->flags)) { 3117 set_bit(R5_LOCKED, &dev->flags); 3118 s->locked++; 3119 } 3120 } 3121 /* if we are not expanding this is a proper write request, and 3122 * there will be bios with new data to be drained into the 3123 * stripe cache 3124 */ 3125 if (!expand) { 3126 if (!s->locked) 3127 /* False alarm, nothing to do */ 3128 return; 3129 sh->reconstruct_state = reconstruct_state_drain_run; 3130 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3131 } else 3132 sh->reconstruct_state = reconstruct_state_run; 3133 3134 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3135 3136 if (s->locked + conf->max_degraded == disks) 3137 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 3138 atomic_inc(&conf->pending_full_writes); 3139 } else { 3140 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 3141 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 3142 BUG_ON(level == 6 && 3143 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 3144 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 3145 3146 for (i = disks; i--; ) { 3147 struct r5dev *dev = &sh->dev[i]; 3148 if (i == pd_idx || i == qd_idx) 3149 continue; 3150 3151 if (dev->towrite && 3152 (test_bit(R5_UPTODATE, &dev->flags) || 3153 test_bit(R5_Wantcompute, &dev->flags))) { 3154 set_bit(R5_Wantdrain, &dev->flags); 3155 set_bit(R5_LOCKED, &dev->flags); 3156 clear_bit(R5_UPTODATE, &dev->flags); 3157 s->locked++; 3158 } else if (test_bit(R5_InJournal, &dev->flags)) { 3159 set_bit(R5_LOCKED, &dev->flags); 3160 s->locked++; 3161 } 3162 } 3163 if (!s->locked) 3164 /* False alarm - nothing to do */ 3165 return; 3166 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3167 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3168 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3169 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3170 } 3171 3172 /* keep the parity disk(s) locked while asynchronous operations 3173 * are in flight 3174 */ 3175 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3176 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3177 s->locked++; 3178 3179 if (level == 6) { 3180 int qd_idx = sh->qd_idx; 3181 struct r5dev *dev = &sh->dev[qd_idx]; 3182 3183 set_bit(R5_LOCKED, &dev->flags); 3184 clear_bit(R5_UPTODATE, &dev->flags); 3185 s->locked++; 3186 } 3187 3188 if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && 3189 test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && 3190 !test_bit(STRIPE_FULL_WRITE, &sh->state) && 3191 test_bit(R5_Insync, &sh->dev[pd_idx].flags)) 3192 set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); 3193 3194 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3195 __func__, (unsigned long long)sh->sector, 3196 s->locked, s->ops_request); 3197 } 3198 3199 /* 3200 * Each stripe/dev can have one or more bion attached. 3201 * toread/towrite point to the first in a chain. 3202 * The bi_next chain must be in order. 
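 *
 * For example: dev->towrite -> bio A -> bio B -> NULL, sorted by
 * bi_iter.bi_sector with no two bios overlapping; add_stripe_bio() below
 * walks the chain to find the insertion point and bails out to 'overlap'
 * if the new bio intersects an existing one.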
3203 */ 3204 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3205 int forwrite, int previous) 3206 { 3207 struct bio **bip; 3208 struct r5conf *conf = sh->raid_conf; 3209 int firstwrite=0; 3210 3211 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3212 (unsigned long long)bi->bi_iter.bi_sector, 3213 (unsigned long long)sh->sector); 3214 3215 spin_lock_irq(&sh->stripe_lock); 3216 sh->dev[dd_idx].write_hint = bi->bi_write_hint; 3217 /* Don't allow new IO added to stripes in batch list */ 3218 if (sh->batch_head) 3219 goto overlap; 3220 if (forwrite) { 3221 bip = &sh->dev[dd_idx].towrite; 3222 if (*bip == NULL) 3223 firstwrite = 1; 3224 } else 3225 bip = &sh->dev[dd_idx].toread; 3226 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3227 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3228 goto overlap; 3229 bip = & (*bip)->bi_next; 3230 } 3231 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3232 goto overlap; 3233 3234 if (forwrite && raid5_has_ppl(conf)) { 3235 /* 3236 * With PPL only writes to consecutive data chunks within a 3237 * stripe are allowed because for a single stripe_head we can 3238 * only have one PPL entry at a time, which describes one data 3239 * range. Not really an overlap, but wait_for_overlap can be 3240 * used to handle this. 3241 */ 3242 sector_t sector; 3243 sector_t first = 0; 3244 sector_t last = 0; 3245 int count = 0; 3246 int i; 3247 3248 for (i = 0; i < sh->disks; i++) { 3249 if (i != sh->pd_idx && 3250 (i == dd_idx || sh->dev[i].towrite)) { 3251 sector = sh->dev[i].sector; 3252 if (count == 0 || sector < first) 3253 first = sector; 3254 if (sector > last) 3255 last = sector; 3256 count++; 3257 } 3258 } 3259 3260 if (first + conf->chunk_sectors * (count - 1) != last) 3261 goto overlap; 3262 } 3263 3264 if (!forwrite || previous) 3265 clear_bit(STRIPE_BATCH_READY, &sh->state); 3266 3267 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3268 if (*bip) 3269 bi->bi_next = *bip; 3270 *bip = bi; 3271 bio_inc_remaining(bi); 3272 md_write_inc(conf->mddev, bi); 3273 3274 if (forwrite) { 3275 /* check if page is covered */ 3276 sector_t sector = sh->dev[dd_idx].sector; 3277 for (bi=sh->dev[dd_idx].towrite; 3278 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3279 bi && bi->bi_iter.bi_sector <= sector; 3280 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3281 if (bio_end_sector(bi) >= sector) 3282 sector = bio_end_sector(bi); 3283 } 3284 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3285 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3286 sh->overwrite_disks++; 3287 } 3288 3289 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3290 (unsigned long long)(*bip)->bi_iter.bi_sector, 3291 (unsigned long long)sh->sector, dd_idx); 3292 3293 if (conf->mddev->bitmap && firstwrite) { 3294 /* Cannot hold spinlock over bitmap_startwrite, 3295 * but must ensure this isn't added to a batch until 3296 * we have added to the bitmap and set bm_seq. 3297 * So set STRIPE_BITMAP_PENDING to prevent 3298 * batching. 3299 * If multiple add_stripe_bio() calls race here they 3300 * much all set STRIPE_BITMAP_PENDING. So only the first one 3301 * to complete "bitmap_startwrite" gets to set 3302 * STRIPE_BIT_DELAY. This is important as once a stripe 3303 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3304 * any more. 
3305 */ 3306 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3307 spin_unlock_irq(&sh->stripe_lock); 3308 md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3309 STRIPE_SECTORS, 0); 3310 spin_lock_irq(&sh->stripe_lock); 3311 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3312 if (!sh->batch_head) { 3313 sh->bm_seq = conf->seq_flush+1; 3314 set_bit(STRIPE_BIT_DELAY, &sh->state); 3315 } 3316 } 3317 spin_unlock_irq(&sh->stripe_lock); 3318 3319 if (stripe_can_batch(sh)) 3320 stripe_add_to_batch_list(conf, sh); 3321 return 1; 3322 3323 overlap: 3324 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3325 spin_unlock_irq(&sh->stripe_lock); 3326 return 0; 3327 } 3328 3329 static void end_reshape(struct r5conf *conf); 3330 3331 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3332 struct stripe_head *sh) 3333 { 3334 int sectors_per_chunk = 3335 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3336 int dd_idx; 3337 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3338 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3339 3340 raid5_compute_sector(conf, 3341 stripe * (disks - conf->max_degraded) 3342 *sectors_per_chunk + chunk_offset, 3343 previous, 3344 &dd_idx, sh); 3345 } 3346 3347 static void 3348 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3349 struct stripe_head_state *s, int disks) 3350 { 3351 int i; 3352 BUG_ON(sh->batch_head); 3353 for (i = disks; i--; ) { 3354 struct bio *bi; 3355 int bitmap_end = 0; 3356 3357 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3358 struct md_rdev *rdev; 3359 rcu_read_lock(); 3360 rdev = rcu_dereference(conf->disks[i].rdev); 3361 if (rdev && test_bit(In_sync, &rdev->flags) && 3362 !test_bit(Faulty, &rdev->flags)) 3363 atomic_inc(&rdev->nr_pending); 3364 else 3365 rdev = NULL; 3366 rcu_read_unlock(); 3367 if (rdev) { 3368 if (!rdev_set_badblocks( 3369 rdev, 3370 sh->sector, 3371 STRIPE_SECTORS, 0)) 3372 md_error(conf->mddev, rdev); 3373 rdev_dec_pending(rdev, conf->mddev); 3374 } 3375 } 3376 spin_lock_irq(&sh->stripe_lock); 3377 /* fail all writes first */ 3378 bi = sh->dev[i].towrite; 3379 sh->dev[i].towrite = NULL; 3380 sh->overwrite_disks = 0; 3381 spin_unlock_irq(&sh->stripe_lock); 3382 if (bi) 3383 bitmap_end = 1; 3384 3385 log_stripe_write_finished(sh); 3386 3387 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3388 wake_up(&conf->wait_for_overlap); 3389 3390 while (bi && bi->bi_iter.bi_sector < 3391 sh->dev[i].sector + STRIPE_SECTORS) { 3392 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3393 3394 md_write_end(conf->mddev); 3395 bio_io_error(bi); 3396 bi = nextbi; 3397 } 3398 if (bitmap_end) 3399 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3400 STRIPE_SECTORS, 0, 0); 3401 bitmap_end = 0; 3402 /* and fail all 'written' */ 3403 bi = sh->dev[i].written; 3404 sh->dev[i].written = NULL; 3405 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3406 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3407 sh->dev[i].page = sh->dev[i].orig_page; 3408 } 3409 3410 if (bi) bitmap_end = 1; 3411 while (bi && bi->bi_iter.bi_sector < 3412 sh->dev[i].sector + STRIPE_SECTORS) { 3413 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3414 3415 md_write_end(conf->mddev); 3416 bio_io_error(bi); 3417 bi = bi2; 3418 } 3419 3420 /* fail any reads if this device is non-operational and 3421 * the data has not reached the cache yet. 
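		 * (If R5_Wantfill is set, a biofill is already copying this
		 * device's up-to-date page out to the waiting readers, so
		 * those bios are left to complete normally.)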
3422 */ 3423 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3424 s->failed > conf->max_degraded && 3425 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3426 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3427 spin_lock_irq(&sh->stripe_lock); 3428 bi = sh->dev[i].toread; 3429 sh->dev[i].toread = NULL; 3430 spin_unlock_irq(&sh->stripe_lock); 3431 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3432 wake_up(&conf->wait_for_overlap); 3433 if (bi) 3434 s->to_read--; 3435 while (bi && bi->bi_iter.bi_sector < 3436 sh->dev[i].sector + STRIPE_SECTORS) { 3437 struct bio *nextbi = 3438 r5_next_bio(bi, sh->dev[i].sector); 3439 3440 bio_io_error(bi); 3441 bi = nextbi; 3442 } 3443 } 3444 if (bitmap_end) 3445 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3446 STRIPE_SECTORS, 0, 0); 3447 /* If we were in the middle of a write the parity block might 3448 * still be locked - so just clear all R5_LOCKED flags 3449 */ 3450 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3451 } 3452 s->to_write = 0; 3453 s->written = 0; 3454 3455 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3456 if (atomic_dec_and_test(&conf->pending_full_writes)) 3457 md_wakeup_thread(conf->mddev->thread); 3458 } 3459 3460 static void 3461 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3462 struct stripe_head_state *s) 3463 { 3464 int abort = 0; 3465 int i; 3466 3467 BUG_ON(sh->batch_head); 3468 clear_bit(STRIPE_SYNCING, &sh->state); 3469 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3470 wake_up(&conf->wait_for_overlap); 3471 s->syncing = 0; 3472 s->replacing = 0; 3473 /* There is nothing more to do for sync/check/repair. 3474 * Don't even need to abort as that is handled elsewhere 3475 * if needed, and not always wanted e.g. if there is a known 3476 * bad block here. 
	 * For recover/replace we need to record a bad block on all
	 * non-sync devices, or abort the recovery
	 */
	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
		/* During recovery devices cannot be removed, so
		 * locking and refcounting of rdevs is not needed
		 */
		rcu_read_lock();
		for (i = 0; i < conf->raid_disks; i++) {
			struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
			if (rdev
			    && !test_bit(Faulty, &rdev->flags)
			    && !test_bit(In_sync, &rdev->flags)
			    && !rdev_set_badblocks(rdev, sh->sector,
						   STRIPE_SECTORS, 0))
				abort = 1;
			rdev = rcu_dereference(conf->disks[i].replacement);
			if (rdev
			    && !test_bit(Faulty, &rdev->flags)
			    && !test_bit(In_sync, &rdev->flags)
			    && !rdev_set_badblocks(rdev, sh->sector,
						   STRIPE_SECTORS, 0))
				abort = 1;
		}
		rcu_read_unlock();
		if (abort)
			conf->recovery_disabled =
				conf->mddev->recovery_disabled;
	}
	md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
}

static int want_replace(struct stripe_head *sh, int disk_idx)
{
	struct md_rdev *rdev;
	int rv = 0;

	rcu_read_lock();
	rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
	if (rdev
	    && !test_bit(Faulty, &rdev->flags)
	    && !test_bit(In_sync, &rdev->flags)
	    && (rdev->recovery_offset <= sh->sector
		|| rdev->mddev->recovery_cp <= sh->sector))
		rv = 1;
	rcu_read_unlock();
	return rv;
}

static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
			   int disk_idx, int disks)
{
	struct r5dev *dev = &sh->dev[disk_idx];
	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
				  &sh->dev[s->failed_num[1]] };
	int i;


	if (test_bit(R5_LOCKED, &dev->flags) ||
	    test_bit(R5_UPTODATE, &dev->flags))
		/* No point reading this as we already have it or have
		 * decided to get it.
		 */
		return 0;

	if (dev->toread ||
	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
		/* We need this block to directly satisfy a request */
		return 1;

	if (s->syncing || s->expanding ||
	    (s->replacing && want_replace(sh, disk_idx)))
		/* When syncing, or expanding we read everything.
		 * When replacing, we need the replaced block.
		 */
		return 1;

	if ((s->failed >= 1 && fdev[0]->toread) ||
	    (s->failed >= 2 && fdev[1]->toread))
		/* If we want to read from a failed device, then
		 * we need to actually read every other device.
		 */
		return 1;

	/* Sometimes neither read-modify-write nor reconstruct-write
	 * cycles can work.  In those cases we read every block we
	 * can.  Then the parity-update is certain to have enough to
	 * work with.
	 * This can only be a problem when we need to write something,
	 * and some device has failed.  If either of those tests
	 * fail we need look no further.
	 */
	if (!s->failed || !s->to_write)
		return 0;

	if (test_bit(R5_Insync, &dev->flags) &&
	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		/* Pre-reads are not permitted until after a short delay
		 * to gather multiple requests.  However if this
		 * device is not Insync, the block could only be computed
		 * and there is no need to delay that.
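		 * (Such stripes are parked on conf->delayed_list and
		 * STRIPE_PREREAD_ACTIVE is set when they are released from
		 * it, giving later writes a chance to turn this into a
		 * full-stripe write that needs no reads at all.)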
3578 */ 3579 return 0; 3580 3581 for (i = 0; i < s->failed && i < 2; i++) { 3582 if (fdev[i]->towrite && 3583 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3584 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3585 /* If we have a partial write to a failed 3586 * device, then we will need to reconstruct 3587 * the content of that device, so all other 3588 * devices must be read. 3589 */ 3590 return 1; 3591 } 3592 3593 /* If we are forced to do a reconstruct-write, either because 3594 * the current RAID6 implementation only supports that, or 3595 * because parity cannot be trusted and we are currently 3596 * recovering it, there is extra need to be careful. 3597 * If one of the devices that we would need to read, because 3598 * it is not being overwritten (and maybe not written at all) 3599 * is missing/faulty, then we need to read everything we can. 3600 */ 3601 if (sh->raid_conf->level != 6 && 3602 sh->sector < sh->raid_conf->mddev->recovery_cp) 3603 /* reconstruct-write isn't being forced */ 3604 return 0; 3605 for (i = 0; i < s->failed && i < 2; i++) { 3606 if (s->failed_num[i] != sh->pd_idx && 3607 s->failed_num[i] != sh->qd_idx && 3608 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3609 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3610 return 1; 3611 } 3612 3613 return 0; 3614 } 3615 3616 /* fetch_block - checks the given member device to see if its data needs 3617 * to be read or computed to satisfy a request. 3618 * 3619 * Returns 1 when no more member devices need to be checked, otherwise returns 3620 * 0 to tell the loop in handle_stripe_fill to continue 3621 */ 3622 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3623 int disk_idx, int disks) 3624 { 3625 struct r5dev *dev = &sh->dev[disk_idx]; 3626 3627 /* is the data in this block needed, and can we get it? */ 3628 if (need_this_block(sh, s, disk_idx, disks)) { 3629 /* we would like to get this block, possibly by computing it, 3630 * otherwise read it if the backing disk is insync 3631 */ 3632 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3633 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3634 BUG_ON(sh->batch_head); 3635 3636 /* 3637 * In the raid6 case if the only non-uptodate disk is P 3638 * then we already trusted P to compute the other failed 3639 * drives. It is safe to compute rather than re-read P. 3640 * In other cases we only compute blocks from failed 3641 * devices, otherwise check/repair might fail to detect 3642 * a real inconsistency. 3643 */ 3644 3645 if ((s->uptodate == disks - 1) && 3646 ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || 3647 (s->failed && (disk_idx == s->failed_num[0] || 3648 disk_idx == s->failed_num[1])))) { 3649 /* have disk failed, and we're requested to fetch it; 3650 * do compute it 3651 */ 3652 pr_debug("Computing stripe %llu block %d\n", 3653 (unsigned long long)sh->sector, disk_idx); 3654 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3655 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3656 set_bit(R5_Wantcompute, &dev->flags); 3657 sh->ops.target = disk_idx; 3658 sh->ops.target2 = -1; /* no 2nd target */ 3659 s->req_compute = 1; 3660 /* Careful: from this point on 'uptodate' is in the eye 3661 * of raid_run_ops which services 'compute' operations 3662 * before writes. R5_Wantcompute flags a block that will 3663 * be R5_UPTODATE by the time it is needed for a 3664 * subsequent operation. 
3665 */ 3666 s->uptodate++; 3667 return 1; 3668 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3669 /* Computing 2-failure is *very* expensive; only 3670 * do it if failed >= 2 3671 */ 3672 int other; 3673 for (other = disks; other--; ) { 3674 if (other == disk_idx) 3675 continue; 3676 if (!test_bit(R5_UPTODATE, 3677 &sh->dev[other].flags)) 3678 break; 3679 } 3680 BUG_ON(other < 0); 3681 pr_debug("Computing stripe %llu blocks %d,%d\n", 3682 (unsigned long long)sh->sector, 3683 disk_idx, other); 3684 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3685 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3686 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3687 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3688 sh->ops.target = disk_idx; 3689 sh->ops.target2 = other; 3690 s->uptodate += 2; 3691 s->req_compute = 1; 3692 return 1; 3693 } else if (test_bit(R5_Insync, &dev->flags)) { 3694 set_bit(R5_LOCKED, &dev->flags); 3695 set_bit(R5_Wantread, &dev->flags); 3696 s->locked++; 3697 pr_debug("Reading block %d (sync=%d)\n", 3698 disk_idx, s->syncing); 3699 } 3700 } 3701 3702 return 0; 3703 } 3704 3705 /** 3706 * handle_stripe_fill - read or compute data to satisfy pending requests. 3707 */ 3708 static void handle_stripe_fill(struct stripe_head *sh, 3709 struct stripe_head_state *s, 3710 int disks) 3711 { 3712 int i; 3713 3714 /* look for blocks to read/compute, skip this if a compute 3715 * is already in flight, or if the stripe contents are in the 3716 * midst of changing due to a write 3717 */ 3718 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3719 !sh->reconstruct_state) { 3720 3721 /* 3722 * For degraded stripe with data in journal, do not handle 3723 * read requests yet, instead, flush the stripe to raid 3724 * disks first, this avoids handling complex rmw of write 3725 * back cache (prexor with orig_page, and then xor with 3726 * page) in the read path 3727 */ 3728 if (s->injournal && s->failed) { 3729 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3730 r5c_make_stripe_write_out(sh); 3731 goto out; 3732 } 3733 3734 for (i = disks; i--; ) 3735 if (fetch_block(sh, s, i, disks)) 3736 break; 3737 } 3738 out: 3739 set_bit(STRIPE_HANDLE, &sh->state); 3740 } 3741 3742 static void break_stripe_batch_list(struct stripe_head *head_sh, 3743 unsigned long handle_flags); 3744 /* handle_stripe_clean_event 3745 * any written block on an uptodate or failed drive can be returned. 3746 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3747 * never LOCKED, so we don't need to test 'failed' directly. 
3748 */ 3749 static void handle_stripe_clean_event(struct r5conf *conf, 3750 struct stripe_head *sh, int disks) 3751 { 3752 int i; 3753 struct r5dev *dev; 3754 int discard_pending = 0; 3755 struct stripe_head *head_sh = sh; 3756 bool do_endio = false; 3757 3758 for (i = disks; i--; ) 3759 if (sh->dev[i].written) { 3760 dev = &sh->dev[i]; 3761 if (!test_bit(R5_LOCKED, &dev->flags) && 3762 (test_bit(R5_UPTODATE, &dev->flags) || 3763 test_bit(R5_Discard, &dev->flags) || 3764 test_bit(R5_SkipCopy, &dev->flags))) { 3765 /* We can return any write requests */ 3766 struct bio *wbi, *wbi2; 3767 pr_debug("Return write for disc %d\n", i); 3768 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3769 clear_bit(R5_UPTODATE, &dev->flags); 3770 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3771 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3772 } 3773 do_endio = true; 3774 3775 returnbi: 3776 dev->page = dev->orig_page; 3777 wbi = dev->written; 3778 dev->written = NULL; 3779 while (wbi && wbi->bi_iter.bi_sector < 3780 dev->sector + STRIPE_SECTORS) { 3781 wbi2 = r5_next_bio(wbi, dev->sector); 3782 md_write_end(conf->mddev); 3783 bio_endio(wbi); 3784 wbi = wbi2; 3785 } 3786 md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3787 STRIPE_SECTORS, 3788 !test_bit(STRIPE_DEGRADED, &sh->state), 3789 0); 3790 if (head_sh->batch_head) { 3791 sh = list_first_entry(&sh->batch_list, 3792 struct stripe_head, 3793 batch_list); 3794 if (sh != head_sh) { 3795 dev = &sh->dev[i]; 3796 goto returnbi; 3797 } 3798 } 3799 sh = head_sh; 3800 dev = &sh->dev[i]; 3801 } else if (test_bit(R5_Discard, &dev->flags)) 3802 discard_pending = 1; 3803 } 3804 3805 log_stripe_write_finished(sh); 3806 3807 if (!discard_pending && 3808 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3809 int hash; 3810 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3811 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3812 if (sh->qd_idx >= 0) { 3813 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3814 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3815 } 3816 /* now that discard is done we can proceed with any sync */ 3817 clear_bit(STRIPE_DISCARD, &sh->state); 3818 /* 3819 * SCSI discard will change some bio fields and the stripe has 3820 * no updated data, so remove it from hash list and the stripe 3821 * will be reinitialized 3822 */ 3823 unhash: 3824 hash = sh->hash_lock_index; 3825 spin_lock_irq(conf->hash_locks + hash); 3826 remove_hash(sh); 3827 spin_unlock_irq(conf->hash_locks + hash); 3828 if (head_sh->batch_head) { 3829 sh = list_first_entry(&sh->batch_list, 3830 struct stripe_head, batch_list); 3831 if (sh != head_sh) 3832 goto unhash; 3833 } 3834 sh = head_sh; 3835 3836 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3837 set_bit(STRIPE_HANDLE, &sh->state); 3838 3839 } 3840 3841 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3842 if (atomic_dec_and_test(&conf->pending_full_writes)) 3843 md_wakeup_thread(conf->mddev->thread); 3844 3845 if (head_sh->batch_head && do_endio) 3846 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3847 } 3848 3849 /* 3850 * For RMW in write back cache, we need extra page in prexor to store the 3851 * old data. This page is stored in dev->orig_page. 3852 * 3853 * This function checks whether we have data for prexor. 
The exact logic 3854 * is: 3855 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3856 */ 3857 static inline bool uptodate_for_rmw(struct r5dev *dev) 3858 { 3859 return (test_bit(R5_UPTODATE, &dev->flags)) && 3860 (!test_bit(R5_InJournal, &dev->flags) || 3861 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3862 } 3863 3864 static int handle_stripe_dirtying(struct r5conf *conf, 3865 struct stripe_head *sh, 3866 struct stripe_head_state *s, 3867 int disks) 3868 { 3869 int rmw = 0, rcw = 0, i; 3870 sector_t recovery_cp = conf->mddev->recovery_cp; 3871 3872 /* Check whether resync is now happening or should start. 3873 * If yes, then the array is dirty (after unclean shutdown or 3874 * initial creation), so parity in some stripes might be inconsistent. 3875 * In this case, we need to always do reconstruct-write, to ensure 3876 * that in case of drive failure or read-error correction, we 3877 * generate correct data from the parity. 3878 */ 3879 if (conf->rmw_level == PARITY_DISABLE_RMW || 3880 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3881 s->failed == 0)) { 3882 /* Calculate the real rcw later - for now make it 3883 * look like rcw is cheaper 3884 */ 3885 rcw = 1; rmw = 2; 3886 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3887 conf->rmw_level, (unsigned long long)recovery_cp, 3888 (unsigned long long)sh->sector); 3889 } else for (i = disks; i--; ) { 3890 /* would I have to read this buffer for read_modify_write */ 3891 struct r5dev *dev = &sh->dev[i]; 3892 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3893 i == sh->pd_idx || i == sh->qd_idx || 3894 test_bit(R5_InJournal, &dev->flags)) && 3895 !test_bit(R5_LOCKED, &dev->flags) && 3896 !(uptodate_for_rmw(dev) || 3897 test_bit(R5_Wantcompute, &dev->flags))) { 3898 if (test_bit(R5_Insync, &dev->flags)) 3899 rmw++; 3900 else 3901 rmw += 2*disks; /* cannot read it */ 3902 } 3903 /* Would I have to read this buffer for reconstruct_write */ 3904 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3905 i != sh->pd_idx && i != sh->qd_idx && 3906 !test_bit(R5_LOCKED, &dev->flags) && 3907 !(test_bit(R5_UPTODATE, &dev->flags) || 3908 test_bit(R5_Wantcompute, &dev->flags))) { 3909 if (test_bit(R5_Insync, &dev->flags)) 3910 rcw++; 3911 else 3912 rcw += 2*disks; 3913 } 3914 } 3915 3916 pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", 3917 (unsigned long long)sh->sector, sh->state, rmw, rcw); 3918 set_bit(STRIPE_HANDLE, &sh->state); 3919 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3920 /* prefer read-modify-write, but need to get some data */ 3921 if (conf->mddev->queue) 3922 blk_add_trace_msg(conf->mddev->queue, 3923 "raid5 rmw %llu %d", 3924 (unsigned long long)sh->sector, rmw); 3925 for (i = disks; i--; ) { 3926 struct r5dev *dev = &sh->dev[i]; 3927 if (test_bit(R5_InJournal, &dev->flags) && 3928 dev->page == dev->orig_page && 3929 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3930 /* alloc page for prexor */ 3931 struct page *p = alloc_page(GFP_NOIO); 3932 3933 if (p) { 3934 dev->orig_page = p; 3935 continue; 3936 } 3937 3938 /* 3939 * alloc_page() failed, try use 3940 * disk_info->extra_page 3941 */ 3942 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3943 &conf->cache_state)) { 3944 r5c_use_extra_page(sh); 3945 break; 3946 } 3947 3948 /* extra_page in use, add to delayed_list */ 3949 set_bit(STRIPE_DELAYED, &sh->state); 3950 s->waiting_extra_page = 1; 3951 return -EAGAIN; 3952 } 3953 } 3954 3955 for (i = disks; i--; ) { 3956 struct r5dev *dev = 
&sh->dev[i]; 3957 if (((dev->towrite && !delay_towrite(conf, dev, s)) || 3958 i == sh->pd_idx || i == sh->qd_idx || 3959 test_bit(R5_InJournal, &dev->flags)) && 3960 !test_bit(R5_LOCKED, &dev->flags) && 3961 !(uptodate_for_rmw(dev) || 3962 test_bit(R5_Wantcompute, &dev->flags)) && 3963 test_bit(R5_Insync, &dev->flags)) { 3964 if (test_bit(STRIPE_PREREAD_ACTIVE, 3965 &sh->state)) { 3966 pr_debug("Read_old block %d for r-m-w\n", 3967 i); 3968 set_bit(R5_LOCKED, &dev->flags); 3969 set_bit(R5_Wantread, &dev->flags); 3970 s->locked++; 3971 } else { 3972 set_bit(STRIPE_DELAYED, &sh->state); 3973 set_bit(STRIPE_HANDLE, &sh->state); 3974 } 3975 } 3976 } 3977 } 3978 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3979 /* want reconstruct write, but need to get some data */ 3980 int qread =0; 3981 rcw = 0; 3982 for (i = disks; i--; ) { 3983 struct r5dev *dev = &sh->dev[i]; 3984 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3985 i != sh->pd_idx && i != sh->qd_idx && 3986 !test_bit(R5_LOCKED, &dev->flags) && 3987 !(test_bit(R5_UPTODATE, &dev->flags) || 3988 test_bit(R5_Wantcompute, &dev->flags))) { 3989 rcw++; 3990 if (test_bit(R5_Insync, &dev->flags) && 3991 test_bit(STRIPE_PREREAD_ACTIVE, 3992 &sh->state)) { 3993 pr_debug("Read_old block " 3994 "%d for Reconstruct\n", i); 3995 set_bit(R5_LOCKED, &dev->flags); 3996 set_bit(R5_Wantread, &dev->flags); 3997 s->locked++; 3998 qread++; 3999 } else { 4000 set_bit(STRIPE_DELAYED, &sh->state); 4001 set_bit(STRIPE_HANDLE, &sh->state); 4002 } 4003 } 4004 } 4005 if (rcw && conf->mddev->queue) 4006 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 4007 (unsigned long long)sh->sector, 4008 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 4009 } 4010 4011 if (rcw > disks && rmw > disks && 4012 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4013 set_bit(STRIPE_DELAYED, &sh->state); 4014 4015 /* now if nothing is locked, and if we have enough data, 4016 * we can start a write request 4017 */ 4018 /* since handle_stripe can be called at any time we need to handle the 4019 * case where a compute block operation has been submitted and then a 4020 * subsequent call wants to start a write request. raid_run_ops only 4021 * handles the case where compute block and reconstruct are requested 4022 * simultaneously. If this is not the case then new writes need to be 4023 * held off until the compute completes. 
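 *
 * In short, the test below only calls schedule_reconstruction() when either
 * this pass queued the compute itself (s->req_compute) or no compute is
 * running (STRIPE_COMPUTE_RUN clear), nothing is locked, at least one of
 * rmw/rcw needs no further reads, and STRIPE_BIT_DELAY is not set.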
4024 */ 4025 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 4026 (s->locked == 0 && (rcw == 0 || rmw == 0) && 4027 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 4028 schedule_reconstruction(sh, s, rcw == 0, 0); 4029 return 0; 4030 } 4031 4032 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 4033 struct stripe_head_state *s, int disks) 4034 { 4035 struct r5dev *dev = NULL; 4036 4037 BUG_ON(sh->batch_head); 4038 set_bit(STRIPE_HANDLE, &sh->state); 4039 4040 switch (sh->check_state) { 4041 case check_state_idle: 4042 /* start a new check operation if there are no failures */ 4043 if (s->failed == 0) { 4044 BUG_ON(s->uptodate != disks); 4045 sh->check_state = check_state_run; 4046 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4047 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 4048 s->uptodate--; 4049 break; 4050 } 4051 dev = &sh->dev[s->failed_num[0]]; 4052 /* fall through */ 4053 case check_state_compute_result: 4054 sh->check_state = check_state_idle; 4055 if (!dev) 4056 dev = &sh->dev[sh->pd_idx]; 4057 4058 /* check that a write has not made the stripe insync */ 4059 if (test_bit(STRIPE_INSYNC, &sh->state)) 4060 break; 4061 4062 /* either failed parity check, or recovery is happening */ 4063 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 4064 BUG_ON(s->uptodate != disks); 4065 4066 set_bit(R5_LOCKED, &dev->flags); 4067 s->locked++; 4068 set_bit(R5_Wantwrite, &dev->flags); 4069 4070 clear_bit(STRIPE_DEGRADED, &sh->state); 4071 set_bit(STRIPE_INSYNC, &sh->state); 4072 break; 4073 case check_state_run: 4074 break; /* we will be called again upon completion */ 4075 case check_state_check_result: 4076 sh->check_state = check_state_idle; 4077 4078 /* if a failure occurred during the check operation, leave 4079 * STRIPE_INSYNC not set and let the stripe be handled again 4080 */ 4081 if (s->failed) 4082 break; 4083 4084 /* handle a successful check operation, if parity is correct 4085 * we are done. Otherwise update the mismatch count and repair 4086 * parity if !MD_RECOVERY_CHECK 4087 */ 4088 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 4089 /* parity is correct (on disc, 4090 * not in buffer any more) 4091 */ 4092 set_bit(STRIPE_INSYNC, &sh->state); 4093 else { 4094 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4095 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4096 /* don't try to repair!! 
*/ 4097 set_bit(STRIPE_INSYNC, &sh->state); 4098 pr_warn_ratelimited("%s: mismatch sector in range " 4099 "%llu-%llu\n", mdname(conf->mddev), 4100 (unsigned long long) sh->sector, 4101 (unsigned long long) sh->sector + 4102 STRIPE_SECTORS); 4103 } else { 4104 sh->check_state = check_state_compute_run; 4105 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4106 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4107 set_bit(R5_Wantcompute, 4108 &sh->dev[sh->pd_idx].flags); 4109 sh->ops.target = sh->pd_idx; 4110 sh->ops.target2 = -1; 4111 s->uptodate++; 4112 } 4113 } 4114 break; 4115 case check_state_compute_run: 4116 break; 4117 default: 4118 pr_err("%s: unknown check_state: %d sector: %llu\n", 4119 __func__, sh->check_state, 4120 (unsigned long long) sh->sector); 4121 BUG(); 4122 } 4123 } 4124 4125 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 4126 struct stripe_head_state *s, 4127 int disks) 4128 { 4129 int pd_idx = sh->pd_idx; 4130 int qd_idx = sh->qd_idx; 4131 struct r5dev *dev; 4132 4133 BUG_ON(sh->batch_head); 4134 set_bit(STRIPE_HANDLE, &sh->state); 4135 4136 BUG_ON(s->failed > 2); 4137 4138 /* Want to check and possibly repair P and Q. 4139 * However there could be one 'failed' device, in which 4140 * case we can only check one of them, possibly using the 4141 * other to generate missing data 4142 */ 4143 4144 switch (sh->check_state) { 4145 case check_state_idle: 4146 /* start a new check operation if there are < 2 failures */ 4147 if (s->failed == s->q_failed) { 4148 /* The only possible failed device holds Q, so it 4149 * makes sense to check P (If anything else were failed, 4150 * we would have used P to recreate it). 4151 */ 4152 sh->check_state = check_state_run; 4153 } 4154 if (!s->q_failed && s->failed < 2) { 4155 /* Q is not failed, and we didn't use it to generate 4156 * anything, so it makes sense to check it 4157 */ 4158 if (sh->check_state == check_state_run) 4159 sh->check_state = check_state_run_pq; 4160 else 4161 sh->check_state = check_state_run_q; 4162 } 4163 4164 /* discard potentially stale zero_sum_result */ 4165 sh->ops.zero_sum_result = 0; 4166 4167 if (sh->check_state == check_state_run) { 4168 /* async_xor_zero_sum destroys the contents of P */ 4169 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 4170 s->uptodate--; 4171 } 4172 if (sh->check_state >= check_state_run && 4173 sh->check_state <= check_state_run_pq) { 4174 /* async_syndrome_zero_sum preserves P and Q, so 4175 * no need to mark them !uptodate here 4176 */ 4177 set_bit(STRIPE_OP_CHECK, &s->ops_request); 4178 break; 4179 } 4180 4181 /* we have 2-disk failure */ 4182 BUG_ON(s->failed != 2); 4183 /* fall through */ 4184 case check_state_compute_result: 4185 sh->check_state = check_state_idle; 4186 4187 /* check that a write has not made the stripe insync */ 4188 if (test_bit(STRIPE_INSYNC, &sh->state)) 4189 break; 4190 4191 /* now write out any block on a failed drive, 4192 * or P or Q if they were recomputed 4193 */ 4194 dev = NULL; 4195 if (s->failed == 2) { 4196 dev = &sh->dev[s->failed_num[1]]; 4197 s->locked++; 4198 set_bit(R5_LOCKED, &dev->flags); 4199 set_bit(R5_Wantwrite, &dev->flags); 4200 } 4201 if (s->failed >= 1) { 4202 dev = &sh->dev[s->failed_num[0]]; 4203 s->locked++; 4204 set_bit(R5_LOCKED, &dev->flags); 4205 set_bit(R5_Wantwrite, &dev->flags); 4206 } 4207 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4208 dev = &sh->dev[pd_idx]; 4209 s->locked++; 4210 set_bit(R5_LOCKED, &dev->flags); 4211 set_bit(R5_Wantwrite, &dev->flags); 4212 } 4213 if 
(sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4214 dev = &sh->dev[qd_idx]; 4215 s->locked++; 4216 set_bit(R5_LOCKED, &dev->flags); 4217 set_bit(R5_Wantwrite, &dev->flags); 4218 } 4219 if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags), 4220 "%s: disk%td not up to date\n", 4221 mdname(conf->mddev), 4222 dev - (struct r5dev *) &sh->dev)) { 4223 clear_bit(R5_LOCKED, &dev->flags); 4224 clear_bit(R5_Wantwrite, &dev->flags); 4225 s->locked--; 4226 } 4227 clear_bit(STRIPE_DEGRADED, &sh->state); 4228 4229 set_bit(STRIPE_INSYNC, &sh->state); 4230 break; 4231 case check_state_run: 4232 case check_state_run_q: 4233 case check_state_run_pq: 4234 break; /* we will be called again upon completion */ 4235 case check_state_check_result: 4236 sh->check_state = check_state_idle; 4237 4238 /* handle a successful check operation, if parity is correct 4239 * we are done. Otherwise update the mismatch count and repair 4240 * parity if !MD_RECOVERY_CHECK 4241 */ 4242 if (sh->ops.zero_sum_result == 0) { 4243 /* both parities are correct */ 4244 if (!s->failed) 4245 set_bit(STRIPE_INSYNC, &sh->state); 4246 else { 4247 /* in contrast to the raid5 case we can validate 4248 * parity, but still have a failure to write 4249 * back 4250 */ 4251 sh->check_state = check_state_compute_result; 4252 /* Returning at this point means that we may go 4253 * off and bring p and/or q uptodate again so 4254 * we make sure to check zero_sum_result again 4255 * to verify if p or q need writeback 4256 */ 4257 } 4258 } else { 4259 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4260 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { 4261 /* don't try to repair!! */ 4262 set_bit(STRIPE_INSYNC, &sh->state); 4263 pr_warn_ratelimited("%s: mismatch sector in range " 4264 "%llu-%llu\n", mdname(conf->mddev), 4265 (unsigned long long) sh->sector, 4266 (unsigned long long) sh->sector + 4267 STRIPE_SECTORS); 4268 } else { 4269 int *target = &sh->ops.target; 4270 4271 sh->ops.target = -1; 4272 sh->ops.target2 = -1; 4273 sh->check_state = check_state_compute_run; 4274 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4275 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4276 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4277 set_bit(R5_Wantcompute, 4278 &sh->dev[pd_idx].flags); 4279 *target = pd_idx; 4280 target = &sh->ops.target2; 4281 s->uptodate++; 4282 } 4283 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4284 set_bit(R5_Wantcompute, 4285 &sh->dev[qd_idx].flags); 4286 *target = qd_idx; 4287 s->uptodate++; 4288 } 4289 } 4290 } 4291 break; 4292 case check_state_compute_run: 4293 break; 4294 default: 4295 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4296 __func__, sh->check_state, 4297 (unsigned long long) sh->sector); 4298 BUG(); 4299 } 4300 } 4301 4302 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4303 { 4304 int i; 4305 4306 /* We have read all the blocks in this stripe and now we need to 4307 * copy some of them into a target stripe for expand. 
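 *
 * Rough sketch of the copy loop below: for each data block i,
 * raid5_compute_blocknr(sh, i, 1) gives its array block number and
 * raid5_compute_sector() maps that to (sector, dd_idx) in the new layout;
 * the page is then async_memcpy()d into that slot of the destination
 * stripe sh2, which is flagged R5_Expanded and R5_UPTODATE.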
4308 */ 4309 struct dma_async_tx_descriptor *tx = NULL; 4310 BUG_ON(sh->batch_head); 4311 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4312 for (i = 0; i < sh->disks; i++) 4313 if (i != sh->pd_idx && i != sh->qd_idx) { 4314 int dd_idx, j; 4315 struct stripe_head *sh2; 4316 struct async_submit_ctl submit; 4317 4318 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4319 sector_t s = raid5_compute_sector(conf, bn, 0, 4320 &dd_idx, NULL); 4321 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4322 if (sh2 == NULL) 4323 /* so far only the early blocks of this stripe 4324 * have been requested. When later blocks 4325 * get requested, we will try again 4326 */ 4327 continue; 4328 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4329 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4330 /* must have already done this block */ 4331 raid5_release_stripe(sh2); 4332 continue; 4333 } 4334 4335 /* place all the copies on one channel */ 4336 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4337 tx = async_memcpy(sh2->dev[dd_idx].page, 4338 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4339 &submit); 4340 4341 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4342 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4343 for (j = 0; j < conf->raid_disks; j++) 4344 if (j != sh2->pd_idx && 4345 j != sh2->qd_idx && 4346 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4347 break; 4348 if (j == conf->raid_disks) { 4349 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4350 set_bit(STRIPE_HANDLE, &sh2->state); 4351 } 4352 raid5_release_stripe(sh2); 4353 4354 } 4355 /* done submitting copies, wait for them to complete */ 4356 async_tx_quiesce(&tx); 4357 } 4358 4359 /* 4360 * handle_stripe - do things to a stripe. 4361 * 4362 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4363 * state of various bits to see what needs to be done. 
4364 * Possible results: 4365 * return some read requests which now have data 4366 * return some write requests which are safely on storage 4367 * schedule a read on some buffers 4368 * schedule a write of some buffers 4369 * return confirmation of parity correctness 4370 * 4371 */ 4372 4373 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4374 { 4375 struct r5conf *conf = sh->raid_conf; 4376 int disks = sh->disks; 4377 struct r5dev *dev; 4378 int i; 4379 int do_recovery = 0; 4380 4381 memset(s, 0, sizeof(*s)); 4382 4383 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4384 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4385 s->failed_num[0] = -1; 4386 s->failed_num[1] = -1; 4387 s->log_failed = r5l_log_disk_error(conf); 4388 4389 /* Now to look around and see what can be done */ 4390 rcu_read_lock(); 4391 for (i=disks; i--; ) { 4392 struct md_rdev *rdev; 4393 sector_t first_bad; 4394 int bad_sectors; 4395 int is_bad = 0; 4396 4397 dev = &sh->dev[i]; 4398 4399 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4400 i, dev->flags, 4401 dev->toread, dev->towrite, dev->written); 4402 /* maybe we can reply to a read 4403 * 4404 * new wantfill requests are only permitted while 4405 * ops_complete_biofill is guaranteed to be inactive 4406 */ 4407 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4408 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4409 set_bit(R5_Wantfill, &dev->flags); 4410 4411 /* now count some things */ 4412 if (test_bit(R5_LOCKED, &dev->flags)) 4413 s->locked++; 4414 if (test_bit(R5_UPTODATE, &dev->flags)) 4415 s->uptodate++; 4416 if (test_bit(R5_Wantcompute, &dev->flags)) { 4417 s->compute++; 4418 BUG_ON(s->compute > 2); 4419 } 4420 4421 if (test_bit(R5_Wantfill, &dev->flags)) 4422 s->to_fill++; 4423 else if (dev->toread) 4424 s->to_read++; 4425 if (dev->towrite) { 4426 s->to_write++; 4427 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4428 s->non_overwrite++; 4429 } 4430 if (dev->written) 4431 s->written++; 4432 /* Prefer to use the replacement for reads, but only 4433 * if it is recovered enough and has no bad blocks. 
4434 */ 4435 rdev = rcu_dereference(conf->disks[i].replacement); 4436 if (rdev && !test_bit(Faulty, &rdev->flags) && 4437 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4438 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4439 &first_bad, &bad_sectors)) 4440 set_bit(R5_ReadRepl, &dev->flags); 4441 else { 4442 if (rdev && !test_bit(Faulty, &rdev->flags)) 4443 set_bit(R5_NeedReplace, &dev->flags); 4444 else 4445 clear_bit(R5_NeedReplace, &dev->flags); 4446 rdev = rcu_dereference(conf->disks[i].rdev); 4447 clear_bit(R5_ReadRepl, &dev->flags); 4448 } 4449 if (rdev && test_bit(Faulty, &rdev->flags)) 4450 rdev = NULL; 4451 if (rdev) { 4452 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4453 &first_bad, &bad_sectors); 4454 if (s->blocked_rdev == NULL 4455 && (test_bit(Blocked, &rdev->flags) 4456 || is_bad < 0)) { 4457 if (is_bad < 0) 4458 set_bit(BlockedBadBlocks, 4459 &rdev->flags); 4460 s->blocked_rdev = rdev; 4461 atomic_inc(&rdev->nr_pending); 4462 } 4463 } 4464 clear_bit(R5_Insync, &dev->flags); 4465 if (!rdev) 4466 /* Not in-sync */; 4467 else if (is_bad) { 4468 /* also not in-sync */ 4469 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4470 test_bit(R5_UPTODATE, &dev->flags)) { 4471 /* treat as in-sync, but with a read error 4472 * which we can now try to correct 4473 */ 4474 set_bit(R5_Insync, &dev->flags); 4475 set_bit(R5_ReadError, &dev->flags); 4476 } 4477 } else if (test_bit(In_sync, &rdev->flags)) 4478 set_bit(R5_Insync, &dev->flags); 4479 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4480 /* in sync if before recovery_offset */ 4481 set_bit(R5_Insync, &dev->flags); 4482 else if (test_bit(R5_UPTODATE, &dev->flags) && 4483 test_bit(R5_Expanded, &dev->flags)) 4484 /* If we've reshaped into here, we assume it is Insync. 4485 * We will shortly update recovery_offset to make 4486 * it official. 
4487 */ 4488 set_bit(R5_Insync, &dev->flags); 4489 4490 if (test_bit(R5_WriteError, &dev->flags)) { 4491 /* This flag does not apply to '.replacement' 4492 * only to .rdev, so make sure to check that*/ 4493 struct md_rdev *rdev2 = rcu_dereference( 4494 conf->disks[i].rdev); 4495 if (rdev2 == rdev) 4496 clear_bit(R5_Insync, &dev->flags); 4497 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4498 s->handle_bad_blocks = 1; 4499 atomic_inc(&rdev2->nr_pending); 4500 } else 4501 clear_bit(R5_WriteError, &dev->flags); 4502 } 4503 if (test_bit(R5_MadeGood, &dev->flags)) { 4504 /* This flag does not apply to '.replacement' 4505 * only to .rdev, so make sure to check that*/ 4506 struct md_rdev *rdev2 = rcu_dereference( 4507 conf->disks[i].rdev); 4508 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4509 s->handle_bad_blocks = 1; 4510 atomic_inc(&rdev2->nr_pending); 4511 } else 4512 clear_bit(R5_MadeGood, &dev->flags); 4513 } 4514 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4515 struct md_rdev *rdev2 = rcu_dereference( 4516 conf->disks[i].replacement); 4517 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4518 s->handle_bad_blocks = 1; 4519 atomic_inc(&rdev2->nr_pending); 4520 } else 4521 clear_bit(R5_MadeGoodRepl, &dev->flags); 4522 } 4523 if (!test_bit(R5_Insync, &dev->flags)) { 4524 /* The ReadError flag will just be confusing now */ 4525 clear_bit(R5_ReadError, &dev->flags); 4526 clear_bit(R5_ReWrite, &dev->flags); 4527 } 4528 if (test_bit(R5_ReadError, &dev->flags)) 4529 clear_bit(R5_Insync, &dev->flags); 4530 if (!test_bit(R5_Insync, &dev->flags)) { 4531 if (s->failed < 2) 4532 s->failed_num[s->failed] = i; 4533 s->failed++; 4534 if (rdev && !test_bit(Faulty, &rdev->flags)) 4535 do_recovery = 1; 4536 else if (!rdev) { 4537 rdev = rcu_dereference( 4538 conf->disks[i].replacement); 4539 if (rdev && !test_bit(Faulty, &rdev->flags)) 4540 do_recovery = 1; 4541 } 4542 } 4543 4544 if (test_bit(R5_InJournal, &dev->flags)) 4545 s->injournal++; 4546 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4547 s->just_cached++; 4548 } 4549 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4550 /* If there is a failed device being replaced, 4551 * we must be recovering. 4552 * else if we are after recovery_cp, we must be syncing 4553 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4554 * else we can only be replacing 4555 * sync and recovery both need to read all devices, and so 4556 * use the same flag. 4557 */ 4558 if (do_recovery || 4559 sh->sector >= conf->mddev->recovery_cp || 4560 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4561 s->syncing = 1; 4562 else 4563 s->replacing = 1; 4564 } 4565 rcu_read_unlock(); 4566 } 4567 4568 static int clear_batch_ready(struct stripe_head *sh) 4569 { 4570 /* Return '1' if this is a member of batch, or 4571 * '0' if it is a lone stripe or a head which can now be 4572 * handled. 
4573 */ 4574 struct stripe_head *tmp; 4575 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4576 return (sh->batch_head && sh->batch_head != sh); 4577 spin_lock(&sh->stripe_lock); 4578 if (!sh->batch_head) { 4579 spin_unlock(&sh->stripe_lock); 4580 return 0; 4581 } 4582 4583 /* 4584 * this stripe could be added to a batch list before we check 4585 * BATCH_READY, skips it 4586 */ 4587 if (sh->batch_head != sh) { 4588 spin_unlock(&sh->stripe_lock); 4589 return 1; 4590 } 4591 spin_lock(&sh->batch_lock); 4592 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4593 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4594 spin_unlock(&sh->batch_lock); 4595 spin_unlock(&sh->stripe_lock); 4596 4597 /* 4598 * BATCH_READY is cleared, no new stripes can be added. 4599 * batch_list can be accessed without lock 4600 */ 4601 return 0; 4602 } 4603 4604 static void break_stripe_batch_list(struct stripe_head *head_sh, 4605 unsigned long handle_flags) 4606 { 4607 struct stripe_head *sh, *next; 4608 int i; 4609 int do_wakeup = 0; 4610 4611 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4612 4613 list_del_init(&sh->batch_list); 4614 4615 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4616 (1 << STRIPE_SYNCING) | 4617 (1 << STRIPE_REPLACED) | 4618 (1 << STRIPE_DELAYED) | 4619 (1 << STRIPE_BIT_DELAY) | 4620 (1 << STRIPE_FULL_WRITE) | 4621 (1 << STRIPE_BIOFILL_RUN) | 4622 (1 << STRIPE_COMPUTE_RUN) | 4623 (1 << STRIPE_OPS_REQ_PENDING) | 4624 (1 << STRIPE_DISCARD) | 4625 (1 << STRIPE_BATCH_READY) | 4626 (1 << STRIPE_BATCH_ERR) | 4627 (1 << STRIPE_BITMAP_PENDING)), 4628 "stripe state: %lx\n", sh->state); 4629 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4630 (1 << STRIPE_REPLACED)), 4631 "head stripe state: %lx\n", head_sh->state); 4632 4633 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4634 (1 << STRIPE_PREREAD_ACTIVE) | 4635 (1 << STRIPE_DEGRADED) | 4636 (1 << STRIPE_ON_UNPLUG_LIST)), 4637 head_sh->state & (1 << STRIPE_INSYNC)); 4638 4639 sh->check_state = head_sh->check_state; 4640 sh->reconstruct_state = head_sh->reconstruct_state; 4641 spin_lock_irq(&sh->stripe_lock); 4642 sh->batch_head = NULL; 4643 spin_unlock_irq(&sh->stripe_lock); 4644 for (i = 0; i < sh->disks; i++) { 4645 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4646 do_wakeup = 1; 4647 sh->dev[i].flags = head_sh->dev[i].flags & 4648 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4649 } 4650 if (handle_flags == 0 || 4651 sh->state & handle_flags) 4652 set_bit(STRIPE_HANDLE, &sh->state); 4653 raid5_release_stripe(sh); 4654 } 4655 spin_lock_irq(&head_sh->stripe_lock); 4656 head_sh->batch_head = NULL; 4657 spin_unlock_irq(&head_sh->stripe_lock); 4658 for (i = 0; i < head_sh->disks; i++) 4659 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4660 do_wakeup = 1; 4661 if (head_sh->state & handle_flags) 4662 set_bit(STRIPE_HANDLE, &head_sh->state); 4663 4664 if (do_wakeup) 4665 wake_up(&head_sh->raid_conf->wait_for_overlap); 4666 } 4667 4668 static void handle_stripe(struct stripe_head *sh) 4669 { 4670 struct stripe_head_state s; 4671 struct r5conf *conf = sh->raid_conf; 4672 int i; 4673 int prexor; 4674 int disks = sh->disks; 4675 struct r5dev *pdev, *qdev; 4676 4677 clear_bit(STRIPE_HANDLE, &sh->state); 4678 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4679 /* already being handled, ensure it gets handled 4680 * again when current action finishes */ 4681 set_bit(STRIPE_HANDLE, &sh->state); 4682 return; 4683 } 4684 4685 if (clear_batch_ready(sh) ) { 4686 
clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4687 return; 4688 } 4689 4690 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4691 break_stripe_batch_list(sh, 0); 4692 4693 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4694 spin_lock(&sh->stripe_lock); 4695 /* 4696 * Cannot process 'sync' concurrently with 'discard'. 4697 * Flush data in r5cache before 'sync'. 4698 */ 4699 if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 4700 !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && 4701 !test_bit(STRIPE_DISCARD, &sh->state) && 4702 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4703 set_bit(STRIPE_SYNCING, &sh->state); 4704 clear_bit(STRIPE_INSYNC, &sh->state); 4705 clear_bit(STRIPE_REPLACED, &sh->state); 4706 } 4707 spin_unlock(&sh->stripe_lock); 4708 } 4709 clear_bit(STRIPE_DELAYED, &sh->state); 4710 4711 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4712 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 4713 (unsigned long long)sh->sector, sh->state, 4714 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4715 sh->check_state, sh->reconstruct_state); 4716 4717 analyse_stripe(sh, &s); 4718 4719 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4720 goto finish; 4721 4722 if (s.handle_bad_blocks || 4723 test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4724 set_bit(STRIPE_HANDLE, &sh->state); 4725 goto finish; 4726 } 4727 4728 if (unlikely(s.blocked_rdev)) { 4729 if (s.syncing || s.expanding || s.expanded || 4730 s.replacing || s.to_write || s.written) { 4731 set_bit(STRIPE_HANDLE, &sh->state); 4732 goto finish; 4733 } 4734 /* There is nothing for the blocked_rdev to block */ 4735 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4736 s.blocked_rdev = NULL; 4737 } 4738 4739 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4740 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4741 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4742 } 4743 4744 pr_debug("locked=%d uptodate=%d to_read=%d" 4745 " to_write=%d failed=%d failed_num=%d,%d\n", 4746 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4747 s.failed_num[0], s.failed_num[1]); 4748 /* 4749 * check if the array has lost more than max_degraded devices and, 4750 * if so, some requests might need to be failed.
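 * (max_degraded is 1 for raid4/raid5 and 2 for raid6, so exceeding it
 * means the stripe can no longer be reconstructed.)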
4751 * 4752 * When journal device failed (log_failed), we will only process 4753 * the stripe if there is data need write to raid disks 4754 */ 4755 if (s.failed > conf->max_degraded || 4756 (s.log_failed && s.injournal == 0)) { 4757 sh->check_state = 0; 4758 sh->reconstruct_state = 0; 4759 break_stripe_batch_list(sh, 0); 4760 if (s.to_read+s.to_write+s.written) 4761 handle_failed_stripe(conf, sh, &s, disks); 4762 if (s.syncing + s.replacing) 4763 handle_failed_sync(conf, sh, &s); 4764 } 4765 4766 /* Now we check to see if any write operations have recently 4767 * completed 4768 */ 4769 prexor = 0; 4770 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4771 prexor = 1; 4772 if (sh->reconstruct_state == reconstruct_state_drain_result || 4773 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4774 sh->reconstruct_state = reconstruct_state_idle; 4775 4776 /* All the 'written' buffers and the parity block are ready to 4777 * be written back to disk 4778 */ 4779 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4780 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4781 BUG_ON(sh->qd_idx >= 0 && 4782 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4783 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4784 for (i = disks; i--; ) { 4785 struct r5dev *dev = &sh->dev[i]; 4786 if (test_bit(R5_LOCKED, &dev->flags) && 4787 (i == sh->pd_idx || i == sh->qd_idx || 4788 dev->written || test_bit(R5_InJournal, 4789 &dev->flags))) { 4790 pr_debug("Writing block %d\n", i); 4791 set_bit(R5_Wantwrite, &dev->flags); 4792 if (prexor) 4793 continue; 4794 if (s.failed > 1) 4795 continue; 4796 if (!test_bit(R5_Insync, &dev->flags) || 4797 ((i == sh->pd_idx || i == sh->qd_idx) && 4798 s.failed == 0)) 4799 set_bit(STRIPE_INSYNC, &sh->state); 4800 } 4801 } 4802 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4803 s.dec_preread_active = 1; 4804 } 4805 4806 /* 4807 * might be able to return some write requests if the parity blocks 4808 * are safe, or on a failed drive 4809 */ 4810 pdev = &sh->dev[sh->pd_idx]; 4811 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4812 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4813 qdev = &sh->dev[sh->qd_idx]; 4814 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4815 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4816 || conf->level < 6; 4817 4818 if (s.written && 4819 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4820 && !test_bit(R5_LOCKED, &pdev->flags) 4821 && (test_bit(R5_UPTODATE, &pdev->flags) || 4822 test_bit(R5_Discard, &pdev->flags))))) && 4823 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4824 && !test_bit(R5_LOCKED, &qdev->flags) 4825 && (test_bit(R5_UPTODATE, &qdev->flags) || 4826 test_bit(R5_Discard, &qdev->flags)))))) 4827 handle_stripe_clean_event(conf, sh, disks); 4828 4829 if (s.just_cached) 4830 r5c_handle_cached_data_endio(conf, sh, disks); 4831 log_stripe_write_finished(sh); 4832 4833 /* Now we might consider reading some blocks, either to check/generate 4834 * parity, or to satisfy requests 4835 * or to load a block that is being partially written. 
4836 */ 4837 if (s.to_read || s.non_overwrite 4838 || (conf->level == 6 && s.to_write && s.failed) 4839 || (s.syncing && (s.uptodate + s.compute < disks)) 4840 || s.replacing 4841 || s.expanding) 4842 handle_stripe_fill(sh, &s, disks); 4843 4844 /* 4845 * When the stripe finishes full journal write cycle (write to journal 4846 * and raid disk), this is the clean up procedure so it is ready for 4847 * next operation. 4848 */ 4849 r5c_finish_stripe_write_out(conf, sh, &s); 4850 4851 /* 4852 * Now to consider new write requests, cache write back and what else, 4853 * if anything should be read. We do not handle new writes when: 4854 * 1/ A 'write' operation (copy+xor) is already in flight. 4855 * 2/ A 'check' operation is in flight, as it may clobber the parity 4856 * block. 4857 * 3/ A r5c cache log write is in flight. 4858 */ 4859 4860 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4861 if (!r5c_is_writeback(conf->log)) { 4862 if (s.to_write) 4863 handle_stripe_dirtying(conf, sh, &s, disks); 4864 } else { /* write back cache */ 4865 int ret = 0; 4866 4867 /* First, try handle writes in caching phase */ 4868 if (s.to_write) 4869 ret = r5c_try_caching_write(conf, sh, &s, 4870 disks); 4871 /* 4872 * If caching phase failed: ret == -EAGAIN 4873 * OR 4874 * stripe under reclaim: !caching && injournal 4875 * 4876 * fall back to handle_stripe_dirtying() 4877 */ 4878 if (ret == -EAGAIN || 4879 /* stripe under reclaim: !caching && injournal */ 4880 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4881 s.injournal > 0)) { 4882 ret = handle_stripe_dirtying(conf, sh, &s, 4883 disks); 4884 if (ret == -EAGAIN) 4885 goto finish; 4886 } 4887 } 4888 } 4889 4890 /* maybe we need to check and possibly fix the parity for this stripe 4891 * Any reads will already have been scheduled, so we just see if enough 4892 * data is available. The parity check is held off while parity 4893 * dependent operations are in flight. 
4894 */ 4895 if (sh->check_state || 4896 (s.syncing && s.locked == 0 && 4897 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4898 !test_bit(STRIPE_INSYNC, &sh->state))) { 4899 if (conf->level == 6) 4900 handle_parity_checks6(conf, sh, &s, disks); 4901 else 4902 handle_parity_checks5(conf, sh, &s, disks); 4903 } 4904 4905 if ((s.replacing || s.syncing) && s.locked == 0 4906 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4907 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4908 /* Write out to replacement devices where possible */ 4909 for (i = 0; i < conf->raid_disks; i++) 4910 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4911 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4912 set_bit(R5_WantReplace, &sh->dev[i].flags); 4913 set_bit(R5_LOCKED, &sh->dev[i].flags); 4914 s.locked++; 4915 } 4916 if (s.replacing) 4917 set_bit(STRIPE_INSYNC, &sh->state); 4918 set_bit(STRIPE_REPLACED, &sh->state); 4919 } 4920 if ((s.syncing || s.replacing) && s.locked == 0 && 4921 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4922 test_bit(STRIPE_INSYNC, &sh->state)) { 4923 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4924 clear_bit(STRIPE_SYNCING, &sh->state); 4925 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4926 wake_up(&conf->wait_for_overlap); 4927 } 4928 4929 /* If the failed drives are just a ReadError, then we might need 4930 * to progress the repair/check process 4931 */ 4932 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4933 for (i = 0; i < s.failed; i++) { 4934 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4935 if (test_bit(R5_ReadError, &dev->flags) 4936 && !test_bit(R5_LOCKED, &dev->flags) 4937 && test_bit(R5_UPTODATE, &dev->flags) 4938 ) { 4939 if (!test_bit(R5_ReWrite, &dev->flags)) { 4940 set_bit(R5_Wantwrite, &dev->flags); 4941 set_bit(R5_ReWrite, &dev->flags); 4942 set_bit(R5_LOCKED, &dev->flags); 4943 s.locked++; 4944 } else { 4945 /* let's read it back */ 4946 set_bit(R5_Wantread, &dev->flags); 4947 set_bit(R5_LOCKED, &dev->flags); 4948 s.locked++; 4949 } 4950 } 4951 } 4952 4953 /* Finish reconstruct operations initiated by the expansion process */ 4954 if (sh->reconstruct_state == reconstruct_state_result) { 4955 struct stripe_head *sh_src 4956 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4957 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4958 /* sh cannot be written until sh_src has been read. 
4959 * so arrange for sh to be delayed a little 4960 */ 4961 set_bit(STRIPE_DELAYED, &sh->state); 4962 set_bit(STRIPE_HANDLE, &sh->state); 4963 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4964 &sh_src->state)) 4965 atomic_inc(&conf->preread_active_stripes); 4966 raid5_release_stripe(sh_src); 4967 goto finish; 4968 } 4969 if (sh_src) 4970 raid5_release_stripe(sh_src); 4971 4972 sh->reconstruct_state = reconstruct_state_idle; 4973 clear_bit(STRIPE_EXPANDING, &sh->state); 4974 for (i = conf->raid_disks; i--; ) { 4975 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4976 set_bit(R5_LOCKED, &sh->dev[i].flags); 4977 s.locked++; 4978 } 4979 } 4980 4981 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4982 !sh->reconstruct_state) { 4983 /* Need to write out all blocks after computing parity */ 4984 sh->disks = conf->raid_disks; 4985 stripe_set_idx(sh->sector, conf, 0, sh); 4986 schedule_reconstruction(sh, &s, 1, 1); 4987 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4988 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4989 atomic_dec(&conf->reshape_stripes); 4990 wake_up(&conf->wait_for_overlap); 4991 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4992 } 4993 4994 if (s.expanding && s.locked == 0 && 4995 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4996 handle_stripe_expansion(conf, sh); 4997 4998 finish: 4999 /* wait for this device to become unblocked */ 5000 if (unlikely(s.blocked_rdev)) { 5001 if (conf->mddev->external) 5002 md_wait_for_blocked_rdev(s.blocked_rdev, 5003 conf->mddev); 5004 else 5005 /* Internal metadata will immediately 5006 * be written by raid5d, so we don't 5007 * need to wait here. 5008 */ 5009 rdev_dec_pending(s.blocked_rdev, 5010 conf->mddev); 5011 } 5012 5013 if (s.handle_bad_blocks) 5014 for (i = disks; i--; ) { 5015 struct md_rdev *rdev; 5016 struct r5dev *dev = &sh->dev[i]; 5017 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 5018 /* We own a safe reference to the rdev */ 5019 rdev = conf->disks[i].rdev; 5020 if (!rdev_set_badblocks(rdev, sh->sector, 5021 STRIPE_SECTORS, 0)) 5022 md_error(conf->mddev, rdev); 5023 rdev_dec_pending(rdev, conf->mddev); 5024 } 5025 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 5026 rdev = conf->disks[i].rdev; 5027 rdev_clear_badblocks(rdev, sh->sector, 5028 STRIPE_SECTORS, 0); 5029 rdev_dec_pending(rdev, conf->mddev); 5030 } 5031 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 5032 rdev = conf->disks[i].replacement; 5033 if (!rdev) 5034 /* rdev have been moved down */ 5035 rdev = conf->disks[i].rdev; 5036 rdev_clear_badblocks(rdev, sh->sector, 5037 STRIPE_SECTORS, 0); 5038 rdev_dec_pending(rdev, conf->mddev); 5039 } 5040 } 5041 5042 if (s.ops_request) 5043 raid_run_ops(sh, s.ops_request); 5044 5045 ops_run_io(sh, &s); 5046 5047 if (s.dec_preread_active) { 5048 /* We delay this until after ops_run_io so that if make_request 5049 * is waiting on a flush, it won't continue until the writes 5050 * have actually been submitted. 
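 * (Dropping preread_active_stripes back below IO_THRESHOLD also wakes the
 * raid5d thread so that delayed stripes can be activated again.)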
5051 */ 5052 atomic_dec(&conf->preread_active_stripes); 5053 if (atomic_read(&conf->preread_active_stripes) < 5054 IO_THRESHOLD) 5055 md_wakeup_thread(conf->mddev->thread); 5056 } 5057 5058 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 5059 } 5060 5061 static void raid5_activate_delayed(struct r5conf *conf) 5062 { 5063 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 5064 while (!list_empty(&conf->delayed_list)) { 5065 struct list_head *l = conf->delayed_list.next; 5066 struct stripe_head *sh; 5067 sh = list_entry(l, struct stripe_head, lru); 5068 list_del_init(l); 5069 clear_bit(STRIPE_DELAYED, &sh->state); 5070 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5071 atomic_inc(&conf->preread_active_stripes); 5072 list_add_tail(&sh->lru, &conf->hold_list); 5073 raid5_wakeup_stripe_thread(sh); 5074 } 5075 } 5076 } 5077 5078 static void activate_bit_delay(struct r5conf *conf, 5079 struct list_head *temp_inactive_list) 5080 { 5081 /* device_lock is held */ 5082 struct list_head head; 5083 list_add(&head, &conf->bitmap_list); 5084 list_del_init(&conf->bitmap_list); 5085 while (!list_empty(&head)) { 5086 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 5087 int hash; 5088 list_del_init(&sh->lru); 5089 atomic_inc(&sh->count); 5090 hash = sh->hash_lock_index; 5091 __release_stripe(conf, sh, &temp_inactive_list[hash]); 5092 } 5093 } 5094 5095 static int raid5_congested(struct mddev *mddev, int bits) 5096 { 5097 struct r5conf *conf = mddev->private; 5098 5099 /* No difference between reads and writes. Just check 5100 * how busy the stripe_cache is 5101 */ 5102 5103 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 5104 return 1; 5105 5106 /* Also checks whether there is pressure on r5cache log space */ 5107 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 5108 return 1; 5109 if (conf->quiesce) 5110 return 1; 5111 if (atomic_read(&conf->empty_inactive_list_nr)) 5112 return 1; 5113 5114 return 0; 5115 } 5116 5117 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 5118 { 5119 struct r5conf *conf = mddev->private; 5120 sector_t sector = bio->bi_iter.bi_sector; 5121 unsigned int chunk_sectors; 5122 unsigned int bio_sectors = bio_sectors(bio); 5123 5124 WARN_ON_ONCE(bio->bi_partno); 5125 5126 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 5127 return chunk_sectors >= 5128 ((sector & (chunk_sectors - 1)) + bio_sectors); 5129 } 5130 5131 /* 5132 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 5133 * later sampled by raid5d. 
5134 */ 5135 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 5136 { 5137 unsigned long flags; 5138 5139 spin_lock_irqsave(&conf->device_lock, flags); 5140 5141 bi->bi_next = conf->retry_read_aligned_list; 5142 conf->retry_read_aligned_list = bi; 5143 5144 spin_unlock_irqrestore(&conf->device_lock, flags); 5145 md_wakeup_thread(conf->mddev->thread); 5146 } 5147 5148 static struct bio *remove_bio_from_retry(struct r5conf *conf, 5149 unsigned int *offset) 5150 { 5151 struct bio *bi; 5152 5153 bi = conf->retry_read_aligned; 5154 if (bi) { 5155 *offset = conf->retry_read_offset; 5156 conf->retry_read_aligned = NULL; 5157 return bi; 5158 } 5159 bi = conf->retry_read_aligned_list; 5160 if(bi) { 5161 conf->retry_read_aligned_list = bi->bi_next; 5162 bi->bi_next = NULL; 5163 *offset = 0; 5164 } 5165 5166 return bi; 5167 } 5168 5169 /* 5170 * The "raid5_align_endio" should check if the read succeeded and if it 5171 * did, call bio_endio on the original bio (having bio_put the new bio 5172 * first). 5173 * If the read failed.. 5174 */ 5175 static void raid5_align_endio(struct bio *bi) 5176 { 5177 struct bio* raid_bi = bi->bi_private; 5178 struct mddev *mddev; 5179 struct r5conf *conf; 5180 struct md_rdev *rdev; 5181 blk_status_t error = bi->bi_status; 5182 5183 bio_put(bi); 5184 5185 rdev = (void*)raid_bi->bi_next; 5186 raid_bi->bi_next = NULL; 5187 mddev = rdev->mddev; 5188 conf = mddev->private; 5189 5190 rdev_dec_pending(rdev, conf->mddev); 5191 5192 if (!error) { 5193 bio_endio(raid_bi); 5194 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5195 wake_up(&conf->wait_for_quiescent); 5196 return; 5197 } 5198 5199 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 5200 5201 add_bio_to_retry(raid_bi, conf); 5202 } 5203 5204 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 5205 { 5206 struct r5conf *conf = mddev->private; 5207 int dd_idx; 5208 struct bio* align_bi; 5209 struct md_rdev *rdev; 5210 sector_t end_sector; 5211 5212 if (!in_chunk_boundary(mddev, raid_bio)) { 5213 pr_debug("%s: non aligned\n", __func__); 5214 return 0; 5215 } 5216 /* 5217 * use bio_clone_fast to make a copy of the bio 5218 */ 5219 align_bi = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); 5220 if (!align_bi) 5221 return 0; 5222 /* 5223 * set bi_end_io to a new function, and set bi_private to the 5224 * original bio. 
5225 */ 5226 align_bi->bi_end_io = raid5_align_endio; 5227 align_bi->bi_private = raid_bio; 5228 /* 5229 * compute position 5230 */ 5231 align_bi->bi_iter.bi_sector = 5232 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5233 0, &dd_idx, NULL); 5234 5235 end_sector = bio_end_sector(align_bi); 5236 rcu_read_lock(); 5237 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5238 if (!rdev || test_bit(Faulty, &rdev->flags) || 5239 rdev->recovery_offset < end_sector) { 5240 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5241 if (rdev && 5242 (test_bit(Faulty, &rdev->flags) || 5243 !(test_bit(In_sync, &rdev->flags) || 5244 rdev->recovery_offset >= end_sector))) 5245 rdev = NULL; 5246 } 5247 5248 if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { 5249 rcu_read_unlock(); 5250 bio_put(align_bi); 5251 return 0; 5252 } 5253 5254 if (rdev) { 5255 sector_t first_bad; 5256 int bad_sectors; 5257 5258 atomic_inc(&rdev->nr_pending); 5259 rcu_read_unlock(); 5260 raid_bio->bi_next = (void*)rdev; 5261 bio_set_dev(align_bi, rdev->bdev); 5262 bio_clear_flag(align_bi, BIO_SEG_VALID); 5263 5264 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5265 bio_sectors(align_bi), 5266 &first_bad, &bad_sectors)) { 5267 bio_put(align_bi); 5268 rdev_dec_pending(rdev, mddev); 5269 return 0; 5270 } 5271 5272 /* No reshape active, so we can trust rdev->data_offset */ 5273 align_bi->bi_iter.bi_sector += rdev->data_offset; 5274 5275 spin_lock_irq(&conf->device_lock); 5276 wait_event_lock_irq(conf->wait_for_quiescent, 5277 conf->quiesce == 0, 5278 conf->device_lock); 5279 atomic_inc(&conf->active_aligned_reads); 5280 spin_unlock_irq(&conf->device_lock); 5281 5282 if (mddev->gendisk) 5283 trace_block_bio_remap(align_bi->bi_disk->queue, 5284 align_bi, disk_devt(mddev->gendisk), 5285 raid_bio->bi_iter.bi_sector); 5286 generic_make_request(align_bi); 5287 return 1; 5288 } else { 5289 rcu_read_unlock(); 5290 bio_put(align_bi); 5291 return 0; 5292 } 5293 } 5294 5295 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5296 { 5297 struct bio *split; 5298 sector_t sector = raid_bio->bi_iter.bi_sector; 5299 unsigned chunk_sects = mddev->chunk_sectors; 5300 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5301 5302 if (sectors < bio_sectors(raid_bio)) { 5303 struct r5conf *conf = mddev->private; 5304 split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); 5305 bio_chain(split, raid_bio); 5306 generic_make_request(raid_bio); 5307 raid_bio = split; 5308 } 5309 5310 if (!raid5_read_one_chunk(mddev, raid_bio)) 5311 return raid_bio; 5312 5313 return NULL; 5314 } 5315 5316 /* __get_priority_stripe - get the next stripe to process 5317 * 5318 * Full stripe writes are allowed to pass preread active stripes up until 5319 * the bypass_threshold is exceeded. In general the bypass_count 5320 * increments when the handle_list is handled before the hold_list; however, it 5321 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5322 * stripe with in flight i/o. The bypass_count will be reset when the 5323 * head of the hold_list has changed, i.e. the head was promoted to the 5324 * handle_list. 
5325 */ 5326 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5327 { 5328 struct stripe_head *sh, *tmp; 5329 struct list_head *handle_list = NULL; 5330 struct r5worker_group *wg; 5331 bool second_try = !r5c_is_writeback(conf->log) && 5332 !r5l_log_disk_error(conf); 5333 bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || 5334 r5l_log_disk_error(conf); 5335 5336 again: 5337 wg = NULL; 5338 sh = NULL; 5339 if (conf->worker_cnt_per_group == 0) { 5340 handle_list = try_loprio ? &conf->loprio_list : 5341 &conf->handle_list; 5342 } else if (group != ANY_GROUP) { 5343 handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : 5344 &conf->worker_groups[group].handle_list; 5345 wg = &conf->worker_groups[group]; 5346 } else { 5347 int i; 5348 for (i = 0; i < conf->group_cnt; i++) { 5349 handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : 5350 &conf->worker_groups[i].handle_list; 5351 wg = &conf->worker_groups[i]; 5352 if (!list_empty(handle_list)) 5353 break; 5354 } 5355 } 5356 5357 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5358 __func__, 5359 list_empty(handle_list) ? "empty" : "busy", 5360 list_empty(&conf->hold_list) ? "empty" : "busy", 5361 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5362 5363 if (!list_empty(handle_list)) { 5364 sh = list_entry(handle_list->next, typeof(*sh), lru); 5365 5366 if (list_empty(&conf->hold_list)) 5367 conf->bypass_count = 0; 5368 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5369 if (conf->hold_list.next == conf->last_hold) 5370 conf->bypass_count++; 5371 else { 5372 conf->last_hold = conf->hold_list.next; 5373 conf->bypass_count -= conf->bypass_threshold; 5374 if (conf->bypass_count < 0) 5375 conf->bypass_count = 0; 5376 } 5377 } 5378 } else if (!list_empty(&conf->hold_list) && 5379 ((conf->bypass_threshold && 5380 conf->bypass_count > conf->bypass_threshold) || 5381 atomic_read(&conf->pending_full_writes) == 0)) { 5382 5383 list_for_each_entry(tmp, &conf->hold_list, lru) { 5384 if (conf->worker_cnt_per_group == 0 || 5385 group == ANY_GROUP || 5386 !cpu_online(tmp->cpu) || 5387 cpu_to_group(tmp->cpu) == group) { 5388 sh = tmp; 5389 break; 5390 } 5391 } 5392 5393 if (sh) { 5394 conf->bypass_count -= conf->bypass_threshold; 5395 if (conf->bypass_count < 0) 5396 conf->bypass_count = 0; 5397 } 5398 wg = NULL; 5399 } 5400 5401 if (!sh) { 5402 if (second_try) 5403 return NULL; 5404 second_try = true; 5405 try_loprio = !try_loprio; 5406 goto again; 5407 } 5408 5409 if (wg) { 5410 wg->stripes_cnt--; 5411 sh->group = NULL; 5412 } 5413 list_del_init(&sh->lru); 5414 BUG_ON(atomic_inc_return(&sh->count) != 1); 5415 return sh; 5416 } 5417 5418 struct raid5_plug_cb { 5419 struct blk_plug_cb cb; 5420 struct list_head list; 5421 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5422 }; 5423 5424 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5425 { 5426 struct raid5_plug_cb *cb = container_of( 5427 blk_cb, struct raid5_plug_cb, cb); 5428 struct stripe_head *sh; 5429 struct mddev *mddev = cb->cb.data; 5430 struct r5conf *conf = mddev->private; 5431 int cnt = 0; 5432 int hash; 5433 5434 if (cb->list.next && !list_empty(&cb->list)) { 5435 spin_lock_irq(&conf->device_lock); 5436 while (!list_empty(&cb->list)) { 5437 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5438 list_del_init(&sh->lru); 5439 /* 5440 * avoid race release_stripe_plug() sees 5441 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5442 * is still in our 
list 5443 */ 5444 smp_mb__before_atomic(); 5445 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5446 /* 5447 * STRIPE_ON_RELEASE_LIST could be set here. In that 5448 * case, the count is always > 1 here 5449 */ 5450 hash = sh->hash_lock_index; 5451 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5452 cnt++; 5453 } 5454 spin_unlock_irq(&conf->device_lock); 5455 } 5456 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5457 NR_STRIPE_HASH_LOCKS); 5458 if (mddev->queue) 5459 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5460 kfree(cb); 5461 } 5462 5463 static void release_stripe_plug(struct mddev *mddev, 5464 struct stripe_head *sh) 5465 { 5466 struct blk_plug_cb *blk_cb = blk_check_plugged( 5467 raid5_unplug, mddev, 5468 sizeof(struct raid5_plug_cb)); 5469 struct raid5_plug_cb *cb; 5470 5471 if (!blk_cb) { 5472 raid5_release_stripe(sh); 5473 return; 5474 } 5475 5476 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5477 5478 if (cb->list.next == NULL) { 5479 int i; 5480 INIT_LIST_HEAD(&cb->list); 5481 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5482 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5483 } 5484 5485 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5486 list_add_tail(&sh->lru, &cb->list); 5487 else 5488 raid5_release_stripe(sh); 5489 } 5490 5491 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5492 { 5493 struct r5conf *conf = mddev->private; 5494 sector_t logical_sector, last_sector; 5495 struct stripe_head *sh; 5496 int stripe_sectors; 5497 5498 if (mddev->reshape_position != MaxSector) 5499 /* Skip discard while reshape is happening */ 5500 return; 5501 5502 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5503 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5504 5505 bi->bi_next = NULL; 5506 5507 stripe_sectors = conf->chunk_sectors * 5508 (conf->raid_disks - conf->max_degraded); 5509 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5510 stripe_sectors); 5511 sector_div(last_sector, stripe_sectors); 5512 5513 logical_sector *= conf->chunk_sectors; 5514 last_sector *= conf->chunk_sectors; 5515 5516 for (; logical_sector < last_sector; 5517 logical_sector += STRIPE_SECTORS) { 5518 DEFINE_WAIT(w); 5519 int d; 5520 again: 5521 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5522 prepare_to_wait(&conf->wait_for_overlap, &w, 5523 TASK_UNINTERRUPTIBLE); 5524 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5525 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5526 raid5_release_stripe(sh); 5527 schedule(); 5528 goto again; 5529 } 5530 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5531 spin_lock_irq(&sh->stripe_lock); 5532 for (d = 0; d < conf->raid_disks; d++) { 5533 if (d == sh->pd_idx || d == sh->qd_idx) 5534 continue; 5535 if (sh->dev[d].towrite || sh->dev[d].toread) { 5536 set_bit(R5_Overlap, &sh->dev[d].flags); 5537 spin_unlock_irq(&sh->stripe_lock); 5538 raid5_release_stripe(sh); 5539 schedule(); 5540 goto again; 5541 } 5542 } 5543 set_bit(STRIPE_DISCARD, &sh->state); 5544 finish_wait(&conf->wait_for_overlap, &w); 5545 sh->overwrite_disks = 0; 5546 for (d = 0; d < conf->raid_disks; d++) { 5547 if (d == sh->pd_idx || d == sh->qd_idx) 5548 continue; 5549 sh->dev[d].towrite = bi; 5550 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5551 bio_inc_remaining(bi); 5552 md_write_inc(mddev, bi); 5553 sh->overwrite_disks++; 5554 } 5555 spin_unlock_irq(&sh->stripe_lock); 5556 if (conf->mddev->bitmap) { 5557 for (d = 0; 5558 d < conf->raid_disks - conf->max_degraded; 5559 d++) 5560 
md_bitmap_startwrite(mddev->bitmap, 5561 sh->sector, 5562 STRIPE_SECTORS, 5563 0); 5564 sh->bm_seq = conf->seq_flush + 1; 5565 set_bit(STRIPE_BIT_DELAY, &sh->state); 5566 } 5567 5568 set_bit(STRIPE_HANDLE, &sh->state); 5569 clear_bit(STRIPE_DELAYED, &sh->state); 5570 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5571 atomic_inc(&conf->preread_active_stripes); 5572 release_stripe_plug(mddev, sh); 5573 } 5574 5575 bio_endio(bi); 5576 } 5577 5578 static bool raid5_make_request(struct mddev *mddev, struct bio * bi) 5579 { 5580 struct r5conf *conf = mddev->private; 5581 int dd_idx; 5582 sector_t new_sector; 5583 sector_t logical_sector, last_sector; 5584 struct stripe_head *sh; 5585 const int rw = bio_data_dir(bi); 5586 DEFINE_WAIT(w); 5587 bool do_prepare; 5588 bool do_flush = false; 5589 5590 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5591 int ret = log_handle_flush_request(conf, bi); 5592 5593 if (ret == 0) 5594 return true; 5595 if (ret == -ENODEV) { 5596 md_flush_request(mddev, bi); 5597 return true; 5598 } 5599 /* ret == -EAGAIN, fallback */ 5600 /* 5601 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5602 * we need to flush journal device 5603 */ 5604 do_flush = bi->bi_opf & REQ_PREFLUSH; 5605 } 5606 5607 if (!md_write_start(mddev, bi)) 5608 return false; 5609 /* 5610 * If array is degraded, better not do chunk aligned read because 5611 * later we might have to read it again in order to reconstruct 5612 * data on failed drives. 5613 */ 5614 if (rw == READ && mddev->degraded == 0 && 5615 mddev->reshape_position == MaxSector) { 5616 bi = chunk_aligned_read(mddev, bi); 5617 if (!bi) 5618 return true; 5619 } 5620 5621 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5622 make_discard_request(mddev, bi); 5623 md_write_end(mddev); 5624 return true; 5625 } 5626 5627 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5628 last_sector = bio_end_sector(bi); 5629 bi->bi_next = NULL; 5630 5631 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5632 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5633 int previous; 5634 int seq; 5635 5636 do_prepare = false; 5637 retry: 5638 seq = read_seqcount_begin(&conf->gen_lock); 5639 previous = 0; 5640 if (do_prepare) 5641 prepare_to_wait(&conf->wait_for_overlap, &w, 5642 TASK_UNINTERRUPTIBLE); 5643 if (unlikely(conf->reshape_progress != MaxSector)) { 5644 /* spinlock is needed as reshape_progress may be 5645 * 64bit on a 32bit platform, and so it might be 5646 * possible to see a half-updated value 5647 * Of course reshape_progress could change after 5648 * the lock is dropped, so once we get a reference 5649 * to the stripe that we think it is, we will have 5650 * to check again. 5651 */ 5652 spin_lock_irq(&conf->device_lock); 5653 if (mddev->reshape_backwards 5654 ? logical_sector < conf->reshape_progress 5655 : logical_sector >= conf->reshape_progress) { 5656 previous = 1; 5657 } else { 5658 if (mddev->reshape_backwards 5659 ? 
logical_sector < conf->reshape_safe 5660 : logical_sector >= conf->reshape_safe) { 5661 spin_unlock_irq(&conf->device_lock); 5662 schedule(); 5663 do_prepare = true; 5664 goto retry; 5665 } 5666 } 5667 spin_unlock_irq(&conf->device_lock); 5668 } 5669 5670 new_sector = raid5_compute_sector(conf, logical_sector, 5671 previous, 5672 &dd_idx, NULL); 5673 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5674 (unsigned long long)new_sector, 5675 (unsigned long long)logical_sector); 5676 5677 sh = raid5_get_active_stripe(conf, new_sector, previous, 5678 (bi->bi_opf & REQ_RAHEAD), 0); 5679 if (sh) { 5680 if (unlikely(previous)) { 5681 /* expansion might have moved on while waiting for a 5682 * stripe, so we must do the range check again. 5683 * Expansion could still move past after this 5684 * test, but as we are holding a reference to 5685 * 'sh', we know that if that happens, 5686 * STRIPE_EXPANDING will get set and the expansion 5687 * won't proceed until we finish with the stripe. 5688 */ 5689 int must_retry = 0; 5690 spin_lock_irq(&conf->device_lock); 5691 if (mddev->reshape_backwards 5692 ? logical_sector >= conf->reshape_progress 5693 : logical_sector < conf->reshape_progress) 5694 /* mismatch, need to try again */ 5695 must_retry = 1; 5696 spin_unlock_irq(&conf->device_lock); 5697 if (must_retry) { 5698 raid5_release_stripe(sh); 5699 schedule(); 5700 do_prepare = true; 5701 goto retry; 5702 } 5703 } 5704 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5705 /* Might have got the wrong stripe_head 5706 * by accident 5707 */ 5708 raid5_release_stripe(sh); 5709 goto retry; 5710 } 5711 5712 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5713 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5714 /* Stripe is busy expanding or 5715 * add failed due to overlap. Flush everything 5716 * and wait a while 5717 */ 5718 md_wakeup_thread(mddev->thread); 5719 raid5_release_stripe(sh); 5720 schedule(); 5721 do_prepare = true; 5722 goto retry; 5723 } 5724 if (do_flush) { 5725 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5726 /* we only need flush for one stripe */ 5727 do_flush = false; 5728 } 5729 5730 set_bit(STRIPE_HANDLE, &sh->state); 5731 clear_bit(STRIPE_DELAYED, &sh->state); 5732 if ((!sh->batch_head || sh == sh->batch_head) && 5733 (bi->bi_opf & REQ_SYNC) && 5734 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5735 atomic_inc(&conf->preread_active_stripes); 5736 release_stripe_plug(mddev, sh); 5737 } else { 5738 /* cannot get stripe for read-ahead, just give-up */ 5739 bi->bi_status = BLK_STS_IOERR; 5740 break; 5741 } 5742 } 5743 finish_wait(&conf->wait_for_overlap, &w); 5744 5745 if (rw == WRITE) 5746 md_write_end(mddev); 5747 bio_endio(bi); 5748 return true; 5749 } 5750 5751 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5752 5753 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5754 { 5755 /* reshaping is quite different to recovery/resync so it is 5756 * handled quite separately ... here. 5757 * 5758 * On each call to sync_request, we gather one chunk worth of 5759 * destination stripes and flag them as expanding. 5760 * Then we find all the source stripes and request reads. 5761 * As the reads complete, handle_stripe will copy the data 5762 * into the destination stripe and release that stripe. 
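	 *
	 * A worked example, using an assumed geometry rather than any
	 * specific array: growing a 4-drive RAID5 (3 data disks) to
	 * 5 drives (4 data disks) with 128-sector chunks gives
	 * reshape_sectors = 128, so each call lays out one chunk of
	 * destination stripes covering 128 * 4 = 512 array sectors and
	 * schedules reads of the source stripes that the old
	 * 3-data-disk layout maps onto that range.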
5763 */ 5764 struct r5conf *conf = mddev->private; 5765 struct stripe_head *sh; 5766 struct md_rdev *rdev; 5767 sector_t first_sector, last_sector; 5768 int raid_disks = conf->previous_raid_disks; 5769 int data_disks = raid_disks - conf->max_degraded; 5770 int new_data_disks = conf->raid_disks - conf->max_degraded; 5771 int i; 5772 int dd_idx; 5773 sector_t writepos, readpos, safepos; 5774 sector_t stripe_addr; 5775 int reshape_sectors; 5776 struct list_head stripes; 5777 sector_t retn; 5778 5779 if (sector_nr == 0) { 5780 /* If restarting in the middle, skip the initial sectors */ 5781 if (mddev->reshape_backwards && 5782 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5783 sector_nr = raid5_size(mddev, 0, 0) 5784 - conf->reshape_progress; 5785 } else if (mddev->reshape_backwards && 5786 conf->reshape_progress == MaxSector) { 5787 /* shouldn't happen, but just in case, finish up.*/ 5788 sector_nr = MaxSector; 5789 } else if (!mddev->reshape_backwards && 5790 conf->reshape_progress > 0) 5791 sector_nr = conf->reshape_progress; 5792 sector_div(sector_nr, new_data_disks); 5793 if (sector_nr) { 5794 mddev->curr_resync_completed = sector_nr; 5795 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5796 *skipped = 1; 5797 retn = sector_nr; 5798 goto finish; 5799 } 5800 } 5801 5802 /* We need to process a full chunk at a time. 5803 * If old and new chunk sizes differ, we need to process the 5804 * largest of these 5805 */ 5806 5807 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5808 5809 /* We update the metadata at least every 10 seconds, or when 5810 * the data about to be copied would over-write the source of 5811 * the data at the front of the range. i.e. one new_stripe 5812 * along from reshape_progress new_maps to after where 5813 * reshape_safe old_maps to 5814 */ 5815 writepos = conf->reshape_progress; 5816 sector_div(writepos, new_data_disks); 5817 readpos = conf->reshape_progress; 5818 sector_div(readpos, data_disks); 5819 safepos = conf->reshape_safe; 5820 sector_div(safepos, data_disks); 5821 if (mddev->reshape_backwards) { 5822 BUG_ON(writepos < reshape_sectors); 5823 writepos -= reshape_sectors; 5824 readpos += reshape_sectors; 5825 safepos += reshape_sectors; 5826 } else { 5827 writepos += reshape_sectors; 5828 /* readpos and safepos are worst-case calculations. 5829 * A negative number is overly pessimistic, and causes 5830 * obvious problems for unsigned storage. So clip to 0. 5831 */ 5832 readpos -= min_t(sector_t, reshape_sectors, readpos); 5833 safepos -= min_t(sector_t, reshape_sectors, safepos); 5834 } 5835 5836 /* Having calculated the 'writepos' possibly use it 5837 * to set 'stripe_addr' which is where we will write to. 5838 */ 5839 if (mddev->reshape_backwards) { 5840 BUG_ON(conf->reshape_progress == 0); 5841 stripe_addr = writepos; 5842 BUG_ON((mddev->dev_sectors & 5843 ~((sector_t)reshape_sectors - 1)) 5844 - reshape_sectors - stripe_addr 5845 != sector_nr); 5846 } else { 5847 BUG_ON(writepos != sector_nr + reshape_sectors); 5848 stripe_addr = sector_nr; 5849 } 5850 5851 /* 'writepos' is the most advanced device address we might write. 5852 * 'readpos' is the least advanced device address we might read. 5853 * 'safepos' is the least address recorded in the metadata as having 5854 * been reshaped. 5855 * If there is a min_offset_diff, these are adjusted either by 5856 * increasing the safepos/readpos if diff is negative, or 5857 * increasing writepos if diff is positive. 
5858 * If 'readpos' is then behind 'writepos', there is no way that we can 5859 * ensure safety in the face of a crash - that must be done by userspace 5860 * making a backup of the data. So in that case there is no particular 5861 * rush to update metadata. 5862 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5863 * update the metadata to advance 'safepos' to match 'readpos' so that 5864 * we can be safe in the event of a crash. 5865 * So we insist on updating metadata if safepos is behind writepos and 5866 * readpos is beyond writepos. 5867 * In any case, update the metadata every 10 seconds. 5868 * Maybe that number should be configurable, but I'm not sure it is 5869 * worth it.... maybe it could be a multiple of safemode_delay??? 5870 */ 5871 if (conf->min_offset_diff < 0) { 5872 safepos += -conf->min_offset_diff; 5873 readpos += -conf->min_offset_diff; 5874 } else 5875 writepos += conf->min_offset_diff; 5876 5877 if ((mddev->reshape_backwards 5878 ? (safepos > writepos && readpos < writepos) 5879 : (safepos < writepos && readpos > writepos)) || 5880 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5881 /* Cannot proceed until we've updated the superblock... */ 5882 wait_event(conf->wait_for_overlap, 5883 atomic_read(&conf->reshape_stripes)==0 5884 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5885 if (atomic_read(&conf->reshape_stripes) != 0) 5886 return 0; 5887 mddev->reshape_position = conf->reshape_progress; 5888 mddev->curr_resync_completed = sector_nr; 5889 if (!mddev->reshape_backwards) 5890 /* Can update recovery_offset */ 5891 rdev_for_each(rdev, mddev) 5892 if (rdev->raid_disk >= 0 && 5893 !test_bit(Journal, &rdev->flags) && 5894 !test_bit(In_sync, &rdev->flags) && 5895 rdev->recovery_offset < sector_nr) 5896 rdev->recovery_offset = sector_nr; 5897 5898 conf->reshape_checkpoint = jiffies; 5899 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5900 md_wakeup_thread(mddev->thread); 5901 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5902 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5903 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5904 return 0; 5905 spin_lock_irq(&conf->device_lock); 5906 conf->reshape_safe = mddev->reshape_position; 5907 spin_unlock_irq(&conf->device_lock); 5908 wake_up(&conf->wait_for_overlap); 5909 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5910 } 5911 5912 INIT_LIST_HEAD(&stripes); 5913 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5914 int j; 5915 int skipped_disk = 0; 5916 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5917 set_bit(STRIPE_EXPANDING, &sh->state); 5918 atomic_inc(&conf->reshape_stripes); 5919 /* If any of this stripe is beyond the end of the old 5920 * array, then we need to zero those blocks 5921 */ 5922 for (j=sh->disks; j--;) { 5923 sector_t s; 5924 if (j == sh->pd_idx) 5925 continue; 5926 if (conf->level == 6 && 5927 j == sh->qd_idx) 5928 continue; 5929 s = raid5_compute_blocknr(sh, j, 0); 5930 if (s < raid5_size(mddev, 0, 0)) { 5931 skipped_disk = 1; 5932 continue; 5933 } 5934 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5935 set_bit(R5_Expanded, &sh->dev[j].flags); 5936 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5937 } 5938 if (!skipped_disk) { 5939 set_bit(STRIPE_EXPAND_READY, &sh->state); 5940 set_bit(STRIPE_HANDLE, &sh->state); 5941 } 5942 list_add(&sh->lru, &stripes); 5943 } 5944 spin_lock_irq(&conf->device_lock); 5945 if (mddev->reshape_backwards) 5946 conf->reshape_progress -= reshape_sectors * new_data_disks; 5947 else 5948 conf->reshape_progress 
+= reshape_sectors * new_data_disks; 5949 spin_unlock_irq(&conf->device_lock); 5950 /* Ok, those stripe are ready. We can start scheduling 5951 * reads on the source stripes. 5952 * The source stripes are determined by mapping the first and last 5953 * block on the destination stripes. 5954 */ 5955 first_sector = 5956 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5957 1, &dd_idx, NULL); 5958 last_sector = 5959 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5960 * new_data_disks - 1), 5961 1, &dd_idx, NULL); 5962 if (last_sector >= mddev->dev_sectors) 5963 last_sector = mddev->dev_sectors - 1; 5964 while (first_sector <= last_sector) { 5965 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5966 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5967 set_bit(STRIPE_HANDLE, &sh->state); 5968 raid5_release_stripe(sh); 5969 first_sector += STRIPE_SECTORS; 5970 } 5971 /* Now that the sources are clearly marked, we can release 5972 * the destination stripes 5973 */ 5974 while (!list_empty(&stripes)) { 5975 sh = list_entry(stripes.next, struct stripe_head, lru); 5976 list_del_init(&sh->lru); 5977 raid5_release_stripe(sh); 5978 } 5979 /* If this takes us to the resync_max point where we have to pause, 5980 * then we need to write out the superblock. 5981 */ 5982 sector_nr += reshape_sectors; 5983 retn = reshape_sectors; 5984 finish: 5985 if (mddev->curr_resync_completed > mddev->resync_max || 5986 (sector_nr - mddev->curr_resync_completed) * 2 5987 >= mddev->resync_max - mddev->curr_resync_completed) { 5988 /* Cannot proceed until we've updated the superblock... */ 5989 wait_event(conf->wait_for_overlap, 5990 atomic_read(&conf->reshape_stripes) == 0 5991 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5992 if (atomic_read(&conf->reshape_stripes) != 0) 5993 goto ret; 5994 mddev->reshape_position = conf->reshape_progress; 5995 mddev->curr_resync_completed = sector_nr; 5996 if (!mddev->reshape_backwards) 5997 /* Can update recovery_offset */ 5998 rdev_for_each(rdev, mddev) 5999 if (rdev->raid_disk >= 0 && 6000 !test_bit(Journal, &rdev->flags) && 6001 !test_bit(In_sync, &rdev->flags) && 6002 rdev->recovery_offset < sector_nr) 6003 rdev->recovery_offset = sector_nr; 6004 conf->reshape_checkpoint = jiffies; 6005 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6006 md_wakeup_thread(mddev->thread); 6007 wait_event(mddev->sb_wait, 6008 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 6009 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6010 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6011 goto ret; 6012 spin_lock_irq(&conf->device_lock); 6013 conf->reshape_safe = mddev->reshape_position; 6014 spin_unlock_irq(&conf->device_lock); 6015 wake_up(&conf->wait_for_overlap); 6016 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6017 } 6018 ret: 6019 return retn; 6020 } 6021 6022 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 6023 int *skipped) 6024 { 6025 struct r5conf *conf = mddev->private; 6026 struct stripe_head *sh; 6027 sector_t max_sector = mddev->dev_sectors; 6028 sector_t sync_blocks; 6029 int still_degraded = 0; 6030 int i; 6031 6032 if (sector_nr >= max_sector) { 6033 /* just being told to finish up .. 
nothing much to do */ 6034 6035 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 6036 end_reshape(conf); 6037 return 0; 6038 } 6039 6040 if (mddev->curr_resync < max_sector) /* aborted */ 6041 md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 6042 &sync_blocks, 1); 6043 else /* completed sync */ 6044 conf->fullsync = 0; 6045 md_bitmap_close_sync(mddev->bitmap); 6046 6047 return 0; 6048 } 6049 6050 /* Allow raid5_quiesce to complete */ 6051 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 6052 6053 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6054 return reshape_request(mddev, sector_nr, skipped); 6055 6056 /* No need to check resync_max as we never do more than one 6057 * stripe, and as resync_max will always be on a chunk boundary, 6058 * if the check in md_do_sync didn't fire, there is no chance 6059 * of overstepping resync_max here 6060 */ 6061 6062 /* if there is too many failed drives and we are trying 6063 * to resync, then assert that we are finished, because there is 6064 * nothing we can do. 6065 */ 6066 if (mddev->degraded >= conf->max_degraded && 6067 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6068 sector_t rv = mddev->dev_sectors - sector_nr; 6069 *skipped = 1; 6070 return rv; 6071 } 6072 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6073 !conf->fullsync && 6074 !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6075 sync_blocks >= STRIPE_SECTORS) { 6076 /* we can skip this block, and probably more */ 6077 sync_blocks /= STRIPE_SECTORS; 6078 *skipped = 1; 6079 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 6080 } 6081 6082 md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6083 6084 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 6085 if (sh == NULL) { 6086 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 6087 /* make sure we don't swamp the stripe cache if someone else 6088 * is trying to get access 6089 */ 6090 schedule_timeout_uninterruptible(1); 6091 } 6092 /* Need to check if array will still be degraded after recovery/resync 6093 * Note in case of > 1 drive failures it's possible we're rebuilding 6094 * one drive while leaving another faulty drive in array. 6095 */ 6096 rcu_read_lock(); 6097 for (i = 0; i < conf->raid_disks; i++) { 6098 struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); 6099 6100 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6101 still_degraded = 1; 6102 } 6103 rcu_read_unlock(); 6104 6105 md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6106 6107 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6108 set_bit(STRIPE_HANDLE, &sh->state); 6109 6110 raid5_release_stripe(sh); 6111 6112 return STRIPE_SECTORS; 6113 } 6114 6115 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, 6116 unsigned int offset) 6117 { 6118 /* We may not be able to submit a whole bio at once as there 6119 * may not be enough stripe_heads available. 6120 * We cannot pre-allocate enough stripe_heads as we may need 6121 * more than exist in the cache (if we allow ever large chunks). 6122 * So we do one stripe head at a time and record in 6123 * ->bi_hw_segments how many have been done. 6124 * 6125 * We *know* that this entire raid_bio is in one chunk, so 6126 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
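	 *
	 * For example (assumed numbers, not from a real configuration):
	 * with 64-sector chunks and 4KiB pages (STRIPE_SECTORS == 8), a
	 * chunk-aligned read spans at most 8 stripe_heads; if the 4th
	 * cannot be obtained we return after handling 3, record scnt in
	 * ->retry_read_offset, and the next retry skips straight past
	 * the stripes already done.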
6127 */ 6128 struct stripe_head *sh; 6129 int dd_idx; 6130 sector_t sector, logical_sector, last_sector; 6131 int scnt = 0; 6132 int handled = 0; 6133 6134 logical_sector = raid_bio->bi_iter.bi_sector & 6135 ~((sector_t)STRIPE_SECTORS-1); 6136 sector = raid5_compute_sector(conf, logical_sector, 6137 0, &dd_idx, NULL); 6138 last_sector = bio_end_sector(raid_bio); 6139 6140 for (; logical_sector < last_sector; 6141 logical_sector += STRIPE_SECTORS, 6142 sector += STRIPE_SECTORS, 6143 scnt++) { 6144 6145 if (scnt < offset) 6146 /* already done this stripe */ 6147 continue; 6148 6149 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 6150 6151 if (!sh) { 6152 /* failed to get a stripe - must wait */ 6153 conf->retry_read_aligned = raid_bio; 6154 conf->retry_read_offset = scnt; 6155 return handled; 6156 } 6157 6158 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 6159 raid5_release_stripe(sh); 6160 conf->retry_read_aligned = raid_bio; 6161 conf->retry_read_offset = scnt; 6162 return handled; 6163 } 6164 6165 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 6166 handle_stripe(sh); 6167 raid5_release_stripe(sh); 6168 handled++; 6169 } 6170 6171 bio_endio(raid_bio); 6172 6173 if (atomic_dec_and_test(&conf->active_aligned_reads)) 6174 wake_up(&conf->wait_for_quiescent); 6175 return handled; 6176 } 6177 6178 static int handle_active_stripes(struct r5conf *conf, int group, 6179 struct r5worker *worker, 6180 struct list_head *temp_inactive_list) 6181 __releases(&conf->device_lock) 6182 __acquires(&conf->device_lock) 6183 { 6184 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 6185 int i, batch_size = 0, hash; 6186 bool release_inactive = false; 6187 6188 while (batch_size < MAX_STRIPE_BATCH && 6189 (sh = __get_priority_stripe(conf, group)) != NULL) 6190 batch[batch_size++] = sh; 6191 6192 if (batch_size == 0) { 6193 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6194 if (!list_empty(temp_inactive_list + i)) 6195 break; 6196 if (i == NR_STRIPE_HASH_LOCKS) { 6197 spin_unlock_irq(&conf->device_lock); 6198 log_flush_stripe_to_raid(conf); 6199 spin_lock_irq(&conf->device_lock); 6200 return batch_size; 6201 } 6202 release_inactive = true; 6203 } 6204 spin_unlock_irq(&conf->device_lock); 6205 6206 release_inactive_stripe_list(conf, temp_inactive_list, 6207 NR_STRIPE_HASH_LOCKS); 6208 6209 r5l_flush_stripe_to_raid(conf->log); 6210 if (release_inactive) { 6211 spin_lock_irq(&conf->device_lock); 6212 return 0; 6213 } 6214 6215 for (i = 0; i < batch_size; i++) 6216 handle_stripe(batch[i]); 6217 log_write_stripe_run(conf); 6218 6219 cond_resched(); 6220 6221 spin_lock_irq(&conf->device_lock); 6222 for (i = 0; i < batch_size; i++) { 6223 hash = batch[i]->hash_lock_index; 6224 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6225 } 6226 return batch_size; 6227 } 6228 6229 static void raid5_do_work(struct work_struct *work) 6230 { 6231 struct r5worker *worker = container_of(work, struct r5worker, work); 6232 struct r5worker_group *group = worker->group; 6233 struct r5conf *conf = group->conf; 6234 struct mddev *mddev = conf->mddev; 6235 int group_id = group - conf->worker_groups; 6236 int handled; 6237 struct blk_plug plug; 6238 6239 pr_debug("+++ raid5worker active\n"); 6240 6241 blk_start_plug(&plug); 6242 handled = 0; 6243 spin_lock_irq(&conf->device_lock); 6244 while (1) { 6245 int batch_size, released; 6246 6247 released = release_stripe_list(conf, worker->temp_inactive_list); 6248 6249 batch_size = handle_active_stripes(conf, group_id, worker, 6250 worker->temp_inactive_list); 6251 worker->working 
= false; 6252 if (!batch_size && !released) 6253 break; 6254 handled += batch_size; 6255 wait_event_lock_irq(mddev->sb_wait, 6256 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6257 conf->device_lock); 6258 } 6259 pr_debug("%d stripes handled\n", handled); 6260 6261 spin_unlock_irq(&conf->device_lock); 6262 6263 flush_deferred_bios(conf); 6264 6265 r5l_flush_stripe_to_raid(conf->log); 6266 6267 async_tx_issue_pending_all(); 6268 blk_finish_plug(&plug); 6269 6270 pr_debug("--- raid5worker inactive\n"); 6271 } 6272 6273 /* 6274 * This is our raid5 kernel thread. 6275 * 6276 * We scan the hash table for stripes which can be handled now. 6277 * During the scan, completed stripes are saved for us by the interrupt 6278 * handler, so that they will not have to wait for our next wakeup. 6279 */ 6280 static void raid5d(struct md_thread *thread) 6281 { 6282 struct mddev *mddev = thread->mddev; 6283 struct r5conf *conf = mddev->private; 6284 int handled; 6285 struct blk_plug plug; 6286 6287 pr_debug("+++ raid5d active\n"); 6288 6289 md_check_recovery(mddev); 6290 6291 blk_start_plug(&plug); 6292 handled = 0; 6293 spin_lock_irq(&conf->device_lock); 6294 while (1) { 6295 struct bio *bio; 6296 int batch_size, released; 6297 unsigned int offset; 6298 6299 released = release_stripe_list(conf, conf->temp_inactive_list); 6300 if (released) 6301 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6302 6303 if ( 6304 !list_empty(&conf->bitmap_list)) { 6305 /* Now is a good time to flush some bitmap updates */ 6306 conf->seq_flush++; 6307 spin_unlock_irq(&conf->device_lock); 6308 md_bitmap_unplug(mddev->bitmap); 6309 spin_lock_irq(&conf->device_lock); 6310 conf->seq_write = conf->seq_flush; 6311 activate_bit_delay(conf, conf->temp_inactive_list); 6312 } 6313 raid5_activate_delayed(conf); 6314 6315 while ((bio = remove_bio_from_retry(conf, &offset))) { 6316 int ok; 6317 spin_unlock_irq(&conf->device_lock); 6318 ok = retry_aligned_read(conf, bio, offset); 6319 spin_lock_irq(&conf->device_lock); 6320 if (!ok) 6321 break; 6322 handled++; 6323 } 6324 6325 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6326 conf->temp_inactive_list); 6327 if (!batch_size && !released) 6328 break; 6329 handled += batch_size; 6330 6331 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6332 spin_unlock_irq(&conf->device_lock); 6333 md_check_recovery(mddev); 6334 spin_lock_irq(&conf->device_lock); 6335 } 6336 } 6337 pr_debug("%d stripes handled\n", handled); 6338 6339 spin_unlock_irq(&conf->device_lock); 6340 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6341 mutex_trylock(&conf->cache_size_mutex)) { 6342 grow_one_stripe(conf, __GFP_NOWARN); 6343 /* Set flag even if allocation failed. 
This helps 6344 * slow down allocation requests when mem is short 6345 */ 6346 set_bit(R5_DID_ALLOC, &conf->cache_state); 6347 mutex_unlock(&conf->cache_size_mutex); 6348 } 6349 6350 flush_deferred_bios(conf); 6351 6352 r5l_flush_stripe_to_raid(conf->log); 6353 6354 async_tx_issue_pending_all(); 6355 blk_finish_plug(&plug); 6356 6357 pr_debug("--- raid5d inactive\n"); 6358 } 6359 6360 static ssize_t 6361 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6362 { 6363 struct r5conf *conf; 6364 int ret = 0; 6365 spin_lock(&mddev->lock); 6366 conf = mddev->private; 6367 if (conf) 6368 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6369 spin_unlock(&mddev->lock); 6370 return ret; 6371 } 6372 6373 int 6374 raid5_set_cache_size(struct mddev *mddev, int size) 6375 { 6376 int result = 0; 6377 struct r5conf *conf = mddev->private; 6378 6379 if (size <= 16 || size > 32768) 6380 return -EINVAL; 6381 6382 conf->min_nr_stripes = size; 6383 mutex_lock(&conf->cache_size_mutex); 6384 while (size < conf->max_nr_stripes && 6385 drop_one_stripe(conf)) 6386 ; 6387 mutex_unlock(&conf->cache_size_mutex); 6388 6389 md_allow_write(mddev); 6390 6391 mutex_lock(&conf->cache_size_mutex); 6392 while (size > conf->max_nr_stripes) 6393 if (!grow_one_stripe(conf, GFP_KERNEL)) { 6394 conf->min_nr_stripes = conf->max_nr_stripes; 6395 result = -ENOMEM; 6396 break; 6397 } 6398 mutex_unlock(&conf->cache_size_mutex); 6399 6400 return result; 6401 } 6402 EXPORT_SYMBOL(raid5_set_cache_size); 6403 6404 static ssize_t 6405 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6406 { 6407 struct r5conf *conf; 6408 unsigned long new; 6409 int err; 6410 6411 if (len >= PAGE_SIZE) 6412 return -EINVAL; 6413 if (kstrtoul(page, 10, &new)) 6414 return -EINVAL; 6415 err = mddev_lock(mddev); 6416 if (err) 6417 return err; 6418 conf = mddev->private; 6419 if (!conf) 6420 err = -ENODEV; 6421 else 6422 err = raid5_set_cache_size(mddev, new); 6423 mddev_unlock(mddev); 6424 6425 return err ?: len; 6426 } 6427 6428 static struct md_sysfs_entry 6429 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6430 raid5_show_stripe_cache_size, 6431 raid5_store_stripe_cache_size); 6432 6433 static ssize_t 6434 raid5_show_rmw_level(struct mddev *mddev, char *page) 6435 { 6436 struct r5conf *conf = mddev->private; 6437 if (conf) 6438 return sprintf(page, "%d\n", conf->rmw_level); 6439 else 6440 return 0; 6441 } 6442 6443 static ssize_t 6444 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6445 { 6446 struct r5conf *conf = mddev->private; 6447 unsigned long new; 6448 6449 if (!conf) 6450 return -ENODEV; 6451 6452 if (len >= PAGE_SIZE) 6453 return -EINVAL; 6454 6455 if (kstrtoul(page, 10, &new)) 6456 return -EINVAL; 6457 6458 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6459 return -EINVAL; 6460 6461 if (new != PARITY_DISABLE_RMW && 6462 new != PARITY_ENABLE_RMW && 6463 new != PARITY_PREFER_RMW) 6464 return -EINVAL; 6465 6466 conf->rmw_level = new; 6467 return len; 6468 } 6469 6470 static struct md_sysfs_entry 6471 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6472 raid5_show_rmw_level, 6473 raid5_store_rmw_level); 6474 6475 6476 static ssize_t 6477 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6478 { 6479 struct r5conf *conf; 6480 int ret = 0; 6481 spin_lock(&mddev->lock); 6482 conf = mddev->private; 6483 if (conf) 6484 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6485 spin_unlock(&mddev->lock); 6486 return ret; 6487 } 6488 6489 
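
/*
 * Illustrative sketch, not part of the driver: a stand-alone model of the
 * check __get_priority_stripe() applies to the value that
 * raid5_store_preread_threshold() below writes into conf->bypass_threshold.
 * Stripes that would need a pre-read sit on conf->hold_list; they are only
 * picked once the handle_list is empty and either the bypass count has
 * exceeded the threshold or no full-stripe writes remain pending. Letting
 * full-stripe writes jump the queue avoids turning them into
 * read-modify-write cycles, while the threshold bounds how long a pre-read
 * stripe can be starved.
 */
static inline bool hold_list_must_run(int bypass_count, int bypass_threshold,
				      bool handle_list_empty,
				      bool hold_list_empty,
				      int pending_full_writes)
{
	if (!handle_list_empty || hold_list_empty)
		return false;	/* normal stripes first, or nothing is parked */
	return (bypass_threshold && bypass_count > bypass_threshold) ||
	       pending_full_writes == 0;
}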
static ssize_t 6490 raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) 6491 { 6492 struct r5conf *conf; 6493 unsigned long new; 6494 int err; 6495 6496 if (len >= PAGE_SIZE) 6497 return -EINVAL; 6498 if (kstrtoul(page, 10, &new)) 6499 return -EINVAL; 6500 6501 err = mddev_lock(mddev); 6502 if (err) 6503 return err; 6504 conf = mddev->private; 6505 if (!conf) 6506 err = -ENODEV; 6507 else if (new > conf->min_nr_stripes) 6508 err = -EINVAL; 6509 else 6510 conf->bypass_threshold = new; 6511 mddev_unlock(mddev); 6512 return err ?: len; 6513 } 6514 6515 static struct md_sysfs_entry 6516 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6517 S_IRUGO | S_IWUSR, 6518 raid5_show_preread_threshold, 6519 raid5_store_preread_threshold); 6520 6521 static ssize_t 6522 raid5_show_skip_copy(struct mddev *mddev, char *page) 6523 { 6524 struct r5conf *conf; 6525 int ret = 0; 6526 spin_lock(&mddev->lock); 6527 conf = mddev->private; 6528 if (conf) 6529 ret = sprintf(page, "%d\n", conf->skip_copy); 6530 spin_unlock(&mddev->lock); 6531 return ret; 6532 } 6533 6534 static ssize_t 6535 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6536 { 6537 struct r5conf *conf; 6538 unsigned long new; 6539 int err; 6540 6541 if (len >= PAGE_SIZE) 6542 return -EINVAL; 6543 if (kstrtoul(page, 10, &new)) 6544 return -EINVAL; 6545 new = !!new; 6546 6547 err = mddev_lock(mddev); 6548 if (err) 6549 return err; 6550 conf = mddev->private; 6551 if (!conf) 6552 err = -ENODEV; 6553 else if (new != conf->skip_copy) { 6554 mddev_suspend(mddev); 6555 conf->skip_copy = new; 6556 if (new) 6557 mddev->queue->backing_dev_info->capabilities |= 6558 BDI_CAP_STABLE_WRITES; 6559 else 6560 mddev->queue->backing_dev_info->capabilities &= 6561 ~BDI_CAP_STABLE_WRITES; 6562 mddev_resume(mddev); 6563 } 6564 mddev_unlock(mddev); 6565 return err ?: len; 6566 } 6567 6568 static struct md_sysfs_entry 6569 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6570 raid5_show_skip_copy, 6571 raid5_store_skip_copy); 6572 6573 static ssize_t 6574 stripe_cache_active_show(struct mddev *mddev, char *page) 6575 { 6576 struct r5conf *conf = mddev->private; 6577 if (conf) 6578 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6579 else 6580 return 0; 6581 } 6582 6583 static struct md_sysfs_entry 6584 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6585 6586 static ssize_t 6587 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6588 { 6589 struct r5conf *conf; 6590 int ret = 0; 6591 spin_lock(&mddev->lock); 6592 conf = mddev->private; 6593 if (conf) 6594 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6595 spin_unlock(&mddev->lock); 6596 return ret; 6597 } 6598 6599 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6600 int *group_cnt, 6601 int *worker_cnt_per_group, 6602 struct r5worker_group **worker_groups); 6603 static ssize_t 6604 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6605 { 6606 struct r5conf *conf; 6607 unsigned int new; 6608 int err; 6609 struct r5worker_group *new_groups, *old_groups; 6610 int group_cnt, worker_cnt_per_group; 6611 6612 if (len >= PAGE_SIZE) 6613 return -EINVAL; 6614 if (kstrtouint(page, 10, &new)) 6615 return -EINVAL; 6616 /* 8192 should be big enough */ 6617 if (new > 8192) 6618 return -EINVAL; 6619 6620 err = mddev_lock(mddev); 6621 if (err) 6622 return err; 6623 conf = mddev->private; 6624 if (!conf) 6625 err = -ENODEV; 6626 else if (new != conf->worker_cnt_per_group) 
{ 6627 mddev_suspend(mddev); 6628 6629 old_groups = conf->worker_groups; 6630 if (old_groups) 6631 flush_workqueue(raid5_wq); 6632 6633 err = alloc_thread_groups(conf, new, 6634 &group_cnt, &worker_cnt_per_group, 6635 &new_groups); 6636 if (!err) { 6637 spin_lock_irq(&conf->device_lock); 6638 conf->group_cnt = group_cnt; 6639 conf->worker_cnt_per_group = worker_cnt_per_group; 6640 conf->worker_groups = new_groups; 6641 spin_unlock_irq(&conf->device_lock); 6642 6643 if (old_groups) 6644 kfree(old_groups[0].workers); 6645 kfree(old_groups); 6646 } 6647 mddev_resume(mddev); 6648 } 6649 mddev_unlock(mddev); 6650 6651 return err ?: len; 6652 } 6653 6654 static struct md_sysfs_entry 6655 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6656 raid5_show_group_thread_cnt, 6657 raid5_store_group_thread_cnt); 6658 6659 static struct attribute *raid5_attrs[] = { 6660 &raid5_stripecache_size.attr, 6661 &raid5_stripecache_active.attr, 6662 &raid5_preread_bypass_threshold.attr, 6663 &raid5_group_thread_cnt.attr, 6664 &raid5_skip_copy.attr, 6665 &raid5_rmw_level.attr, 6666 &r5c_journal_mode.attr, 6667 &ppl_write_hint.attr, 6668 NULL, 6669 }; 6670 static struct attribute_group raid5_attrs_group = { 6671 .name = NULL, 6672 .attrs = raid5_attrs, 6673 }; 6674 6675 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6676 int *group_cnt, 6677 int *worker_cnt_per_group, 6678 struct r5worker_group **worker_groups) 6679 { 6680 int i, j, k; 6681 ssize_t size; 6682 struct r5worker *workers; 6683 6684 *worker_cnt_per_group = cnt; 6685 if (cnt == 0) { 6686 *group_cnt = 0; 6687 *worker_groups = NULL; 6688 return 0; 6689 } 6690 *group_cnt = num_possible_nodes(); 6691 size = sizeof(struct r5worker) * cnt; 6692 workers = kcalloc(size, *group_cnt, GFP_NOIO); 6693 *worker_groups = kcalloc(*group_cnt, sizeof(struct r5worker_group), 6694 GFP_NOIO); 6695 if (!*worker_groups || !workers) { 6696 kfree(workers); 6697 kfree(*worker_groups); 6698 return -ENOMEM; 6699 } 6700 6701 for (i = 0; i < *group_cnt; i++) { 6702 struct r5worker_group *group; 6703 6704 group = &(*worker_groups)[i]; 6705 INIT_LIST_HEAD(&group->handle_list); 6706 INIT_LIST_HEAD(&group->loprio_list); 6707 group->conf = conf; 6708 group->workers = workers + i * cnt; 6709 6710 for (j = 0; j < cnt; j++) { 6711 struct r5worker *worker = group->workers + j; 6712 worker->group = group; 6713 INIT_WORK(&worker->work, raid5_do_work); 6714 6715 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6716 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6717 } 6718 } 6719 6720 return 0; 6721 } 6722 6723 static void free_thread_groups(struct r5conf *conf) 6724 { 6725 if (conf->worker_groups) 6726 kfree(conf->worker_groups[0].workers); 6727 kfree(conf->worker_groups); 6728 conf->worker_groups = NULL; 6729 } 6730 6731 static sector_t 6732 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6733 { 6734 struct r5conf *conf = mddev->private; 6735 6736 if (!sectors) 6737 sectors = mddev->dev_sectors; 6738 if (!raid_disks) 6739 /* size is defined by the smallest of previous and new size */ 6740 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6741 6742 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6743 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6744 return sectors * (raid_disks - conf->max_degraded); 6745 } 6746 6747 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6748 { 6749 safe_put_page(percpu->spare_page); 6750 percpu->spare_page = NULL; 6751 kvfree(percpu->scribble); 6752 
percpu->scribble = NULL; 6753 } 6754 6755 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6756 { 6757 if (conf->level == 6 && !percpu->spare_page) { 6758 percpu->spare_page = alloc_page(GFP_KERNEL); 6759 if (!percpu->spare_page) 6760 return -ENOMEM; 6761 } 6762 6763 if (scribble_alloc(percpu, 6764 max(conf->raid_disks, 6765 conf->previous_raid_disks), 6766 max(conf->chunk_sectors, 6767 conf->prev_chunk_sectors) 6768 / STRIPE_SECTORS, 6769 GFP_KERNEL)) { 6770 free_scratch_buffer(conf, percpu); 6771 return -ENOMEM; 6772 } 6773 6774 return 0; 6775 } 6776 6777 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6778 { 6779 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6780 6781 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6782 return 0; 6783 } 6784 6785 static void raid5_free_percpu(struct r5conf *conf) 6786 { 6787 if (!conf->percpu) 6788 return; 6789 6790 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6791 free_percpu(conf->percpu); 6792 } 6793 6794 static void free_conf(struct r5conf *conf) 6795 { 6796 int i; 6797 6798 log_exit(conf); 6799 6800 unregister_shrinker(&conf->shrinker); 6801 free_thread_groups(conf); 6802 shrink_stripes(conf); 6803 raid5_free_percpu(conf); 6804 for (i = 0; i < conf->pool_size; i++) 6805 if (conf->disks[i].extra_page) 6806 put_page(conf->disks[i].extra_page); 6807 kfree(conf->disks); 6808 bioset_exit(&conf->bio_split); 6809 kfree(conf->stripe_hashtbl); 6810 kfree(conf->pending_data); 6811 kfree(conf); 6812 } 6813 6814 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6815 { 6816 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6817 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6818 6819 if (alloc_scratch_buffer(conf, percpu)) { 6820 pr_warn("%s: failed memory allocation for cpu%u\n", 6821 __func__, cpu); 6822 return -ENOMEM; 6823 } 6824 return 0; 6825 } 6826 6827 static int raid5_alloc_percpu(struct r5conf *conf) 6828 { 6829 int err = 0; 6830 6831 conf->percpu = alloc_percpu(struct raid5_percpu); 6832 if (!conf->percpu) 6833 return -ENOMEM; 6834 6835 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6836 if (!err) { 6837 conf->scribble_disks = max(conf->raid_disks, 6838 conf->previous_raid_disks); 6839 conf->scribble_sectors = max(conf->chunk_sectors, 6840 conf->prev_chunk_sectors); 6841 } 6842 return err; 6843 } 6844 6845 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6846 struct shrink_control *sc) 6847 { 6848 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6849 unsigned long ret = SHRINK_STOP; 6850 6851 if (mutex_trylock(&conf->cache_size_mutex)) { 6852 ret= 0; 6853 while (ret < sc->nr_to_scan && 6854 conf->max_nr_stripes > conf->min_nr_stripes) { 6855 if (drop_one_stripe(conf) == 0) { 6856 ret = SHRINK_STOP; 6857 break; 6858 } 6859 ret++; 6860 } 6861 mutex_unlock(&conf->cache_size_mutex); 6862 } 6863 return ret; 6864 } 6865 6866 static unsigned long raid5_cache_count(struct shrinker *shrink, 6867 struct shrink_control *sc) 6868 { 6869 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6870 6871 if (conf->max_nr_stripes < conf->min_nr_stripes) 6872 /* unlikely, but not impossible */ 6873 return 0; 6874 return conf->max_nr_stripes - conf->min_nr_stripes; 6875 } 6876 6877 static struct r5conf *setup_conf(struct mddev *mddev) 6878 { 6879 struct r5conf *conf; 6880 int raid_disk, memory, max_disks; 6881 struct md_rdev *rdev; 
6882 struct disk_info *disk; 6883 char pers_name[6]; 6884 int i; 6885 int group_cnt, worker_cnt_per_group; 6886 struct r5worker_group *new_group; 6887 int ret; 6888 6889 if (mddev->new_level != 5 6890 && mddev->new_level != 4 6891 && mddev->new_level != 6) { 6892 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6893 mdname(mddev), mddev->new_level); 6894 return ERR_PTR(-EIO); 6895 } 6896 if ((mddev->new_level == 5 6897 && !algorithm_valid_raid5(mddev->new_layout)) || 6898 (mddev->new_level == 6 6899 && !algorithm_valid_raid6(mddev->new_layout))) { 6900 pr_warn("md/raid:%s: layout %d not supported\n", 6901 mdname(mddev), mddev->new_layout); 6902 return ERR_PTR(-EIO); 6903 } 6904 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6905 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6906 mdname(mddev), mddev->raid_disks); 6907 return ERR_PTR(-EINVAL); 6908 } 6909 6910 if (!mddev->new_chunk_sectors || 6911 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6912 !is_power_of_2(mddev->new_chunk_sectors)) { 6913 pr_warn("md/raid:%s: invalid chunk size %d\n", 6914 mdname(mddev), mddev->new_chunk_sectors << 9); 6915 return ERR_PTR(-EINVAL); 6916 } 6917 6918 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6919 if (conf == NULL) 6920 goto abort; 6921 INIT_LIST_HEAD(&conf->free_list); 6922 INIT_LIST_HEAD(&conf->pending_list); 6923 conf->pending_data = kcalloc(PENDING_IO_MAX, 6924 sizeof(struct r5pending_data), 6925 GFP_KERNEL); 6926 if (!conf->pending_data) 6927 goto abort; 6928 for (i = 0; i < PENDING_IO_MAX; i++) 6929 list_add(&conf->pending_data[i].sibling, &conf->free_list); 6930 /* Don't enable multi-threading by default*/ 6931 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6932 &new_group)) { 6933 conf->group_cnt = group_cnt; 6934 conf->worker_cnt_per_group = worker_cnt_per_group; 6935 conf->worker_groups = new_group; 6936 } else 6937 goto abort; 6938 spin_lock_init(&conf->device_lock); 6939 seqcount_init(&conf->gen_lock); 6940 mutex_init(&conf->cache_size_mutex); 6941 init_waitqueue_head(&conf->wait_for_quiescent); 6942 init_waitqueue_head(&conf->wait_for_stripe); 6943 init_waitqueue_head(&conf->wait_for_overlap); 6944 INIT_LIST_HEAD(&conf->handle_list); 6945 INIT_LIST_HEAD(&conf->loprio_list); 6946 INIT_LIST_HEAD(&conf->hold_list); 6947 INIT_LIST_HEAD(&conf->delayed_list); 6948 INIT_LIST_HEAD(&conf->bitmap_list); 6949 init_llist_head(&conf->released_stripes); 6950 atomic_set(&conf->active_stripes, 0); 6951 atomic_set(&conf->preread_active_stripes, 0); 6952 atomic_set(&conf->active_aligned_reads, 0); 6953 spin_lock_init(&conf->pending_bios_lock); 6954 conf->batch_bio_dispatch = true; 6955 rdev_for_each(rdev, mddev) { 6956 if (test_bit(Journal, &rdev->flags)) 6957 continue; 6958 if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 6959 conf->batch_bio_dispatch = false; 6960 break; 6961 } 6962 } 6963 6964 conf->bypass_threshold = BYPASS_THRESHOLD; 6965 conf->recovery_disabled = mddev->recovery_disabled - 1; 6966 6967 conf->raid_disks = mddev->raid_disks; 6968 if (mddev->reshape_position == MaxSector) 6969 conf->previous_raid_disks = mddev->raid_disks; 6970 else 6971 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6972 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6973 6974 conf->disks = kcalloc(max_disks, sizeof(struct disk_info), 6975 GFP_KERNEL); 6976 6977 if (!conf->disks) 6978 goto abort; 6979 6980 for (i = 0; i < max_disks; i++) { 6981 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6982 if 
(!conf->disks[i].extra_page) 6983 goto abort; 6984 } 6985 6986 ret = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 6987 if (ret) 6988 goto abort; 6989 conf->mddev = mddev; 6990 6991 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6992 goto abort; 6993 6994 /* We init hash_locks[0] separately to that it can be used 6995 * as the reference lock in the spin_lock_nest_lock() call 6996 * in lock_all_device_hash_locks_irq in order to convince 6997 * lockdep that we know what we are doing. 6998 */ 6999 spin_lock_init(conf->hash_locks); 7000 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 7001 spin_lock_init(conf->hash_locks + i); 7002 7003 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 7004 INIT_LIST_HEAD(conf->inactive_list + i); 7005 7006 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 7007 INIT_LIST_HEAD(conf->temp_inactive_list + i); 7008 7009 atomic_set(&conf->r5c_cached_full_stripes, 0); 7010 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 7011 atomic_set(&conf->r5c_cached_partial_stripes, 0); 7012 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 7013 atomic_set(&conf->r5c_flushing_full_stripes, 0); 7014 atomic_set(&conf->r5c_flushing_partial_stripes, 0); 7015 7016 conf->level = mddev->new_level; 7017 conf->chunk_sectors = mddev->new_chunk_sectors; 7018 if (raid5_alloc_percpu(conf) != 0) 7019 goto abort; 7020 7021 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 7022 7023 rdev_for_each(rdev, mddev) { 7024 raid_disk = rdev->raid_disk; 7025 if (raid_disk >= max_disks 7026 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 7027 continue; 7028 disk = conf->disks + raid_disk; 7029 7030 if (test_bit(Replacement, &rdev->flags)) { 7031 if (disk->replacement) 7032 goto abort; 7033 disk->replacement = rdev; 7034 } else { 7035 if (disk->rdev) 7036 goto abort; 7037 disk->rdev = rdev; 7038 } 7039 7040 if (test_bit(In_sync, &rdev->flags)) { 7041 char b[BDEVNAME_SIZE]; 7042 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 7043 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 7044 } else if (rdev->saved_raid_disk != raid_disk) 7045 /* Cannot rely on bitmap to complete recovery */ 7046 conf->fullsync = 1; 7047 } 7048 7049 conf->level = mddev->new_level; 7050 if (conf->level == 6) { 7051 conf->max_degraded = 2; 7052 if (raid6_call.xor_syndrome) 7053 conf->rmw_level = PARITY_ENABLE_RMW; 7054 else 7055 conf->rmw_level = PARITY_DISABLE_RMW; 7056 } else { 7057 conf->max_degraded = 1; 7058 conf->rmw_level = PARITY_ENABLE_RMW; 7059 } 7060 conf->algorithm = mddev->new_layout; 7061 conf->reshape_progress = mddev->reshape_position; 7062 if (conf->reshape_progress != MaxSector) { 7063 conf->prev_chunk_sectors = mddev->chunk_sectors; 7064 conf->prev_algo = mddev->layout; 7065 } else { 7066 conf->prev_chunk_sectors = conf->chunk_sectors; 7067 conf->prev_algo = conf->algorithm; 7068 } 7069 7070 conf->min_nr_stripes = NR_STRIPES; 7071 if (mddev->reshape_position != MaxSector) { 7072 int stripes = max_t(int, 7073 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 7074 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 7075 conf->min_nr_stripes = max(NR_STRIPES, stripes); 7076 if (conf->min_nr_stripes != NR_STRIPES) 7077 pr_info("md/raid:%s: force stripe size %d for reshape\n", 7078 mdname(mddev), conf->min_nr_stripes); 7079 } 7080 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 7081 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 7082 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 7083 if (grow_stripes(conf, conf->min_nr_stripes)) { 7084 
pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 7085 mdname(mddev), memory); 7086 goto abort; 7087 } else 7088 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 7089 /* 7090 * Losing a stripe head costs more than the time to refill it, 7091 * it reduces the queue depth and so can hurt throughput. 7092 * So set it rather large, scaled by number of devices. 7093 */ 7094 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 7095 conf->shrinker.scan_objects = raid5_cache_scan; 7096 conf->shrinker.count_objects = raid5_cache_count; 7097 conf->shrinker.batch = 128; 7098 conf->shrinker.flags = 0; 7099 if (register_shrinker(&conf->shrinker)) { 7100 pr_warn("md/raid:%s: couldn't register shrinker.\n", 7101 mdname(mddev)); 7102 goto abort; 7103 } 7104 7105 sprintf(pers_name, "raid%d", mddev->new_level); 7106 conf->thread = md_register_thread(raid5d, mddev, pers_name); 7107 if (!conf->thread) { 7108 pr_warn("md/raid:%s: couldn't allocate thread.\n", 7109 mdname(mddev)); 7110 goto abort; 7111 } 7112 7113 return conf; 7114 7115 abort: 7116 if (conf) { 7117 free_conf(conf); 7118 return ERR_PTR(-EIO); 7119 } else 7120 return ERR_PTR(-ENOMEM); 7121 } 7122 7123 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 7124 { 7125 switch (algo) { 7126 case ALGORITHM_PARITY_0: 7127 if (raid_disk < max_degraded) 7128 return 1; 7129 break; 7130 case ALGORITHM_PARITY_N: 7131 if (raid_disk >= raid_disks - max_degraded) 7132 return 1; 7133 break; 7134 case ALGORITHM_PARITY_0_6: 7135 if (raid_disk == 0 || 7136 raid_disk == raid_disks - 1) 7137 return 1; 7138 break; 7139 case ALGORITHM_LEFT_ASYMMETRIC_6: 7140 case ALGORITHM_RIGHT_ASYMMETRIC_6: 7141 case ALGORITHM_LEFT_SYMMETRIC_6: 7142 case ALGORITHM_RIGHT_SYMMETRIC_6: 7143 if (raid_disk == raid_disks - 1) 7144 return 1; 7145 } 7146 return 0; 7147 } 7148 7149 static int raid5_run(struct mddev *mddev) 7150 { 7151 struct r5conf *conf; 7152 int working_disks = 0; 7153 int dirty_parity_disks = 0; 7154 struct md_rdev *rdev; 7155 struct md_rdev *journal_dev = NULL; 7156 sector_t reshape_offset = 0; 7157 int i; 7158 long long min_offset_diff = 0; 7159 int first = 1; 7160 7161 if (mddev_init_writes_pending(mddev) < 0) 7162 return -ENOMEM; 7163 7164 if (mddev->recovery_cp != MaxSector) 7165 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 7166 mdname(mddev)); 7167 7168 rdev_for_each(rdev, mddev) { 7169 long long diff; 7170 7171 if (test_bit(Journal, &rdev->flags)) { 7172 journal_dev = rdev; 7173 continue; 7174 } 7175 if (rdev->raid_disk < 0) 7176 continue; 7177 diff = (rdev->new_data_offset - rdev->data_offset); 7178 if (first) { 7179 min_offset_diff = diff; 7180 first = 0; 7181 } else if (mddev->reshape_backwards && 7182 diff < min_offset_diff) 7183 min_offset_diff = diff; 7184 else if (!mddev->reshape_backwards && 7185 diff > min_offset_diff) 7186 min_offset_diff = diff; 7187 } 7188 7189 if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && 7190 (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { 7191 pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", 7192 mdname(mddev)); 7193 return -EINVAL; 7194 } 7195 7196 if (mddev->reshape_position != MaxSector) { 7197 /* Check that we can continue the reshape. 7198 * Difficulties arise if the stripe we would write to 7199 * next is at or after the stripe we would read from next. 
		 * For a reshape that changes the number of devices, this
		 * is only possible for a very short time, and mdadm makes
		 * sure that time appears to have passed before assembling
		 * the array. So we fail if that time hasn't passed.
		 * For a reshape that keeps the number of devices the same
		 * mdadm must be monitoring the reshape and keeping the
		 * critical areas read-only and backed up. It will start
		 * the array in read-only mode, so we check for that.
		 */
		sector_t here_new, here_old;
		int old_disks;
		int max_degraded = (mddev->level == 6 ? 2 : 1);
		int chunk_sectors;
		int new_data_disks;

		if (journal_dev) {
			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
				mdname(mddev));
			return -EINVAL;
		}

		if (mddev->new_level != mddev->level) {
			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
				mdname(mddev));
			return -EINVAL;
		}
		old_disks = mddev->raid_disks - mddev->delta_disks;
		/* reshape_position must be on a new-stripe boundary, and one
		 * further up in new geometry must map after here in old
		 * geometry.
		 * If the chunk sizes are different, then as we perform reshape
		 * in units of the largest of the two, reshape_position needs to
		 * be a multiple of the largest chunk size times new data disks.
		 */
		here_new = mddev->reshape_position;
		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
		new_data_disks = mddev->raid_disks - max_degraded;
		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
				mdname(mddev));
			return -EINVAL;
		}
		reshape_offset = here_new * chunk_sectors;
		/* here_new is the stripe we will write to */
		here_old = mddev->reshape_position;
		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
		/* here_old is the first stripe that we might need to read
		 * from */
		if (mddev->delta_disks == 0) {
			/* We cannot be sure it is safe to start an in-place
			 * reshape. It is only safe if user-space is monitoring
			 * and taking constant backups.
			 * mdadm always starts a situation like this in
			 * readonly mode so it can take control before
			 * allowing any writes. So just check for that.
			 */
			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
				/* not really in-place - so OK */;
			else if (mddev->ro == 0) {
				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
					mdname(mddev));
				return -EINVAL;
			}
		} else if (mddev->reshape_backwards
			   ?
(here_new * chunk_sectors + min_offset_diff <= 7266 here_old * chunk_sectors) 7267 : (here_new * chunk_sectors >= 7268 here_old * chunk_sectors + (-min_offset_diff))) { 7269 /* Reading from the same stripe as writing to - bad */ 7270 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7271 mdname(mddev)); 7272 return -EINVAL; 7273 } 7274 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7275 /* OK, we should be able to continue; */ 7276 } else { 7277 BUG_ON(mddev->level != mddev->new_level); 7278 BUG_ON(mddev->layout != mddev->new_layout); 7279 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7280 BUG_ON(mddev->delta_disks != 0); 7281 } 7282 7283 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && 7284 test_bit(MD_HAS_PPL, &mddev->flags)) { 7285 pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", 7286 mdname(mddev)); 7287 clear_bit(MD_HAS_PPL, &mddev->flags); 7288 clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); 7289 } 7290 7291 if (mddev->private == NULL) 7292 conf = setup_conf(mddev); 7293 else 7294 conf = mddev->private; 7295 7296 if (IS_ERR(conf)) 7297 return PTR_ERR(conf); 7298 7299 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7300 if (!journal_dev) { 7301 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7302 mdname(mddev)); 7303 mddev->ro = 1; 7304 set_disk_ro(mddev->gendisk, 1); 7305 } else if (mddev->recovery_cp == MaxSector) 7306 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7307 } 7308 7309 conf->min_offset_diff = min_offset_diff; 7310 mddev->thread = conf->thread; 7311 conf->thread = NULL; 7312 mddev->private = conf; 7313 7314 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7315 i++) { 7316 rdev = conf->disks[i].rdev; 7317 if (!rdev && conf->disks[i].replacement) { 7318 /* The replacement is all we have yet */ 7319 rdev = conf->disks[i].replacement; 7320 conf->disks[i].replacement = NULL; 7321 clear_bit(Replacement, &rdev->flags); 7322 conf->disks[i].rdev = rdev; 7323 } 7324 if (!rdev) 7325 continue; 7326 if (conf->disks[i].replacement && 7327 conf->reshape_progress != MaxSector) { 7328 /* replacements and reshape simply do not mix. */ 7329 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7330 goto abort; 7331 } 7332 if (test_bit(In_sync, &rdev->flags)) { 7333 working_disks++; 7334 continue; 7335 } 7336 /* This disc is not fully in-sync. However if it 7337 * just stored parity (beyond the recovery_offset), 7338 * when we don't need to be concerned about the 7339 * array being dirty. 7340 * When reshape goes 'backwards', we never have 7341 * partially completed devices, so we only need 7342 * to worry about reshape going forwards. 7343 */ 7344 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7345 if (mddev->major_version == 0 && 7346 mddev->minor_version > 90) 7347 rdev->recovery_offset = reshape_offset; 7348 7349 if (rdev->recovery_offset < reshape_offset) { 7350 /* We need to check old and new layout */ 7351 if (!only_parity(rdev->raid_disk, 7352 conf->algorithm, 7353 conf->raid_disks, 7354 conf->max_degraded)) 7355 continue; 7356 } 7357 if (!only_parity(rdev->raid_disk, 7358 conf->prev_algo, 7359 conf->previous_raid_disks, 7360 conf->max_degraded)) 7361 continue; 7362 dirty_parity_disks++; 7363 } 7364 7365 /* 7366 * 0 for a fully functional array, 1 or 2 for a degraded array. 
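	 * For example, a 6-device RAID6 with one member missing reports
	 * degraded == 1 and can still tolerate one further failure; the
	 * same array with three members missing would exceed max_degraded
	 * and has_failed() below would refuse to start it.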
7367 */ 7368 mddev->degraded = raid5_calc_degraded(conf); 7369 7370 if (has_failed(conf)) { 7371 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7372 mdname(mddev), mddev->degraded, conf->raid_disks); 7373 goto abort; 7374 } 7375 7376 /* device size must be a multiple of chunk size */ 7377 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7378 mddev->resync_max_sectors = mddev->dev_sectors; 7379 7380 if (mddev->degraded > dirty_parity_disks && 7381 mddev->recovery_cp != MaxSector) { 7382 if (test_bit(MD_HAS_PPL, &mddev->flags)) 7383 pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", 7384 mdname(mddev)); 7385 else if (mddev->ok_start_degraded) 7386 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7387 mdname(mddev)); 7388 else { 7389 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7390 mdname(mddev)); 7391 goto abort; 7392 } 7393 } 7394 7395 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7396 mdname(mddev), conf->level, 7397 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7398 mddev->new_layout); 7399 7400 print_raid5_conf(conf); 7401 7402 if (conf->reshape_progress != MaxSector) { 7403 conf->reshape_safe = conf->reshape_progress; 7404 atomic_set(&conf->reshape_stripes, 0); 7405 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7406 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7407 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7408 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7409 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7410 "reshape"); 7411 if (!mddev->sync_thread) 7412 goto abort; 7413 } 7414 7415 /* Ok, everything is just fine now */ 7416 if (mddev->to_remove == &raid5_attrs_group) 7417 mddev->to_remove = NULL; 7418 else if (mddev->kobj.sd && 7419 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7420 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7421 mdname(mddev)); 7422 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7423 7424 if (mddev->queue) { 7425 int chunk_size; 7426 /* read-ahead size must cover two whole stripes, which 7427 * is 2 * (number of data disks) * chunksize, where the 7428 * data disk count excludes the parity devices 7429 */ 7430 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7431 int stripe = data_disks * 7432 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7433 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7434 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7435 7436 chunk_size = mddev->chunk_sectors << 9; 7437 blk_queue_io_min(mddev->queue, chunk_size); 7438 blk_queue_io_opt(mddev->queue, chunk_size * 7439 (conf->raid_disks - conf->max_degraded)); 7440 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7441 /* 7442 * We can only discard a whole stripe.
It doesn't make sense to 7443 * discard data disk but write parity disk 7444 */ 7445 stripe = stripe * PAGE_SIZE; 7446 /* Round up to power of 2, as discard handling 7447 * currently assumes that */ 7448 while ((stripe-1) & stripe) 7449 stripe = (stripe | (stripe-1)) + 1; 7450 mddev->queue->limits.discard_alignment = stripe; 7451 mddev->queue->limits.discard_granularity = stripe; 7452 7453 blk_queue_max_write_same_sectors(mddev->queue, 0); 7454 blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 7455 7456 rdev_for_each(rdev, mddev) { 7457 disk_stack_limits(mddev->gendisk, rdev->bdev, 7458 rdev->data_offset << 9); 7459 disk_stack_limits(mddev->gendisk, rdev->bdev, 7460 rdev->new_data_offset << 9); 7461 } 7462 7463 /* 7464 * zeroing is required, otherwise data 7465 * could be lost. Consider a scenario: discard a stripe 7466 * (the stripe could be inconsistent if 7467 * discard_zeroes_data is 0); write one disk of the 7468 * stripe (the stripe could be inconsistent again 7469 * depending on which disks are used to calculate 7470 * parity); the disk is broken; The stripe data of this 7471 * disk is lost. 7472 * 7473 * We only allow DISCARD if the sysadmin has confirmed that 7474 * only safe devices are in use by setting a module parameter. 7475 * A better idea might be to turn DISCARD into WRITE_ZEROES 7476 * requests, as that is required to be safe. 7477 */ 7478 if (devices_handle_discard_safely && 7479 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7480 mddev->queue->limits.discard_granularity >= stripe) 7481 blk_queue_flag_set(QUEUE_FLAG_DISCARD, 7482 mddev->queue); 7483 else 7484 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, 7485 mddev->queue); 7486 7487 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7488 } 7489 7490 if (log_init(conf, journal_dev, raid5_has_ppl(conf))) 7491 goto abort; 7492 7493 return 0; 7494 abort: 7495 md_unregister_thread(&mddev->thread); 7496 print_raid5_conf(conf); 7497 free_conf(conf); 7498 mddev->private = NULL; 7499 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7500 return -EIO; 7501 } 7502 7503 static void raid5_free(struct mddev *mddev, void *priv) 7504 { 7505 struct r5conf *conf = priv; 7506 7507 free_conf(conf); 7508 mddev->to_remove = &raid5_attrs_group; 7509 } 7510 7511 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7512 { 7513 struct r5conf *conf = mddev->private; 7514 int i; 7515 7516 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7517 conf->chunk_sectors / 2, mddev->layout); 7518 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7519 rcu_read_lock(); 7520 for (i = 0; i < conf->raid_disks; i++) { 7521 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7522 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); 7523 } 7524 rcu_read_unlock(); 7525 seq_printf (seq, "]"); 7526 } 7527 7528 static void print_raid5_conf (struct r5conf *conf) 7529 { 7530 int i; 7531 struct disk_info *tmp; 7532 7533 pr_debug("RAID conf printout:\n"); 7534 if (!conf) { 7535 pr_debug("(conf==NULL)\n"); 7536 return; 7537 } 7538 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7539 conf->raid_disks, 7540 conf->raid_disks - conf->mddev->degraded); 7541 7542 for (i = 0; i < conf->raid_disks; i++) { 7543 char b[BDEVNAME_SIZE]; 7544 tmp = conf->disks + i; 7545 if (tmp->rdev) 7546 pr_debug(" disk %d, o:%d, dev:%s\n", 7547 i, !test_bit(Faulty, &tmp->rdev->flags), 7548 bdevname(tmp->rdev->bdev, b)); 7549 } 7550 } 7551 7552 static int raid5_spare_active(struct mddev *mddev) 7553 { 7554 int i; 7555 struct r5conf *conf = mddev->private; 7556 struct disk_info *tmp; 7557 int count = 0; 7558 unsigned long flags; 7559 7560 for (i = 0; i < conf->raid_disks; i++) { 7561 tmp = conf->disks + i; 7562 if (tmp->replacement 7563 && tmp->replacement->recovery_offset == MaxSector 7564 && !test_bit(Faulty, &tmp->replacement->flags) 7565 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7566 /* Replacement has just become active. */ 7567 if (!tmp->rdev 7568 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7569 count++; 7570 if (tmp->rdev) { 7571 /* Replaced device not technically faulty, 7572 * but we need to be sure it gets removed 7573 * and never re-added. 7574 */ 7575 set_bit(Faulty, &tmp->rdev->flags); 7576 sysfs_notify_dirent_safe( 7577 tmp->rdev->sysfs_state); 7578 } 7579 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7580 } else if (tmp->rdev 7581 && tmp->rdev->recovery_offset == MaxSector 7582 && !test_bit(Faulty, &tmp->rdev->flags) 7583 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7584 count++; 7585 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7586 } 7587 } 7588 spin_lock_irqsave(&conf->device_lock, flags); 7589 mddev->degraded = raid5_calc_degraded(conf); 7590 spin_unlock_irqrestore(&conf->device_lock, flags); 7591 print_raid5_conf(conf); 7592 return count; 7593 } 7594 7595 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7596 { 7597 struct r5conf *conf = mddev->private; 7598 int err = 0; 7599 int number = rdev->raid_disk; 7600 struct md_rdev **rdevp; 7601 struct disk_info *p = conf->disks + number; 7602 7603 print_raid5_conf(conf); 7604 if (test_bit(Journal, &rdev->flags) && conf->log) { 7605 /* 7606 * we can't wait pending write here, as this is called in 7607 * raid5d, wait will deadlock. 7608 * neilb: there is no locking about new writes here, 7609 * so this cannot be safe. 7610 */ 7611 if (atomic_read(&conf->active_stripes) || 7612 atomic_read(&conf->r5c_cached_full_stripes) || 7613 atomic_read(&conf->r5c_cached_partial_stripes)) { 7614 return -EBUSY; 7615 } 7616 log_exit(conf); 7617 return 0; 7618 } 7619 if (rdev == p->rdev) 7620 rdevp = &p->rdev; 7621 else if (rdev == p->replacement) 7622 rdevp = &p->replacement; 7623 else 7624 return 0; 7625 7626 if (number >= conf->raid_disks && 7627 conf->reshape_progress == MaxSector) 7628 clear_bit(In_sync, &rdev->flags); 7629 7630 if (test_bit(In_sync, &rdev->flags) || 7631 atomic_read(&rdev->nr_pending)) { 7632 err = -EBUSY; 7633 goto abort; 7634 } 7635 /* Only remove non-faulty devices if recovery 7636 * isn't possible. 
7637 */ 7638 if (!test_bit(Faulty, &rdev->flags) && 7639 mddev->recovery_disabled != conf->recovery_disabled && 7640 !has_failed(conf) && 7641 (!p->replacement || p->replacement == rdev) && 7642 number < conf->raid_disks) { 7643 err = -EBUSY; 7644 goto abort; 7645 } 7646 *rdevp = NULL; 7647 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7648 synchronize_rcu(); 7649 if (atomic_read(&rdev->nr_pending)) { 7650 /* lost the race, try later */ 7651 err = -EBUSY; 7652 *rdevp = rdev; 7653 } 7654 } 7655 if (!err) { 7656 err = log_modify(conf, rdev, false); 7657 if (err) 7658 goto abort; 7659 } 7660 if (p->replacement) { 7661 /* We must have just cleared 'rdev' */ 7662 p->rdev = p->replacement; 7663 clear_bit(Replacement, &p->replacement->flags); 7664 smp_mb(); /* Make sure other CPUs may see both as identical 7665 * but will never see neither - if they are careful 7666 */ 7667 p->replacement = NULL; 7668 7669 if (!err) 7670 err = log_modify(conf, p->rdev, true); 7671 } 7672 7673 clear_bit(WantReplacement, &rdev->flags); 7674 abort: 7675 7676 print_raid5_conf(conf); 7677 return err; 7678 } 7679 7680 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7681 { 7682 struct r5conf *conf = mddev->private; 7683 int err = -EEXIST; 7684 int disk; 7685 struct disk_info *p; 7686 int first = 0; 7687 int last = conf->raid_disks - 1; 7688 7689 if (test_bit(Journal, &rdev->flags)) { 7690 if (conf->log) 7691 return -EBUSY; 7692 7693 rdev->raid_disk = 0; 7694 /* 7695 * The array is in readonly mode if journal is missing, so no 7696 * write requests running. We should be safe 7697 */ 7698 log_init(conf, rdev, false); 7699 return 0; 7700 } 7701 if (mddev->recovery_disabled == conf->recovery_disabled) 7702 return -EBUSY; 7703 7704 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7705 /* no point adding a device */ 7706 return -EINVAL; 7707 7708 if (rdev->raid_disk >= 0) 7709 first = last = rdev->raid_disk; 7710 7711 /* 7712 * find the disk ... but prefer rdev->saved_raid_disk 7713 * if possible. 7714 */ 7715 if (rdev->saved_raid_disk >= 0 && 7716 rdev->saved_raid_disk >= first && 7717 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7718 first = rdev->saved_raid_disk; 7719 7720 for (disk = first; disk <= last; disk++) { 7721 p = conf->disks + disk; 7722 if (p->rdev == NULL) { 7723 clear_bit(In_sync, &rdev->flags); 7724 rdev->raid_disk = disk; 7725 if (rdev->saved_raid_disk != disk) 7726 conf->fullsync = 1; 7727 rcu_assign_pointer(p->rdev, rdev); 7728 7729 err = log_modify(conf, rdev, true); 7730 7731 goto out; 7732 } 7733 } 7734 for (disk = first; disk <= last; disk++) { 7735 p = conf->disks + disk; 7736 if (test_bit(WantReplacement, &p->rdev->flags) && 7737 p->replacement == NULL) { 7738 clear_bit(In_sync, &rdev->flags); 7739 set_bit(Replacement, &rdev->flags); 7740 rdev->raid_disk = disk; 7741 err = 0; 7742 conf->fullsync = 1; 7743 rcu_assign_pointer(p->replacement, rdev); 7744 break; 7745 } 7746 } 7747 out: 7748 print_raid5_conf(conf); 7749 return err; 7750 } 7751 7752 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7753 { 7754 /* no resync is happening, and there is enough space 7755 * on all devices, so we can resize. 7756 * We need to make sure resync covers any new space. 7757 * If the array is shrinking we should possibly wait until 7758 * any io in the removed space completes, but it hardly seems 7759 * worth it. 
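* Note that 'sectors' is the per-device size: it is rounded down to a whole number of chunks, and any bitmap is resized before the new array size is published with md_set_array_sectors().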
7760 */ 7761 sector_t newsize; 7762 struct r5conf *conf = mddev->private; 7763 7764 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 7765 return -EINVAL; 7766 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7767 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7768 if (mddev->external_size && 7769 mddev->array_sectors > newsize) 7770 return -EINVAL; 7771 if (mddev->bitmap) { 7772 int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); 7773 if (ret) 7774 return ret; 7775 } 7776 md_set_array_sectors(mddev, newsize); 7777 if (sectors > mddev->dev_sectors && 7778 mddev->recovery_cp > mddev->dev_sectors) { 7779 mddev->recovery_cp = mddev->dev_sectors; 7780 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7781 } 7782 mddev->dev_sectors = sectors; 7783 mddev->resync_max_sectors = sectors; 7784 return 0; 7785 } 7786 7787 static int check_stripe_cache(struct mddev *mddev) 7788 { 7789 /* Can only proceed if there are plenty of stripe_heads. 7790 * We need a minimum of one full stripe, and for sensible progress 7791 * it is best to have about 4 times that. 7792 * If we require 4 times, then the default 256 4K stripe_heads will 7793 * allow for chunk sizes up to 256K, which is probably OK. 7794 * If the chunk size is greater, user-space should request more 7795 * stripe_heads first. 7796 */ 7797 struct r5conf *conf = mddev->private; 7798 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7799 > conf->min_nr_stripes || 7800 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7801 > conf->min_nr_stripes) { 7802 pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", 7803 mdname(mddev), 7804 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 7805 / STRIPE_SIZE)*4); 7806 return 0; 7807 } 7808 return 1; 7809 } 7810 7811 static int check_reshape(struct mddev *mddev) 7812 { 7813 struct r5conf *conf = mddev->private; 7814 7815 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 7816 return -EINVAL; 7817 if (mddev->delta_disks == 0 && 7818 mddev->new_layout == mddev->layout && 7819 mddev->new_chunk_sectors == mddev->chunk_sectors) 7820 return 0; /* nothing to do */ 7821 if (has_failed(conf)) 7822 return -EINVAL; 7823 if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { 7824 /* We might be able to shrink, but the devices must 7825 * be made bigger first. 7826 * For raid6, 4 is the minimum number of devices.
7827 * Otherwise 2 is the minimum 7828 */ 7829 int min = 2; 7830 if (mddev->level == 6) 7831 min = 4; 7832 if (mddev->raid_disks + mddev->delta_disks < min) 7833 return -EINVAL; 7834 } 7835 7836 if (!check_stripe_cache(mddev)) 7837 return -ENOSPC; 7838 7839 if (mddev->new_chunk_sectors > mddev->chunk_sectors || 7840 mddev->delta_disks > 0) 7841 if (resize_chunks(conf, 7842 conf->previous_raid_disks 7843 + max(0, mddev->delta_disks), 7844 max(mddev->new_chunk_sectors, 7845 mddev->chunk_sectors) 7846 ) < 0) 7847 return -ENOMEM; 7848 7849 if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) 7850 return 0; /* never bother to shrink */ 7851 return resize_stripes(conf, (conf->previous_raid_disks 7852 + mddev->delta_disks)); 7853 } 7854 7855 static int raid5_start_reshape(struct mddev *mddev) 7856 { 7857 struct r5conf *conf = mddev->private; 7858 struct md_rdev *rdev; 7859 int spares = 0; 7860 unsigned long flags; 7861 7862 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7863 return -EBUSY; 7864 7865 if (!check_stripe_cache(mddev)) 7866 return -ENOSPC; 7867 7868 if (has_failed(conf)) 7869 return -EINVAL; 7870 7871 rdev_for_each(rdev, mddev) { 7872 if (!test_bit(In_sync, &rdev->flags) 7873 && !test_bit(Faulty, &rdev->flags)) 7874 spares++; 7875 } 7876 7877 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 7878 /* Not enough devices even to make a degraded array 7879 * of that size 7880 */ 7881 return -EINVAL; 7882 7883 /* Refuse to reduce size of the array. Any reductions in 7884 * array size must be through explicit setting of array_size 7885 * attribute. 7886 */ 7887 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 7888 < mddev->array_sectors) { 7889 pr_warn("md/raid:%s: array size must be reduced before number of disks\n", 7890 mdname(mddev)); 7891 return -EINVAL; 7892 } 7893 7894 atomic_set(&conf->reshape_stripes, 0); 7895 spin_lock_irq(&conf->device_lock); 7896 write_seqcount_begin(&conf->gen_lock); 7897 conf->previous_raid_disks = conf->raid_disks; 7898 conf->raid_disks += mddev->delta_disks; 7899 conf->prev_chunk_sectors = conf->chunk_sectors; 7900 conf->chunk_sectors = mddev->new_chunk_sectors; 7901 conf->prev_algo = conf->algorithm; 7902 conf->algorithm = mddev->new_layout; 7903 conf->generation++; 7904 /* Code that selects data_offset needs to see the generation update 7905 * if reshape_progress has been set - so a memory barrier needed. 7906 */ 7907 smp_mb(); 7908 if (mddev->reshape_backwards) 7909 conf->reshape_progress = raid5_size(mddev, 0, 0); 7910 else 7911 conf->reshape_progress = 0; 7912 conf->reshape_safe = conf->reshape_progress; 7913 write_seqcount_end(&conf->gen_lock); 7914 spin_unlock_irq(&conf->device_lock); 7915 7916 /* Now make sure any requests that proceeded on the assumption 7917 * the reshape wasn't running - like Discard or Read - have 7918 * completed. 7919 */ 7920 mddev_suspend(mddev); 7921 mddev_resume(mddev); 7922 7923 /* Add some new drives, as many as will fit. 7924 * We know there are enough to make the newly sized array work. 7925 * Don't add devices if we are reducing the number of 7926 * devices in the array. This is because it is not possible 7927 * to correctly record the "partially reconstructed" state of 7928 * such devices during the reshape and confusion could result. 
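* A spare that ends up in a slot at or above previous_raid_disks covers only brand-new space, so it can be marked In_sync immediately; spares in lower slots must be rebuilt and so start with recovery_offset 0.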
7929 */ 7930 if (mddev->delta_disks >= 0) { 7931 rdev_for_each(rdev, mddev) 7932 if (rdev->raid_disk < 0 && 7933 !test_bit(Faulty, &rdev->flags)) { 7934 if (raid5_add_disk(mddev, rdev) == 0) { 7935 if (rdev->raid_disk 7936 >= conf->previous_raid_disks) 7937 set_bit(In_sync, &rdev->flags); 7938 else 7939 rdev->recovery_offset = 0; 7940 7941 if (sysfs_link_rdev(mddev, rdev)) 7942 /* Failure here is OK */; 7943 } 7944 } else if (rdev->raid_disk >= conf->previous_raid_disks 7945 && !test_bit(Faulty, &rdev->flags)) { 7946 /* This is a spare that was manually added */ 7947 set_bit(In_sync, &rdev->flags); 7948 } 7949 7950 /* When a reshape changes the number of devices, 7951 * ->degraded is measured against the larger of the 7952 * pre and post number of devices. 7953 */ 7954 spin_lock_irqsave(&conf->device_lock, flags); 7955 mddev->degraded = raid5_calc_degraded(conf); 7956 spin_unlock_irqrestore(&conf->device_lock, flags); 7957 } 7958 mddev->raid_disks = conf->raid_disks; 7959 mddev->reshape_position = conf->reshape_progress; 7960 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7961 7962 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7963 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7964 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7965 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7966 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7967 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7968 "reshape"); 7969 if (!mddev->sync_thread) { 7970 mddev->recovery = 0; 7971 spin_lock_irq(&conf->device_lock); 7972 write_seqcount_begin(&conf->gen_lock); 7973 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 7974 mddev->new_chunk_sectors = 7975 conf->chunk_sectors = conf->prev_chunk_sectors; 7976 mddev->new_layout = conf->algorithm = conf->prev_algo; 7977 rdev_for_each(rdev, mddev) 7978 rdev->new_data_offset = rdev->data_offset; 7979 smp_wmb(); 7980 conf->generation --; 7981 conf->reshape_progress = MaxSector; 7982 mddev->reshape_position = MaxSector; 7983 write_seqcount_end(&conf->gen_lock); 7984 spin_unlock_irq(&conf->device_lock); 7985 return -EAGAIN; 7986 } 7987 conf->reshape_checkpoint = jiffies; 7988 md_wakeup_thread(mddev->sync_thread); 7989 md_new_event(mddev); 7990 return 0; 7991 } 7992 7993 /* This is called from the reshape thread and should make any 7994 * changes needed in 'conf' 7995 */ 7996 static void end_reshape(struct r5conf *conf) 7997 { 7998 7999 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 8000 struct md_rdev *rdev; 8001 8002 spin_lock_irq(&conf->device_lock); 8003 conf->previous_raid_disks = conf->raid_disks; 8004 md_finish_reshape(conf->mddev); 8005 smp_wmb(); 8006 conf->reshape_progress = MaxSector; 8007 conf->mddev->reshape_position = MaxSector; 8008 rdev_for_each(rdev, conf->mddev) 8009 if (rdev->raid_disk >= 0 && 8010 !test_bit(Journal, &rdev->flags) && 8011 !test_bit(In_sync, &rdev->flags)) 8012 rdev->recovery_offset = MaxSector; 8013 spin_unlock_irq(&conf->device_lock); 8014 wake_up(&conf->wait_for_overlap); 8015 8016 /* read-ahead size must cover two whole stripes, which is 8017 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices 8018 */ 8019 if (conf->mddev->queue) { 8020 int data_disks = conf->raid_disks - conf->max_degraded; 8021 int stripe = data_disks * ((conf->chunk_sectors << 9) 8022 / PAGE_SIZE); 8023 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 8024 conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 8025 } 8026 } 8027 } 8028 8029 /* This is called from the raid5d thread with 
mddev_lock held. 8030 * It makes config changes to the device. 8031 */ 8032 static void raid5_finish_reshape(struct mddev *mddev) 8033 { 8034 struct r5conf *conf = mddev->private; 8035 8036 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8037 8038 if (mddev->delta_disks <= 0) { 8039 int d; 8040 spin_lock_irq(&conf->device_lock); 8041 mddev->degraded = raid5_calc_degraded(conf); 8042 spin_unlock_irq(&conf->device_lock); 8043 for (d = conf->raid_disks ; 8044 d < conf->raid_disks - mddev->delta_disks; 8045 d++) { 8046 struct md_rdev *rdev = conf->disks[d].rdev; 8047 if (rdev) 8048 clear_bit(In_sync, &rdev->flags); 8049 rdev = conf->disks[d].replacement; 8050 if (rdev) 8051 clear_bit(In_sync, &rdev->flags); 8052 } 8053 } 8054 mddev->layout = conf->algorithm; 8055 mddev->chunk_sectors = conf->chunk_sectors; 8056 mddev->reshape_position = MaxSector; 8057 mddev->delta_disks = 0; 8058 mddev->reshape_backwards = 0; 8059 } 8060 } 8061 8062 static void raid5_quiesce(struct mddev *mddev, int quiesce) 8063 { 8064 struct r5conf *conf = mddev->private; 8065 8066 if (quiesce) { 8067 /* stop all writes */ 8068 lock_all_device_hash_locks_irq(conf); 8069 /* '2' tells resync/reshape to pause so that all 8070 * active stripes can drain 8071 */ 8072 r5c_flush_cache(conf, INT_MAX); 8073 conf->quiesce = 2; 8074 wait_event_cmd(conf->wait_for_quiescent, 8075 atomic_read(&conf->active_stripes) == 0 && 8076 atomic_read(&conf->active_aligned_reads) == 0, 8077 unlock_all_device_hash_locks_irq(conf), 8078 lock_all_device_hash_locks_irq(conf)); 8079 conf->quiesce = 1; 8080 unlock_all_device_hash_locks_irq(conf); 8081 /* allow reshape to continue */ 8082 wake_up(&conf->wait_for_overlap); 8083 } else { 8084 /* re-enable writes */ 8085 lock_all_device_hash_locks_irq(conf); 8086 conf->quiesce = 0; 8087 wake_up(&conf->wait_for_quiescent); 8088 wake_up(&conf->wait_for_overlap); 8089 unlock_all_device_hash_locks_irq(conf); 8090 } 8091 log_quiesce(conf, quiesce); 8092 } 8093 8094 static void *raid45_takeover_raid0(struct mddev *mddev, int level) 8095 { 8096 struct r0conf *raid0_conf = mddev->private; 8097 sector_t sectors; 8098 8099 /* for raid0 takeover only one zone is supported */ 8100 if (raid0_conf->nr_strip_zones > 1) { 8101 pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", 8102 mdname(mddev)); 8103 return ERR_PTR(-EINVAL); 8104 } 8105 8106 sectors = raid0_conf->strip_zone[0].zone_end; 8107 sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); 8108 mddev->dev_sectors = sectors; 8109 mddev->new_level = level; 8110 mddev->new_layout = ALGORITHM_PARITY_N; 8111 mddev->new_chunk_sectors = mddev->chunk_sectors; 8112 mddev->raid_disks += 1; 8113 mddev->delta_disks = 1; 8114 /* make sure it will be not marked as dirty */ 8115 mddev->recovery_cp = MaxSector; 8116 8117 return setup_conf(mddev); 8118 } 8119 8120 static void *raid5_takeover_raid1(struct mddev *mddev) 8121 { 8122 int chunksect; 8123 void *ret; 8124 8125 if (mddev->raid_disks != 2 || 8126 mddev->degraded > 1) 8127 return ERR_PTR(-EINVAL); 8128 8129 /* Should check if there are write-behind devices? 
*/ 8130 8131 chunksect = 64*2; /* 64K by default */ 8132 8133 /* The array must be an exact multiple of chunksize */ 8134 while (chunksect && (mddev->array_sectors & (chunksect-1))) 8135 chunksect >>= 1; 8136 8137 if ((chunksect<<9) < STRIPE_SIZE) 8138 /* array size does not allow a suitable chunk size */ 8139 return ERR_PTR(-EINVAL); 8140 8141 mddev->new_level = 5; 8142 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 8143 mddev->new_chunk_sectors = chunksect; 8144 8145 ret = setup_conf(mddev); 8146 if (!IS_ERR(ret)) 8147 mddev_clear_unsupported_flags(mddev, 8148 UNSUPPORTED_MDDEV_FLAGS); 8149 return ret; 8150 } 8151 8152 static void *raid5_takeover_raid6(struct mddev *mddev) 8153 { 8154 int new_layout; 8155 8156 switch (mddev->layout) { 8157 case ALGORITHM_LEFT_ASYMMETRIC_6: 8158 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 8159 break; 8160 case ALGORITHM_RIGHT_ASYMMETRIC_6: 8161 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 8162 break; 8163 case ALGORITHM_LEFT_SYMMETRIC_6: 8164 new_layout = ALGORITHM_LEFT_SYMMETRIC; 8165 break; 8166 case ALGORITHM_RIGHT_SYMMETRIC_6: 8167 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 8168 break; 8169 case ALGORITHM_PARITY_0_6: 8170 new_layout = ALGORITHM_PARITY_0; 8171 break; 8172 case ALGORITHM_PARITY_N: 8173 new_layout = ALGORITHM_PARITY_N; 8174 break; 8175 default: 8176 return ERR_PTR(-EINVAL); 8177 } 8178 mddev->new_level = 5; 8179 mddev->new_layout = new_layout; 8180 mddev->delta_disks = -1; 8181 mddev->raid_disks -= 1; 8182 return setup_conf(mddev); 8183 } 8184 8185 static int raid5_check_reshape(struct mddev *mddev) 8186 { 8187 /* For a 2-drive array, the layout and chunk size can be changed 8188 * immediately as not restriping is needed. 8189 * For larger arrays we record the new value - after validation 8190 * to be used by a reshape pass. 8191 */ 8192 struct r5conf *conf = mddev->private; 8193 int new_chunk = mddev->new_chunk_sectors; 8194 8195 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 8196 return -EINVAL; 8197 if (new_chunk > 0) { 8198 if (!is_power_of_2(new_chunk)) 8199 return -EINVAL; 8200 if (new_chunk < (PAGE_SIZE>>9)) 8201 return -EINVAL; 8202 if (mddev->array_sectors & (new_chunk-1)) 8203 /* not factor of array size */ 8204 return -EINVAL; 8205 } 8206 8207 /* They look valid */ 8208 8209 if (mddev->raid_disks == 2) { 8210 /* can make the change immediately */ 8211 if (mddev->new_layout >= 0) { 8212 conf->algorithm = mddev->new_layout; 8213 mddev->layout = mddev->new_layout; 8214 } 8215 if (new_chunk > 0) { 8216 conf->chunk_sectors = new_chunk ; 8217 mddev->chunk_sectors = new_chunk; 8218 } 8219 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8220 md_wakeup_thread(mddev->thread); 8221 } 8222 return check_reshape(mddev); 8223 } 8224 8225 static int raid6_check_reshape(struct mddev *mddev) 8226 { 8227 int new_chunk = mddev->new_chunk_sectors; 8228 8229 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 8230 return -EINVAL; 8231 if (new_chunk > 0) { 8232 if (!is_power_of_2(new_chunk)) 8233 return -EINVAL; 8234 if (new_chunk < (PAGE_SIZE >> 9)) 8235 return -EINVAL; 8236 if (mddev->array_sectors & (new_chunk-1)) 8237 /* not factor of array size */ 8238 return -EINVAL; 8239 } 8240 8241 /* They look valid */ 8242 return check_reshape(mddev); 8243 } 8244 8245 static void *raid5_takeover(struct mddev *mddev) 8246 { 8247 /* raid5 can take over: 8248 * raid0 - if there is only one strip zone - make it a raid4 layout 8249 * raid1 - if there are two drives. 
We need to know the chunk size 8250 * raid4 - trivial - just use a raid4 layout. 8251 * raid6 - Providing it is a *_6 layout 8252 */ 8253 if (mddev->level == 0) 8254 return raid45_takeover_raid0(mddev, 5); 8255 if (mddev->level == 1) 8256 return raid5_takeover_raid1(mddev); 8257 if (mddev->level == 4) { 8258 mddev->new_layout = ALGORITHM_PARITY_N; 8259 mddev->new_level = 5; 8260 return setup_conf(mddev); 8261 } 8262 if (mddev->level == 6) 8263 return raid5_takeover_raid6(mddev); 8264 8265 return ERR_PTR(-EINVAL); 8266 } 8267 8268 static void *raid4_takeover(struct mddev *mddev) 8269 { 8270 /* raid4 can take over: 8271 * raid0 - if there is only one strip zone 8272 * raid5 - if layout is right 8273 */ 8274 if (mddev->level == 0) 8275 return raid45_takeover_raid0(mddev, 4); 8276 if (mddev->level == 5 && 8277 mddev->layout == ALGORITHM_PARITY_N) { 8278 mddev->new_layout = 0; 8279 mddev->new_level = 4; 8280 return setup_conf(mddev); 8281 } 8282 return ERR_PTR(-EINVAL); 8283 } 8284 8285 static struct md_personality raid5_personality; 8286 8287 static void *raid6_takeover(struct mddev *mddev) 8288 { 8289 /* Currently can only take over a raid5. We map the 8290 * personality to an equivalent raid6 personality 8291 * with the Q block at the end. 8292 */ 8293 int new_layout; 8294 8295 if (mddev->pers != &raid5_personality) 8296 return ERR_PTR(-EINVAL); 8297 if (mddev->degraded > 1) 8298 return ERR_PTR(-EINVAL); 8299 if (mddev->raid_disks > 253) 8300 return ERR_PTR(-EINVAL); 8301 if (mddev->raid_disks < 3) 8302 return ERR_PTR(-EINVAL); 8303 8304 switch (mddev->layout) { 8305 case ALGORITHM_LEFT_ASYMMETRIC: 8306 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 8307 break; 8308 case ALGORITHM_RIGHT_ASYMMETRIC: 8309 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 8310 break; 8311 case ALGORITHM_LEFT_SYMMETRIC: 8312 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 8313 break; 8314 case ALGORITHM_RIGHT_SYMMETRIC: 8315 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 8316 break; 8317 case ALGORITHM_PARITY_0: 8318 new_layout = ALGORITHM_PARITY_0_6; 8319 break; 8320 case ALGORITHM_PARITY_N: 8321 new_layout = ALGORITHM_PARITY_N; 8322 break; 8323 default: 8324 return ERR_PTR(-EINVAL); 8325 } 8326 mddev->new_level = 6; 8327 mddev->new_layout = new_layout; 8328 mddev->delta_disks = 1; 8329 mddev->raid_disks += 1; 8330 return setup_conf(mddev); 8331 } 8332 8333 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) 8334 { 8335 struct r5conf *conf; 8336 int err; 8337 8338 err = mddev_lock(mddev); 8339 if (err) 8340 return err; 8341 conf = mddev->private; 8342 if (!conf) { 8343 mddev_unlock(mddev); 8344 return -ENODEV; 8345 } 8346 8347 if (strncmp(buf, "ppl", 3) == 0) { 8348 /* ppl only works with RAID 5 */ 8349 if (!raid5_has_ppl(conf) && conf->level == 5) { 8350 err = log_init(conf, NULL, true); 8351 if (!err) { 8352 err = resize_stripes(conf, conf->pool_size); 8353 if (err) 8354 log_exit(conf); 8355 } 8356 } else 8357 err = -EINVAL; 8358 } else if (strncmp(buf, "resync", 6) == 0) { 8359 if (raid5_has_ppl(conf)) { 8360 mddev_suspend(mddev); 8361 log_exit(conf); 8362 mddev_resume(mddev); 8363 err = resize_stripes(conf, conf->pool_size); 8364 } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && 8365 r5l_log_disk_error(conf)) { 8366 bool journal_dev_exists = false; 8367 struct md_rdev *rdev; 8368 8369 rdev_for_each(rdev, mddev) 8370 if (test_bit(Journal, &rdev->flags)) { 8371 journal_dev_exists = true; 8372 break; 8373 } 8374 8375 if (!journal_dev_exists) { 8376 mddev_suspend(mddev); 8377 
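/* Clearing MD_HAS_JOURNAL is done while the array is suspended so that no I/O is in flight while the consistency policy changes; md_update_sb() below then persists the updated flags. */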
clear_bit(MD_HAS_JOURNAL, &mddev->flags); 8378 mddev_resume(mddev); 8379 } else /* need remove journal device first */ 8380 err = -EBUSY; 8381 } else 8382 err = -EINVAL; 8383 } else { 8384 err = -EINVAL; 8385 } 8386 8387 if (!err) 8388 md_update_sb(mddev, 1); 8389 8390 mddev_unlock(mddev); 8391 8392 return err; 8393 } 8394 8395 static int raid5_start(struct mddev *mddev) 8396 { 8397 struct r5conf *conf = mddev->private; 8398 8399 return r5l_start(conf->log); 8400 } 8401 8402 static struct md_personality raid6_personality = 8403 { 8404 .name = "raid6", 8405 .level = 6, 8406 .owner = THIS_MODULE, 8407 .make_request = raid5_make_request, 8408 .run = raid5_run, 8409 .start = raid5_start, 8410 .free = raid5_free, 8411 .status = raid5_status, 8412 .error_handler = raid5_error, 8413 .hot_add_disk = raid5_add_disk, 8414 .hot_remove_disk= raid5_remove_disk, 8415 .spare_active = raid5_spare_active, 8416 .sync_request = raid5_sync_request, 8417 .resize = raid5_resize, 8418 .size = raid5_size, 8419 .check_reshape = raid6_check_reshape, 8420 .start_reshape = raid5_start_reshape, 8421 .finish_reshape = raid5_finish_reshape, 8422 .quiesce = raid5_quiesce, 8423 .takeover = raid6_takeover, 8424 .congested = raid5_congested, 8425 .change_consistency_policy = raid5_change_consistency_policy, 8426 }; 8427 static struct md_personality raid5_personality = 8428 { 8429 .name = "raid5", 8430 .level = 5, 8431 .owner = THIS_MODULE, 8432 .make_request = raid5_make_request, 8433 .run = raid5_run, 8434 .start = raid5_start, 8435 .free = raid5_free, 8436 .status = raid5_status, 8437 .error_handler = raid5_error, 8438 .hot_add_disk = raid5_add_disk, 8439 .hot_remove_disk= raid5_remove_disk, 8440 .spare_active = raid5_spare_active, 8441 .sync_request = raid5_sync_request, 8442 .resize = raid5_resize, 8443 .size = raid5_size, 8444 .check_reshape = raid5_check_reshape, 8445 .start_reshape = raid5_start_reshape, 8446 .finish_reshape = raid5_finish_reshape, 8447 .quiesce = raid5_quiesce, 8448 .takeover = raid5_takeover, 8449 .congested = raid5_congested, 8450 .change_consistency_policy = raid5_change_consistency_policy, 8451 }; 8452 8453 static struct md_personality raid4_personality = 8454 { 8455 .name = "raid4", 8456 .level = 4, 8457 .owner = THIS_MODULE, 8458 .make_request = raid5_make_request, 8459 .run = raid5_run, 8460 .start = raid5_start, 8461 .free = raid5_free, 8462 .status = raid5_status, 8463 .error_handler = raid5_error, 8464 .hot_add_disk = raid5_add_disk, 8465 .hot_remove_disk= raid5_remove_disk, 8466 .spare_active = raid5_spare_active, 8467 .sync_request = raid5_sync_request, 8468 .resize = raid5_resize, 8469 .size = raid5_size, 8470 .check_reshape = raid5_check_reshape, 8471 .start_reshape = raid5_start_reshape, 8472 .finish_reshape = raid5_finish_reshape, 8473 .quiesce = raid5_quiesce, 8474 .takeover = raid4_takeover, 8475 .congested = raid5_congested, 8476 .change_consistency_policy = raid5_change_consistency_policy, 8477 }; 8478 8479 static int __init raid5_init(void) 8480 { 8481 int ret; 8482 8483 raid5_wq = alloc_workqueue("raid5wq", 8484 WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); 8485 if (!raid5_wq) 8486 return -ENOMEM; 8487 8488 ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, 8489 "md/raid5:prepare", 8490 raid456_cpu_up_prepare, 8491 raid456_cpu_dead); 8492 if (ret) { 8493 destroy_workqueue(raid5_wq); 8494 return ret; 8495 } 8496 register_md_personality(&raid6_personality); 8497 register_md_personality(&raid5_personality); 8498 register_md_personality(&raid4_personality); 8499 
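/* register_md_personality() does not fail in practice (it only links the personality into md's global list), so its return value is ignored here. */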
return 0; 8500 } 8501 8502 static void raid5_exit(void) 8503 { 8504 unregister_md_personality(&raid6_personality); 8505 unregister_md_personality(&raid5_personality); 8506 unregister_md_personality(&raid4_personality); 8507 cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); 8508 destroy_workqueue(raid5_wq); 8509 } 8510 8511 module_init(raid5_init); 8512 module_exit(raid5_exit); 8513 MODULE_LICENSE("GPL"); 8514 MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); 8515 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 8516 MODULE_ALIAS("md-raid5"); 8517 MODULE_ALIAS("md-raid4"); 8518 MODULE_ALIAS("md-level-5"); 8519 MODULE_ALIAS("md-level-4"); 8520 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 8521 MODULE_ALIAS("md-raid6"); 8522 MODULE_ALIAS("md-level-6"); 8523 8524 /* This used to be two separate modules, they were: */ 8525 MODULE_ALIAS("raid5"); 8526 MODULE_ALIAS("raid6"); 8527
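/* The aliases above allow this module to be loaded under any of its historical names (e.g. "md-raid6", "md-level-5", or the pre-merge "raid5"/"raid6" module names) when a matching personality is requested. */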