/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
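 *
 * A worked example (illustrative, not part of the original text):
 * suppose seq_write == 4 and seq_flush == 6.  A write recorded now in
 * add_stripe_bio gets sh->bm_seq = 7 (seq_flush + 1).  Any stripe whose
 * bm_seq exceeds seq_write is parked on conf->bitmap_list; once the
 * pending bitmap updates reach disk and seq_write advances to 6,
 * stripes with bm_seq <= 6 may proceed.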
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	local_irq_disable();
	spin_lock(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
		spin_unlock(conf->hash_locks + i - 1);
	local_irq_enable();
}

/* Find the first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from the first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1. This helper does that mapping.
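 *
 * For example (illustrative): with 6 devices in the md layout, if
 * pd_idx == 4 and qd_idx == 5 then raid6_d0() returns 0 and the walk
 * maps data disks 0..3 to slots 0..3, the parity disk to slot 4
 * (raid_disks-2) and the Q disk to slot 5 (raid_disks-1).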
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

static void return_io(struct bio_list *return_bi)
{
	struct bio *bi;
	while ((bi = bio_list_pop(return_bi)) != NULL) {
		bi->bi_iter.bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi);
	}
}

static void print_raid5_conf(struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes) == 0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
	 * data in journal, so they are not released to cached lists
	 */
	if (conf->quiesce && r5c_is_writeback(conf->log) &&
	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				list_add_tail(&sh->lru, &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else {
					/* partial stripe */
					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
							      &sh->state))
						atomic_inc(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
				}
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * If @hash is NR_STRIPE_HASH_LOCKS, then @temp_inactive_list is an array
 * of NR_STRIPE_HASH_LOCKS inactive lists, one per hash lock.
 *
 * Be careful: only one task can add/delete stripes from temp_inactive_list
 * at a given time. Adding stripes only takes the device lock, while deleting
 * stripes only takes the hash lock.
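 *
 * Concretely (derived from the code below, for illustration): stripes
 * are put on temp_inactive_list under conf->device_lock
 * (do_release_stripe()) and later spliced onto conf->inactive_list
 * under the per-hash lock (release_inactive_stripe_list()), so neither
 * side ever needs both locks at once.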
 */
static void release_inactive_stripe_list(struct r5conf *conf,
					 struct list_head *temp_inactive_list,
					 int hash)
{
	int size;
	bool do_wakeup = false;
	unsigned long flags;

	if (hash == NR_STRIPE_HASH_LOCKS) {
		size = NR_STRIPE_HASH_LOCKS;
		hash = NR_STRIPE_HASH_LOCKS - 1;
	} else
		size = 1;
	while (size) {
		struct list_head *list = &temp_inactive_list[size - 1];

		/*
		 * We don't hold any lock here yet, raid5_get_active_stripe()
		 * might remove stripes from the list
		 */
		if (!list_empty_careful(list)) {
			spin_lock_irqsave(conf->hash_locks + hash, flags);
			if (list_empty(conf->inactive_list + hash) &&
			    !list_empty(list))
				atomic_dec(&conf->empty_inactive_list_nr);
			list_splice_tail_init(list, conf->inactive_list + hash);
			do_wakeup = true;
			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
		}
		size--;
		hash--;
	}

	if (do_wakeup) {
		wake_up(&conf->wait_for_stripe);
		if (atomic_read(&conf->active_stripes) == 0)
			wake_up(&conf->wait_for_quiescent);
		if (conf->retry_read_aligned)
			md_wakeup_thread(conf->mddev->thread);
	}
}

/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
			       struct list_head *temp_inactive_list)
{
	struct stripe_head *sh;
	int count = 0;
	struct llist_node *head;

	head = llist_del_all(&conf->released_stripes);
	head = llist_reverse_order(head);
	while (head) {
		int hash;

		sh = llist_entry(head, struct stripe_head, release_list);
		head = llist_next(head);
		/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleared */
		smp_mb();
		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
		/*
		 * Don't worry if the bit gets set again here, because in that
		 * case the count is always > 1. This is true for the
		 * STRIPE_ON_UNPLUG_LIST bit too.
		 */
		hash = sh->hash_lock_index;
		__release_stripe(conf, sh, &temp_inactive_list[hash]);
		count++;
	}

	return count;
}

void raid5_release_stripe(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	unsigned long flags;
	struct list_head list;
	int hash;
	bool wakeup;

	/* Avoid release_list until the last reference.
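	 *
	 * atomic_add_unless(&sh->count, -1, 1) below drops the reference
	 * without taking any lock as long as it is not the last one; only
	 * the final put falls through to the release_list or the locked
	 * slow path.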
	 */
	if (atomic_add_unless(&sh->count, -1, 1))
		return;

	if (unlikely(!conf->mddev->thread) ||
	    test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
		goto slow_path;
	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
	if (wakeup)
		md_wakeup_thread(conf->mddev->thread);
	return;
slow_path:
	local_irq_save(flags);
	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
		INIT_LIST_HEAD(&list);
		hash = sh->hash_lock_index;
		do_release_stripe(conf, sh, &list);
		spin_unlock(&conf->device_lock);
		release_inactive_stripe_list(conf, &list, hash);
	}
	local_irq_restore(flags);
}

static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		 (unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		 (unsigned long long)sh->sector);

	hlist_add_head(&sh->hash, hp);
}

/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	if (list_empty(conf->inactive_list + hash))
		goto out;
	first = (conf->inactive_list + hash)->next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
	BUG_ON(hash != sh->hash_lock_index);
	if (list_empty(conf->inactive_list + hash))
		atomic_inc(&conf->empty_inactive_list_nr);
out:
	return sh;
}

static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(gfp)))
			return 1;
		sh->dev[i].page = page;
		sh->dev[i].orig_page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
			   struct stripe_head *sh);

static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	struct r5conf *conf = sh->raid_conf;
	int i, seq;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));
	BUG_ON(sh->batch_head);

	pr_debug("init_stripe called, stripe %llu\n",
		 (unsigned long long)sector);
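	/*
	 * Illustrative note: conf->gen_lock is a seqcount. If a reshape
	 * bumps the generation while we are initialising, the
	 * read_seqcount_retry() below notices and we redo the setup with
	 * a consistent generation/geometry pair.
	 */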
retry:
	seq = read_seqcount_begin(&conf->gen_lock);
	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			WARN_ON(1);
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	if (read_seqcount_retry(&conf->gen_lock, seq))
		goto retry;
	sh->overwrite_disks = 0;
	insert_hash(conf, sh);
	sh->cpu = smp_processor_id();
	set_bit(STRIPE_BATCH_READY, &sh->state);
}

static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;

	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be in_sync in the section most affected by failed devices.
 */
int raid5_calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (conf->raid_disks == conf->previous_raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = rcu_dereference(conf->disks[i].replacement);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
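			 *
			 * For example (illustrative): when growing a 4-disk
			 * array to 5 disks, this 'after' section only covers
			 * stripes the reshape has already written out, so a
			 * not-yet-in_sync device does not make that section
			 * degraded; when shrinking, it almost certainly
			 * does.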
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded2++;
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}

static int has_failed(struct r5conf *conf)
{
	int degraded;

	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	degraded = raid5_calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
			int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;
	int hash = stripe_hash_locks_hash(sector);
	int inc_empty_inactive_list_flag;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(conf->hash_locks + hash);

	do {
		wait_event_lock_irq(conf->wait_for_quiescent,
				    conf->quiesce == 0 || noquiesce,
				    *(conf->hash_locks + hash));
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
				sh = get_free_stripe(conf, hash);
				if (!sh && !test_bit(R5_DID_ALLOC,
						     &conf->cache_state))
					set_bit(R5_ALLOC_MORE,
						&conf->cache_state);
			}
			if (noblock && sh == NULL)
				break;

			r5c_check_stripe_cache_usage(conf);
			if (!sh) {
				set_bit(R5_INACTIVE_BLOCKED,
					&conf->cache_state);
				r5l_wake_reclaim(conf->log, 0);
				wait_event_lock_irq(
					conf->wait_for_stripe,
					!list_empty(conf->inactive_list + hash) &&
					(atomic_read(&conf->active_stripes)
					 < (conf->max_nr_stripes * 3 / 4)
					 || !test_bit(R5_INACTIVE_BLOCKED,
						      &conf->cache_state)),
					*(conf->hash_locks + hash));
				clear_bit(R5_INACTIVE_BLOCKED,
					  &conf->cache_state);
			} else {
				init_stripe(sh, sector, previous);
				atomic_inc(&sh->count);
			}
		} else if (!atomic_inc_not_zero(&sh->count)) {
			spin_lock(&conf->device_lock);
			if (!atomic_read(&sh->count)) {
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				BUG_ON(list_empty(&sh->lru) &&
				       !test_bit(STRIPE_EXPANDING, &sh->state));
				inc_empty_inactive_list_flag = 0;
				if (!list_empty(conf->inactive_list + hash))
					inc_empty_inactive_list_flag = 1;
				list_del_init(&sh->lru);
				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
					atomic_inc(&conf->empty_inactive_list_nr);
				if (sh->group) {
					sh->group->stripes_cnt--;
					sh->group = NULL;
				}
			}
			atomic_inc(&sh->count);
			spin_unlock(&conf->device_lock);
		}
	} while (sh == NULL);

	spin_unlock_irq(conf->hash_locks + hash);
	return sh;
}

static bool is_full_stripe_write(struct stripe_head *sh)
{
	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
}

static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
	/* lock in address order to avoid an ABBA deadlock between callers */
	local_irq_disable();
	if (sh1 > sh2) {
		spin_lock(&sh2->stripe_lock);
		spin_lock_nested(&sh1->stripe_lock, 1);
	} else {
		spin_lock(&sh1->stripe_lock);
		spin_lock_nested(&sh2->stripe_lock, 1);
	}
}

static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
	spin_unlock(&sh1->stripe_lock);
	spin_unlock(&sh2->stripe_lock);
	local_irq_enable();
}
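
/*
 * Stripe batching, in brief (an illustrative summary, not from the
 * original source): consecutive full-stripe writes within one chunk are
 * chained onto a head stripe's batch_list so that parity computation
 * and I/O submission can walk the whole batch at once; see the
 * 'goto again' loops in ops_run_io() and ops_run_biodrain().
 */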

/*
 * Only a freshly created, full-stripe, normal-write stripe can be
 * added to a batch list.
 */
static bool stripe_can_batch(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;

	if (conf->log)
		return false;
	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
	       !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
	       is_full_stripe_write(sh);
}

/* we only do back search */
static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
{
	struct stripe_head *head;
	sector_t head_sector, tmp_sec;
	int hash;
	int dd_idx;
	int inc_empty_inactive_list_flag;

	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
	tmp_sec = sh->sector;
	if (!sector_div(tmp_sec, conf->chunk_sectors))
		return;
	head_sector = sh->sector - STRIPE_SECTORS;

	hash = stripe_hash_locks_hash(head_sector);
	spin_lock_irq(conf->hash_locks + hash);
	head = __find_stripe(conf, head_sector, conf->generation);
	if (head && !atomic_inc_not_zero(&head->count)) {
		spin_lock(&conf->device_lock);
		if (!atomic_read(&head->count)) {
			if (!test_bit(STRIPE_HANDLE, &head->state))
				atomic_inc(&conf->active_stripes);
			BUG_ON(list_empty(&head->lru) &&
			       !test_bit(STRIPE_EXPANDING, &head->state));
			inc_empty_inactive_list_flag = 0;
			if (!list_empty(conf->inactive_list + hash))
				inc_empty_inactive_list_flag = 1;
			list_del_init(&head->lru);
			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
				atomic_inc(&conf->empty_inactive_list_nr);
			if (head->group) {
				head->group->stripes_cnt--;
				head->group = NULL;
			}
		}
		atomic_inc(&head->count);
		spin_unlock(&conf->device_lock);
	}
	spin_unlock_irq(conf->hash_locks + hash);

	if (!head)
		return;
	if (!stripe_can_batch(head))
		goto out;

	lock_two_stripes(head, sh);
	/* clear_batch_ready() clears the flag */
	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
		goto unlock_out;

	if (sh->batch_head)
		goto unlock_out;

	dd_idx = 0;
	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
		dd_idx++;
	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
		goto unlock_out;

	if (head->batch_head) {
		spin_lock(&head->batch_head->batch_lock);
		/* This batch list is already running */
		if (!stripe_can_batch(head)) {
			spin_unlock(&head->batch_head->batch_lock);
			goto unlock_out;
		}

		/*
		 * at this point, head's BATCH_READY could be cleared, but we
		 * can still add the stripe to batch list
		 */
		list_add(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_head->batch_lock);

		sh->batch_head = head->batch_head;
	} else {
		head->batch_head = head;
		sh->batch_head = head->batch_head;
		spin_lock(&head->batch_lock);
		list_add_tail(&sh->batch_list, &head->batch_list);
		spin_unlock(&head->batch_lock);
	}

	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		if (atomic_dec_return(&conf->preread_active_stripes)
		    < IO_THRESHOLD)
			md_wakeup_thread(conf->mddev->thread);

	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
		int seq = sh->bm_seq;
		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
		    sh->batch_head->bm_seq > seq)
			seq = sh->batch_head->bm_seq;
		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
		sh->batch_head->bm_seq = seq;
	}

	atomic_inc(&sh->count);

unlock_out:
	unlock_two_stripes(head, sh);
out:
	raid5_release_stripe(head);
}

/* Determine if 'data_offset' or 'new_data_offset' should be used
 * in this stripe_head.
 */
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
	sector_t progress = conf->reshape_progress;
	/* Need a memory barrier to make sure we see the value
	 * of conf->generation, or ->data_offset that was set before
	 * reshape_progress was updated.
	 */
	smp_rmb();
	if (progress == MaxSector)
		return 0;
	if (sh->generation == conf->generation - 1)
		return 0;
	/* We are in a reshape, and this is a new-generation stripe,
	 * so use new_data_offset.
	 */
	return 1;
}

static void raid5_end_read_request(struct bio *bi);
static void raid5_end_write_request(struct bio *bi);

static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int i, disks = sh->disks;
	struct stripe_head *head_sh = sh;

	might_sleep();

	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/* writing out phase */
		if (s->waiting_extra_page)
			return;
		if (r5l_write_stripe(conf->log, sh) == 0)
			return;
	} else { /* caching phase */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
			r5c_cache_data(conf->log, sh, s);
			return;
		}
	}

	for (i = disks; i--; ) {
		int op, op_flags = 0;
		int replace_only = 0;
		struct bio *bi, *rbi;
		struct md_rdev *rdev, *rrdev = NULL;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				op_flags = REQ_FUA;
			if (test_bit(R5_Discard, &sh->dev[i].flags))
				op = REQ_OP_DISCARD;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			op = REQ_OP_READ;
		else if (test_and_clear_bit(R5_WantReplace,
					    &sh->dev[i].flags)) {
			op = REQ_OP_WRITE;
			replace_only = 1;
		} else
			continue;
		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
			op_flags |= REQ_SYNC;

again:
		bi = &sh->dev[i].req;
		rbi = &sh->dev[i].rreq; /* For writing to replacement */

		rcu_read_lock();
		rrdev = rcu_dereference(conf->disks[i].replacement);
		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev) {
			rdev = rrdev;
			rrdev = NULL;
		}
		if (op_is_write(op)) {
			if (replace_only)
				rdev = NULL;
			if (rdev == rrdev)
				/* We raced and saw duplicates */
				rrdev = NULL;
		} else {
			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
				rdev = rrdev;
			rrdev = NULL;
		}

		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		if (rrdev && test_bit(Faulty, &rrdev->flags))
			rrdev = NULL;
		if (rrdev)
			atomic_inc(&rrdev->nr_pending);
		rcu_read_unlock();

		/* We have already checked bad blocks for reads.  Now
		 * need to check for writes.  We never accept write errors
		 * on the replacement, so we don't need to check rrdev.
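		 *
		 * is_badblock() returns 0 when the range is clean, 1 when
		 * it overlaps only acknowledged bad blocks (the write is
		 * simply skipped), and -1 when an unacknowledged bad block
		 * is present, in which case we must wait until the bad
		 * block log has been written out.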
		 */
		while (op_is_write(op) && rdev &&
		       test_bit(WriteErrorSeen, &rdev->flags)) {
			sector_t first_bad;
			int bad_sectors;
			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
					      &first_bad, &bad_sectors);
			if (!bad)
				break;

			if (bad < 0) {
				set_bit(BlockedBadBlocks, &rdev->flags);
				if (!conf->mddev->external &&
				    conf->mddev->sb_flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
					 * a chance */
					md_check_recovery(conf->mddev);
				}
				/*
				 * Because md_wait_for_blocked_rdev
				 * will dec nr_pending, we must
				 * increment it first.
				 */
				atomic_inc(&rdev->nr_pending);
				md_wait_for_blocked_rdev(rdev, conf->mddev);
			} else {
				/* Acknowledged bad block - skip the write */
				rdev_dec_pending(rdev, conf->mddev);
				rdev = NULL;
			}
		}

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			bio_set_op_attrs(bi, op, op_flags);
			bi->bi_end_io = op_is_write(op)
				? raid5_end_write_request
				: raid5_end_read_request;
			bi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on disc %d\n",
				 __func__, (unsigned long long)sh->sector,
				 bi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->new_data_offset);
			else
				bi->bi_iter.bi_sector = (sh->sector
						 + rdev->data_offset);
			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
				bi->bi_opf |= REQ_NOMERGE;

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_iter.bi_size = STRIPE_SIZE;
			/*
			 * If this is a discard request, set bi_vcnt 0.  We don't
			 * want to confuse SCSI because SCSI will replace the payload.
			 */
			if (op == REQ_OP_DISCARD)
				bi->bi_vcnt = 0;
			if (rrdev)
				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);

			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
						      bi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(bi);
		}
		if (rrdev) {
			if (s->syncing || s->expanding || s->expanded
			    || s->replacing)
				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			rbi->bi_bdev = rrdev->bdev;
			bio_set_op_attrs(rbi, op, op_flags);
			BUG_ON(!op_is_write(op));
			rbi->bi_end_io = raid5_end_write_request;
			rbi->bi_private = sh;

			pr_debug("%s: for %llu schedule op %d on "
				 "replacement disc %d\n",
				 __func__, (unsigned long long)sh->sector,
				 rbi->bi_opf, i);
			atomic_inc(&sh->count);
			if (sh != head_sh)
				atomic_inc(&head_sh->count);
			if (use_new_offset(conf, sh))
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->new_data_offset);
			else
				rbi->bi_iter.bi_sector = (sh->sector
						  + rrdev->data_offset);
			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
			sh->dev[i].rvec.bv_page = sh->dev[i].page;
			rbi->bi_vcnt = 1;
			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			rbi->bi_io_vec[0].bv_offset = 0;
			rbi->bi_iter.bi_size = STRIPE_SIZE;
			/*
			 * If this is a discard request, set bi_vcnt 0.  We don't
			 * want to confuse SCSI because SCSI will replace the payload.
			 */
			if (op == REQ_OP_DISCARD)
				rbi->bi_vcnt = 0;
			if (conf->mddev->gendisk)
				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
						      rbi, disk_devt(conf->mddev->gendisk),
						      sh->dev[i].sector);
			generic_make_request(rbi);
		}
		if (!rdev && !rrdev) {
			if (op_is_write(op))
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %d on disc %d for sector %llu\n",
				 bi->bi_opf, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}

		if (!head_sh->batch_head)
			continue;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		if (sh != head_sh)
			goto again;
	}
}

static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page **page,
		sector_t sector, struct dma_async_tx_descriptor *tx,
		struct stripe_head *sh, int no_skipcopy)
{
	struct bio_vec bvl;
	struct bvec_iter iter;
	struct page *bio_page;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_iter.bi_sector >= sector)
		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, iter) {
		int len = bvl.bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl.bv_offset;
			bio_page = bvl.bv_page;
			if (frombio) {
				if (sh->raid_conf->skip_copy &&
				    b_offset == 0 && page_offset == 0 &&
				    clen == STRIPE_SIZE &&
				    !no_skipcopy)
					*page = bio_page;
				else
					tx = async_memcpy(*page, bio_page, page_offset,
							  b_offset, clen, &submit);
			} else
				tx = async_memcpy(bio_page, *page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio_list return_bi = BIO_EMPTY_LIST;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	/* clear completed biofills */
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_iter.bi_sector <
			       dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_active_stripes(rbi))
					bio_list_add(&return_bi, rbi);
				rbi = rbi2;
			}
		}
	}
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(&return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	struct async_submit_ctl submit;
	int i;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&sh->stripe_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&sh->stripe_lock);
			while (rbi && rbi->bi_iter.bi_sector <
			       dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, &dev->page,
						     dev->sector, tx, sh, 0);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}
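
/*
 * One scribble region, as sized by scribble_alloc() below (layout shown
 * for illustration):
 *
 *	[ struct page * x (disks + 2) | addr_conv_t x (disks + 2) ]
 *	  ^ to_addr_page()              ^ to_addr_conv()
 */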

/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu, int i)
{
	void *addr;

	addr = flex_array_get(percpu->scribble, i);
	return addr + sizeof(struct page *) * (sh->disks + 2);
}

/* return a pointer to the page list region of the scribble buffer */
static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
{
	void *addr;

	addr = flex_array_get(percpu->scribble, i);
	return addr;
}

static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	BUG_ON(sh->batch_head);

	pr_debug("%s: stripe %llu block: %d\n",
		 __func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1].
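 *
 * For example (illustrative): a 6-device md-layout array has
 * syndrome_disks == 4, so the four data pages land in srcs[0..3],
 * P in srcs[4] and Q in srcs[5]; slots that do not match @srctype
 * stay NULL.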
 */
static int set_syndrome_sources(struct page **srcs,
				struct stripe_head *sh,
				int srctype)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
		struct r5dev *dev = &sh->dev[i];

		if (i == sh->qd_idx || i == sh->pd_idx ||
		    (srctype == SYNDROME_SRC_ALL) ||
		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
		     (test_bit(R5_Wantdrain, &dev->flags) ||
		      test_bit(R5_InJournal, &dev->flags))) ||
		    (srctype == SYNDROME_SRC_WRITTEN &&
		     dev->written)) {
			if (test_bit(R5_InJournal, &dev->flags))
				srcs[slot] = sh->dev[i].orig_page;
			else
				srcs[slot] = sh->dev[i].page;
		}
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = to_addr_page(percpu, 0);
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	BUG_ON(sh->batch_head);
	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		 __func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = to_addr_page(percpu, 0);
	struct async_submit_ctl submit;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu, 0));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu, 0));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}

static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	if (r5c_is_writeback(sh->raid_conf->log))
		/*
		 * raid5-cache write back uses orig_page during prexor.
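		 * (dev->page may already hold new data cached in the
		 * journal; the on-disk contents needed for the prexor
		 * subtraction live in orig_page, see ops_run_prexor5().)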
		 * After prexor, it is time to free orig_page
		 */
		r5c_release_extra_page(sh);
}

static struct dma_async_tx_descriptor *
ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	BUG_ON(sh->batch_head);
	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_InJournal, &dev->flags))
			xor_srcs[count++] = dev->orig_page;
		else if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	struct page **blocks = to_addr_page(percpu, 0);
	int count;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	struct r5conf *conf = sh->raid_conf;
	int disks = sh->disks;
	int i;
	struct stripe_head *head_sh = sh;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev;
		struct bio *chosen;

		sh = head_sh;
		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
			struct bio *wbi;

again:
			dev = &sh->dev[i];
			/*
			 * clear R5_InJournal, so when rewriting a page in
			 * journal, it is not skipped by r5l_log_stripe()
			 */
			clear_bit(R5_InJournal, &dev->flags);
			spin_lock_irq(&sh->stripe_lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			sh->overwrite_disks = 0;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock_irq(&sh->stripe_lock);
			WARN_ON(dev->page != dev->orig_page);

			while (wbi && wbi->bi_iter.bi_sector <
			       dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_opf & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				if (wbi->bi_opf & REQ_SYNC)
					set_bit(R5_SyncIO, &dev->flags);
				if (bio_op(wbi) == REQ_OP_DISCARD)
					set_bit(R5_Discard, &dev->flags);
				else {
					tx = async_copy_data(1, wbi, &dev->page,
							     dev->sector, tx, sh,
							     r5c_is_writeback(conf->log));
					if (dev->page != dev->orig_page &&
					    !r5c_is_writeback(conf->log)) {
						set_bit(R5_SkipCopy, &dev->flags);
						clear_bit(R5_UPTODATE, &dev->flags);
						clear_bit(R5_OVERWRITE, &dev->flags);
					}
				}
				wbi = r5_next_bio(wbi, dev->sector);
			}

			if (head_sh->batch_head) {
				sh = list_first_entry(&sh->batch_list,
						      struct stripe_head,
						      batch_list);
				if (sh == head_sh)
					continue;
				goto again;
			}
		}
	}

	return tx;
}

static void ops_complete_reconstruct(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	int i;
	bool fua = false, sync = false, discard = false;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
	}

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		if (dev->written || i == pd_idx || i == qd_idx) {
			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
				set_bit(R5_UPTODATE, &dev->flags);
			if (fua)
				set_bit(R5_WantFUA, &dev->flags);
			if (sync)
				set_bit(R5_SyncIO, &dev->flags);
		}
	}

	if (sh->reconstruct_state == reconstruct_state_drain_run)
		sh->reconstruct_state = reconstruct_state_drain_result;
	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
	else {
		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
		sh->reconstruct_state = reconstruct_state_result;
	}

	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void
ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs;
	struct async_submit_ctl submit;
	int count, pd_idx = sh->pd_idx, i;
	struct page *xor_dest;
	int prexor = 0;
	unsigned long flags;
	int j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (pd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}
again:
	count = 0;
	xor_srcs = to_addr_page(percpu, j);
	/* check if prexor is active: if so, only process blocks that are
	 * part of a read-modify-write (i.e. were written)
	 */
	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (head_sh->dev[i].written ||
			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
				xor_srcs[count++] = dev->page;
		}
	} else {
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
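	/*
	 * The batch list is a circular list with head_sh as its head; the
	 * tail is reached when the entry after the current stripe is
	 * head_sh again (or when there is no batch at all). Only that
	 * last submission carries the completion callback.
	 */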
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;
	if (last_stripe) {
		flags = ASYNC_TX_ACK |
			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

		atomic_inc(&head_sh->count);
		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
				  to_addr_conv(sh, percpu, j));
	} else {
		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
		init_async_submit(&submit, flags, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	}

	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks;
	int count, i, j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;
	int synflags;
	unsigned long txflags;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	for (i = 0; i < sh->disks; i++) {
		if (sh->pd_idx == i || sh->qd_idx == i)
			continue;
		if (!test_bit(R5_Discard, &sh->dev[i].flags))
			break;
	}
	if (i >= sh->disks) {
		atomic_inc(&sh->count);
		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
		ops_complete_reconstruct(sh);
		return;
	}

again:
	blocks = to_addr_page(percpu, j);

	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		synflags = SYNDROME_SRC_WRITTEN;
		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
	} else {
		synflags = SYNDROME_SRC_ALL;
		txflags = ASYNC_TX_ACK;
	}

	count = set_syndrome_sources(blocks, sh, synflags);
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;

	if (last_stripe) {
		atomic_inc(&head_sh->count);
		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
				  head_sh, to_addr_conv(sh, percpu, j));
	} else
		init_async_submit(&submit, 0, tx, NULL, NULL,
				  to_addr_conv(sh, percpu, j));
	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	if (!last_stripe) {
		j++;
		sh = list_first_entry(&sh->batch_list, struct stripe_head,
				      batch_list);
		goto again;
	}
}

static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	raid5_release_stripe(sh);
}

static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = to_addr_page(percpu, 0);
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		 (unsigned long long)sh->sector);

	BUG_ON(sh->batch_head);
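	/* XOR P together with all data blocks; a consistent stripe XORs
	 * to zero, which async_xor_val() reports in zero_sum_result.
	 */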
xor_dest = sh->dev[pd_idx].page; 1904 xor_srcs[count++] = xor_dest; 1905 for (i = disks; i--; ) { 1906 if (i == pd_idx || i == qd_idx) 1907 continue; 1908 xor_srcs[count++] = sh->dev[i].page; 1909 } 1910 1911 init_async_submit(&submit, 0, NULL, NULL, NULL, 1912 to_addr_conv(sh, percpu, 0)); 1913 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1914 &sh->ops.zero_sum_result, &submit); 1915 1916 atomic_inc(&sh->count); 1917 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1918 tx = async_trigger_callback(&submit); 1919 } 1920 1921 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1922 { 1923 struct page **srcs = to_addr_page(percpu, 0); 1924 struct async_submit_ctl submit; 1925 int count; 1926 1927 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1928 (unsigned long long)sh->sector, checkp); 1929 1930 BUG_ON(sh->batch_head); 1931 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 1932 if (!checkp) 1933 srcs[count] = NULL; 1934 1935 atomic_inc(&sh->count); 1936 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1937 sh, to_addr_conv(sh, percpu, 0)); 1938 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1939 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1940 } 1941 1942 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1943 { 1944 int overlap_clear = 0, i, disks = sh->disks; 1945 struct dma_async_tx_descriptor *tx = NULL; 1946 struct r5conf *conf = sh->raid_conf; 1947 int level = conf->level; 1948 struct raid5_percpu *percpu; 1949 unsigned long cpu; 1950 1951 cpu = get_cpu(); 1952 percpu = per_cpu_ptr(conf->percpu, cpu); 1953 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1954 ops_run_biofill(sh); 1955 overlap_clear++; 1956 } 1957 1958 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1959 if (level < 6) 1960 tx = ops_run_compute5(sh, percpu); 1961 else { 1962 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1963 tx = ops_run_compute6_1(sh, percpu); 1964 else 1965 tx = ops_run_compute6_2(sh, percpu); 1966 } 1967 /* terminate the chain if reconstruct is not set to be run */ 1968 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1969 async_tx_ack(tx); 1970 } 1971 1972 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 1973 if (level < 6) 1974 tx = ops_run_prexor5(sh, percpu, tx); 1975 else 1976 tx = ops_run_prexor6(sh, percpu, tx); 1977 } 1978 1979 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1980 tx = ops_run_biodrain(sh, tx); 1981 overlap_clear++; 1982 } 1983 1984 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1985 if (level < 6) 1986 ops_run_reconstruct5(sh, percpu, tx); 1987 else 1988 ops_run_reconstruct6(sh, percpu, tx); 1989 } 1990 1991 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1992 if (sh->check_state == check_state_run) 1993 ops_run_check_p(sh, percpu); 1994 else if (sh->check_state == check_state_run_q) 1995 ops_run_check_pq(sh, percpu, 0); 1996 else if (sh->check_state == check_state_run_pq) 1997 ops_run_check_pq(sh, percpu, 1); 1998 else 1999 BUG(); 2000 } 2001 2002 if (overlap_clear && !sh->batch_head) 2003 for (i = disks; i--; ) { 2004 struct r5dev *dev = &sh->dev[i]; 2005 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2006 wake_up(&sh->raid_conf->wait_for_overlap); 2007 } 2008 put_cpu(); 2009 } 2010 2011 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2012 int disks) 2013 { 2014 struct stripe_head *sh; 2015 int i; 2016 2017 sh = kmem_cache_zalloc(sc, gfp); 2018 if (sh) { 2019 
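/*
 * Annotation added for clarity (not in the original source): the
 * stripe_head arrives zeroed from kmem_cache_zalloc(), so only fields
 * that need non-zero starting values are set up below: the two
 * spinlocks, the list heads, a reference count of one, log_start, and
 * an embedded bio plus bio_vec pair per device for normal and
 * replacement requests.
 */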
spin_lock_init(&sh->stripe_lock); 2020 spin_lock_init(&sh->batch_lock); 2021 INIT_LIST_HEAD(&sh->batch_list); 2022 INIT_LIST_HEAD(&sh->lru); 2023 INIT_LIST_HEAD(&sh->r5c); 2024 INIT_LIST_HEAD(&sh->log_list); 2025 atomic_set(&sh->count, 1); 2026 sh->log_start = MaxSector; 2027 for (i = 0; i < disks; i++) { 2028 struct r5dev *dev = &sh->dev[i]; 2029 2030 bio_init(&dev->req, &dev->vec, 1); 2031 bio_init(&dev->rreq, &dev->rvec, 1); 2032 } 2033 } 2034 return sh; 2035 } 2036 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2037 { 2038 struct stripe_head *sh; 2039 2040 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); 2041 if (!sh) 2042 return 0; 2043 2044 sh->raid_conf = conf; 2045 2046 if (grow_buffers(sh, gfp)) { 2047 shrink_buffers(sh); 2048 kmem_cache_free(conf->slab_cache, sh); 2049 return 0; 2050 } 2051 sh->hash_lock_index = 2052 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2053 /* we just created an active stripe so... */ 2054 atomic_inc(&conf->active_stripes); 2055 2056 raid5_release_stripe(sh); 2057 conf->max_nr_stripes++; 2058 return 1; 2059 } 2060 2061 static int grow_stripes(struct r5conf *conf, int num) 2062 { 2063 struct kmem_cache *sc; 2064 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2065 2066 if (conf->mddev->gendisk) 2067 sprintf(conf->cache_name[0], 2068 "raid%d-%s", conf->level, mdname(conf->mddev)); 2069 else 2070 sprintf(conf->cache_name[0], 2071 "raid%d-%p", conf->level, conf->mddev); 2072 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2073 2074 conf->active_name = 0; 2075 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2076 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2077 0, 0, NULL); 2078 if (!sc) 2079 return 1; 2080 conf->slab_cache = sc; 2081 conf->pool_size = devs; 2082 while (num--) 2083 if (!grow_one_stripe(conf, GFP_KERNEL)) 2084 return 1; 2085 2086 return 0; 2087 } 2088 2089 /** 2090 * scribble_len - return the required size of the scribble region 2091 * @num - total number of disks in the array 2092 * 2093 * The size must be enough to contain: 2094 * 1/ a struct page pointer for each device in the array +2 2095 * 2/ room to convert each entry in (1) to its corresponding dma 2096 * (dma_map_page()) or page (page_address()) address. 2097 * 2098 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2099 * calculate over all devices (not just the data blocks), using zeros in place 2100 * of the P and Q blocks. 2101 */ 2102 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2103 { 2104 struct flex_array *ret; 2105 size_t len; 2106 2107 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2108 ret = flex_array_alloc(len, cnt, flags); 2109 if (!ret) 2110 return NULL; 2111 /* always prealloc all elements, so no locking is required */ 2112 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2113 flex_array_free(ret); 2114 return NULL; 2115 } 2116 return ret; 2117 } 2118 2119 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2120 { 2121 unsigned long cpu; 2122 int err = 0; 2123 2124 /* 2125 * Never shrink. And mddev_suspend() could deadlock if this is called 2126 * from raid5d. 
In that case, scribble_disks and scribble_sectors 2127 * should equal new_disks and new_sectors 2128 */ 2129 if (conf->scribble_disks >= new_disks && 2130 conf->scribble_sectors >= new_sectors) 2131 return 0; 2132 mddev_suspend(conf->mddev); 2133 get_online_cpus(); 2134 for_each_present_cpu(cpu) { 2135 struct raid5_percpu *percpu; 2136 struct flex_array *scribble; 2137 2138 percpu = per_cpu_ptr(conf->percpu, cpu); 2139 scribble = scribble_alloc(new_disks, 2140 new_sectors / STRIPE_SECTORS, 2141 GFP_NOIO); 2142 2143 if (scribble) { 2144 flex_array_free(percpu->scribble); 2145 percpu->scribble = scribble; 2146 } else { 2147 err = -ENOMEM; 2148 break; 2149 } 2150 } 2151 put_online_cpus(); 2152 mddev_resume(conf->mddev); 2153 if (!err) { 2154 conf->scribble_disks = new_disks; 2155 conf->scribble_sectors = new_sectors; 2156 } 2157 return err; 2158 } 2159 2160 static int resize_stripes(struct r5conf *conf, int newsize) 2161 { 2162 /* Make all the stripes able to hold 'newsize' devices. 2163 * New slots in each stripe get 'page' set to a new page. 2164 * 2165 * This happens in stages: 2166 * 1/ create a new kmem_cache and allocate the required number of 2167 * stripe_heads. 2168 * 2/ gather all the old stripe_heads and transfer the pages across 2169 * to the new stripe_heads. This will have the side effect of 2170 * freezing the array as once all stripe_heads have been collected, 2171 * no IO will be possible. Old stripe heads are freed once their 2172 * pages have been transferred over, and the old kmem_cache is 2173 * freed when all stripes are done. 2174 * 3/ reallocate conf->disks to be suitably bigger. If this fails, 2175 * we simply return a failure status - no need to clean anything up. 2176 * 4/ allocate new pages for the new slots in the new stripe_heads. 2177 * If this fails, we don't bother trying to shrink the 2178 * stripe_heads down again, we just leave them as they are. 2179 * As each stripe_head is processed the new one is released into 2180 * active service. 2181 * 2182 * Once step2 is started, we cannot afford to wait for a write, 2183 * so we use GFP_NOIO allocations. 2184 */ 2185 struct stripe_head *osh, *nsh; 2186 LIST_HEAD(newstripes); 2187 struct disk_info *ndisks; 2188 int err; 2189 struct kmem_cache *sc; 2190 int i; 2191 int hash, cnt; 2192 2193 if (newsize <= conf->pool_size) 2194 return 0; /* never bother to shrink */ 2195 2196 err = md_allow_write(conf->mddev); 2197 if (err) 2198 return err; 2199 2200 /* Step 1 */ 2201 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2202 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2203 0, 0, NULL); 2204 if (!sc) 2205 return -ENOMEM; 2206 2207 /* Need to ensure auto-resizing doesn't interfere */ 2208 mutex_lock(&conf->cache_size_mutex); 2209 2210 for (i = conf->max_nr_stripes; i; i--) { 2211 nsh = alloc_stripe(sc, GFP_KERNEL, newsize); 2212 if (!nsh) 2213 break; 2214 2215 nsh->raid_conf = conf; 2216 list_add(&nsh->lru, &newstripes); 2217 } 2218 if (i) { 2219 /* didn't get enough, give up */ 2220 while (!list_empty(&newstripes)) { 2221 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2222 list_del(&nsh->lru); 2223 kmem_cache_free(sc, nsh); 2224 } 2225 kmem_cache_destroy(sc); 2226 mutex_unlock(&conf->cache_size_mutex); 2227 return -ENOMEM; 2228 }
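/*
 * Worked example added for clarity (not in the original source; the
 * figures are invented): step 2 below parcels the old stripes out
 * across the hash buckets, giving bucket 'hash' a quota of
 * max_nr_stripes / NR_STRIPE_HASH_LOCKS stripes plus one more while
 * hash < max_nr_stripes % NR_STRIPE_HASH_LOCKS.  Assuming
 * NR_STRIPE_HASH_LOCKS == 8 and max_nr_stripes == 20, buckets 0-3 take
 * 3 stripes each and buckets 4-7 take 2, i.e. 4*3 + 4*2 == 20.  The
 * GFP_NOIO requirement is real: once inactive stripes are being held
 * the array cannot make progress, so an allocation that recursed into
 * writeback against this array could deadlock.
 */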
2229 /* Step 2 - Must use GFP_NOIO now. 2230 * OK, we have enough stripes, start collecting inactive 2231 * stripes and copying them over 2232 */ 2233 hash = 0; 2234 cnt = 0; 2235 list_for_each_entry(nsh, &newstripes, lru) { 2236 lock_device_hash_lock(conf, hash); 2237 wait_event_cmd(conf->wait_for_stripe, 2238 !list_empty(conf->inactive_list + hash), 2239 unlock_device_hash_lock(conf, hash), 2240 lock_device_hash_lock(conf, hash)); 2241 osh = get_free_stripe(conf, hash); 2242 unlock_device_hash_lock(conf, hash); 2243 2244 for(i=0; i<conf->pool_size; i++) { 2245 nsh->dev[i].page = osh->dev[i].page; 2246 nsh->dev[i].orig_page = osh->dev[i].page; 2247 } 2248 nsh->hash_lock_index = hash; 2249 kmem_cache_free(conf->slab_cache, osh); 2250 cnt++; 2251 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2252 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2253 hash++; 2254 cnt = 0; 2255 } 2256 } 2257 kmem_cache_destroy(conf->slab_cache); 2258 2259 /* Step 3. 2260 * At this point, we are holding all the stripes so the array 2261 * is completely stalled, so now is a good time to resize 2262 * conf->disks and the scribble region 2263 */ 2264 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2265 if (ndisks) { 2266 for (i = 0; i < conf->pool_size; i++) 2267 ndisks[i] = conf->disks[i]; 2268 2269 for (i = conf->pool_size; i < newsize; i++) { 2270 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2271 if (!ndisks[i].extra_page) 2272 err = -ENOMEM; 2273 } 2274 2275 if (err) { 2276 for (i = conf->pool_size; i < newsize; i++) 2277 if (ndisks[i].extra_page) 2278 put_page(ndisks[i].extra_page); 2279 kfree(ndisks); 2280 } else { 2281 kfree(conf->disks); 2282 conf->disks = ndisks; 2283 } 2284 } else 2285 err = -ENOMEM; 2286 2287 mutex_unlock(&conf->cache_size_mutex); 2288 /* Step 4, return new stripes to service */ 2289 while(!list_empty(&newstripes)) { 2290 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2291 list_del_init(&nsh->lru); 2292 2293 for (i=conf->raid_disks; i < newsize; i++) 2294 if (nsh->dev[i].page == NULL) { 2295 struct page *p = alloc_page(GFP_NOIO); 2296 nsh->dev[i].page = p; 2297 nsh->dev[i].orig_page = p; 2298 if (!p) 2299 err = -ENOMEM; 2300 } 2301 raid5_release_stripe(nsh); 2302 } 2303 /* critical section passed, GFP_NOIO no longer needed */ 2304 2305 conf->slab_cache = sc; 2306 conf->active_name = 1-conf->active_name; 2307 if (!err) 2308 conf->pool_size = newsize; 2309 return err; 2310 } 2311 2312 static int drop_one_stripe(struct r5conf *conf) 2313 { 2314 struct stripe_head *sh; 2315 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2316 2317 spin_lock_irq(conf->hash_locks + hash); 2318 sh = get_free_stripe(conf, hash); 2319 spin_unlock_irq(conf->hash_locks + hash); 2320 if (!sh) 2321 return 0; 2322 BUG_ON(atomic_read(&sh->count)); 2323 shrink_buffers(sh); 2324 kmem_cache_free(conf->slab_cache, sh); 2325 atomic_dec(&conf->active_stripes); 2326 conf->max_nr_stripes--; 2327 return 1; 2328 } 2329 2330 static void shrink_stripes(struct r5conf *conf) 2331 { 2332 while (conf->max_nr_stripes && 2333 drop_one_stripe(conf)) 2334 ; 2335 2336 kmem_cache_destroy(conf->slab_cache); 2337 conf->slab_cache = NULL; 2338 } 2339 2340 static void raid5_end_read_request(struct bio * bi) 2341 { 2342 struct stripe_head *sh = bi->bi_private; 2343 struct r5conf *conf = sh->raid_conf; 2344 int disks = sh->disks, i; 2345 char b[BDEVNAME_SIZE]; 2346 struct md_rdev *rdev = NULL; 2347 sector_t s; 2348 2349 for (i=0 ; i<disks; i++) 2350 if (bi == &sh->dev[i].req) 2351 break; 2352 2353
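/*
 * Annotation added for clarity (not in the original source): the loop
 * above matched the completed bio against each device's embedded read
 * request, so 'i' now names the member device this completion belongs
 * to; i == disks means no owner was found, which is treated as a bug
 * just below.
 */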
pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2354 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2355 bi->bi_error); 2356 if (i == disks) { 2357 bio_reset(bi); 2358 BUG(); 2359 return; 2360 } 2361 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2362 /* If replacement finished while this request was outstanding, 2363 * 'replacement' might be NULL already. 2364 * In that case it moved down to 'rdev'. 2365 * rdev is not removed until all requests are finished. 2366 */ 2367 rdev = conf->disks[i].replacement; 2368 if (!rdev) 2369 rdev = conf->disks[i].rdev; 2370 2371 if (use_new_offset(conf, sh)) 2372 s = sh->sector + rdev->new_data_offset; 2373 else 2374 s = sh->sector + rdev->data_offset; 2375 if (!bi->bi_error) { 2376 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2377 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2378 /* Note that this cannot happen on a 2379 * replacement device. We just fail those on 2380 * any error 2381 */ 2382 pr_info_ratelimited( 2383 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2384 mdname(conf->mddev), STRIPE_SECTORS, 2385 (unsigned long long)s, 2386 bdevname(rdev->bdev, b)); 2387 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2388 clear_bit(R5_ReadError, &sh->dev[i].flags); 2389 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2390 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2391 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2392 2393 if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2394 /* 2395 * end read for a page in journal, this 2396 * must be preparing for prexor in rmw 2397 */ 2398 set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2399 2400 if (atomic_read(&rdev->read_errors)) 2401 atomic_set(&rdev->read_errors, 0); 2402 } else { 2403 const char *bdn = bdevname(rdev->bdev, b); 2404 int retry = 0; 2405 int set_bad = 0; 2406 2407 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2408 atomic_inc(&rdev->read_errors); 2409 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2410 pr_warn_ratelimited( 2411 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2412 mdname(conf->mddev), 2413 (unsigned long long)s, 2414 bdn); 2415 else if (conf->mddev->degraded >= conf->max_degraded) { 2416 set_bad = 1; 2417 pr_warn_ratelimited( 2418 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2419 mdname(conf->mddev), 2420 (unsigned long long)s, 2421 bdn); 2422 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2423 /* Oh, no!!! */ 2424 set_bad = 1; 2425 pr_warn_ratelimited( 2426 "md/raid:%s: read error NOT corrected!! 
(sector %llu on %s).\n", 2427 mdname(conf->mddev), 2428 (unsigned long long)s, 2429 bdn); 2430 } else if (atomic_read(&rdev->read_errors) 2431 > conf->max_nr_stripes) 2432 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2433 mdname(conf->mddev), bdn); 2434 else 2435 retry = 1; 2436 if (set_bad && test_bit(In_sync, &rdev->flags) 2437 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2438 retry = 1; 2439 if (retry) 2440 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2441 set_bit(R5_ReadError, &sh->dev[i].flags); 2442 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2443 } else 2444 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2445 else { 2446 clear_bit(R5_ReadError, &sh->dev[i].flags); 2447 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2448 if (!(set_bad 2449 && test_bit(In_sync, &rdev->flags) 2450 && rdev_set_badblocks( 2451 rdev, sh->sector, STRIPE_SECTORS, 0))) 2452 md_error(conf->mddev, rdev); 2453 } 2454 } 2455 rdev_dec_pending(rdev, conf->mddev); 2456 bio_reset(bi); 2457 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2458 set_bit(STRIPE_HANDLE, &sh->state); 2459 raid5_release_stripe(sh); 2460 } 2461 2462 static void raid5_end_write_request(struct bio *bi) 2463 { 2464 struct stripe_head *sh = bi->bi_private; 2465 struct r5conf *conf = sh->raid_conf; 2466 int disks = sh->disks, i; 2467 struct md_rdev *uninitialized_var(rdev); 2468 sector_t first_bad; 2469 int bad_sectors; 2470 int replacement = 0; 2471 2472 for (i = 0 ; i < disks; i++) { 2473 if (bi == &sh->dev[i].req) { 2474 rdev = conf->disks[i].rdev; 2475 break; 2476 } 2477 if (bi == &sh->dev[i].rreq) { 2478 rdev = conf->disks[i].replacement; 2479 if (rdev) 2480 replacement = 1; 2481 else 2482 /* rdev was removed and 'replacement' 2483 * replaced it. rdev is not removed 2484 * until all requests are finished. 2485 */ 2486 rdev = conf->disks[i].rdev; 2487 break; 2488 } 2489 } 2490 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2491 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2492 bi->bi_error); 2493 if (i == disks) { 2494 bio_reset(bi); 2495 BUG(); 2496 return; 2497 } 2498 2499 if (replacement) { 2500 if (bi->bi_error) 2501 md_error(conf->mddev, rdev); 2502 else if (is_badblock(rdev, sh->sector, 2503 STRIPE_SECTORS, 2504 &first_bad, &bad_sectors)) 2505 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2506 } else { 2507 if (bi->bi_error) { 2508 set_bit(STRIPE_DEGRADED, &sh->state); 2509 set_bit(WriteErrorSeen, &rdev->flags); 2510 set_bit(R5_WriteError, &sh->dev[i].flags); 2511 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2512 set_bit(MD_RECOVERY_NEEDED, 2513 &rdev->mddev->recovery); 2514 } else if (is_badblock(rdev, sh->sector, 2515 STRIPE_SECTORS, 2516 &first_bad, &bad_sectors)) { 2517 set_bit(R5_MadeGood, &sh->dev[i].flags); 2518 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2519 /* That was a successful write so make 2520 * sure it looks like we already did 2521 * a re-write. 
2522 */ 2523 set_bit(R5_ReWrite, &sh->dev[i].flags); 2524 } 2525 } 2526 rdev_dec_pending(rdev, conf->mddev); 2527 2528 if (sh->batch_head && bi->bi_error && !replacement) 2529 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2530 2531 bio_reset(bi); 2532 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2533 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2534 set_bit(STRIPE_HANDLE, &sh->state); 2535 raid5_release_stripe(sh); 2536 2537 if (sh->batch_head && sh != sh->batch_head) 2538 raid5_release_stripe(sh->batch_head); 2539 } 2540 2541 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2542 { 2543 struct r5dev *dev = &sh->dev[i]; 2544 2545 dev->flags = 0; 2546 dev->sector = raid5_compute_blocknr(sh, i, previous); 2547 } 2548 2549 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2550 { 2551 char b[BDEVNAME_SIZE]; 2552 struct r5conf *conf = mddev->private; 2553 unsigned long flags; 2554 pr_debug("raid456: error called\n"); 2555 2556 spin_lock_irqsave(&conf->device_lock, flags); 2557 clear_bit(In_sync, &rdev->flags); 2558 mddev->degraded = raid5_calc_degraded(conf); 2559 spin_unlock_irqrestore(&conf->device_lock, flags); 2560 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2561 2562 set_bit(Blocked, &rdev->flags); 2563 set_bit(Faulty, &rdev->flags); 2564 set_mask_bits(&mddev->sb_flags, 0, 2565 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2566 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2567 "md/raid:%s: Operation continuing on %d devices.\n", 2568 mdname(mddev), 2569 bdevname(rdev->bdev, b), 2570 mdname(mddev), 2571 conf->raid_disks - mddev->degraded); 2572 r5c_update_on_rdev_error(mddev); 2573 } 2574 2575 /* 2576 * Input: a 'big' sector number, 2577 * Output: index of the data and parity disk, and the sector # in them. 2578 */ 2579 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2580 int previous, int *dd_idx, 2581 struct stripe_head *sh) 2582 { 2583 sector_t stripe, stripe2; 2584 sector_t chunk_number; 2585 unsigned int chunk_offset; 2586 int pd_idx, qd_idx; 2587 int ddf_layout = 0; 2588 sector_t new_sector; 2589 int algorithm = previous ? conf->prev_algo 2590 : conf->algorithm; 2591 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2592 : conf->chunk_sectors; 2593 int raid_disks = previous ? conf->previous_raid_disks 2594 : conf->raid_disks; 2595 int data_disks = raid_disks - conf->max_degraded; 2596 2597 /* First compute the information on this sector */ 2598 2599 /* 2600 * Compute the chunk number and the sector offset inside the chunk 2601 */ 2602 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2603 chunk_number = r_sector; 2604 2605 /* 2606 * Compute the stripe number 2607 */ 2608 stripe = chunk_number; 2609 *dd_idx = sector_div(stripe, data_disks); 2610 stripe2 = stripe; 2611 /* 2612 * Select the parity disk based on the user selected algorithm. 
2613 */ 2614 pd_idx = qd_idx = -1; 2615 switch(conf->level) { 2616 case 4: 2617 pd_idx = data_disks; 2618 break; 2619 case 5: 2620 switch (algorithm) { 2621 case ALGORITHM_LEFT_ASYMMETRIC: 2622 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2623 if (*dd_idx >= pd_idx) 2624 (*dd_idx)++; 2625 break; 2626 case ALGORITHM_RIGHT_ASYMMETRIC: 2627 pd_idx = sector_div(stripe2, raid_disks); 2628 if (*dd_idx >= pd_idx) 2629 (*dd_idx)++; 2630 break; 2631 case ALGORITHM_LEFT_SYMMETRIC: 2632 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2633 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2634 break; 2635 case ALGORITHM_RIGHT_SYMMETRIC: 2636 pd_idx = sector_div(stripe2, raid_disks); 2637 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2638 break; 2639 case ALGORITHM_PARITY_0: 2640 pd_idx = 0; 2641 (*dd_idx)++; 2642 break; 2643 case ALGORITHM_PARITY_N: 2644 pd_idx = data_disks; 2645 break; 2646 default: 2647 BUG(); 2648 } 2649 break; 2650 case 6: 2651 2652 switch (algorithm) { 2653 case ALGORITHM_LEFT_ASYMMETRIC: 2654 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2655 qd_idx = pd_idx + 1; 2656 if (pd_idx == raid_disks-1) { 2657 (*dd_idx)++; /* Q D D D P */ 2658 qd_idx = 0; 2659 } else if (*dd_idx >= pd_idx) 2660 (*dd_idx) += 2; /* D D P Q D */ 2661 break; 2662 case ALGORITHM_RIGHT_ASYMMETRIC: 2663 pd_idx = sector_div(stripe2, raid_disks); 2664 qd_idx = pd_idx + 1; 2665 if (pd_idx == raid_disks-1) { 2666 (*dd_idx)++; /* Q D D D P */ 2667 qd_idx = 0; 2668 } else if (*dd_idx >= pd_idx) 2669 (*dd_idx) += 2; /* D D P Q D */ 2670 break; 2671 case ALGORITHM_LEFT_SYMMETRIC: 2672 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2673 qd_idx = (pd_idx + 1) % raid_disks; 2674 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2675 break; 2676 case ALGORITHM_RIGHT_SYMMETRIC: 2677 pd_idx = sector_div(stripe2, raid_disks); 2678 qd_idx = (pd_idx + 1) % raid_disks; 2679 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2680 break; 2681 2682 case ALGORITHM_PARITY_0: 2683 pd_idx = 0; 2684 qd_idx = 1; 2685 (*dd_idx) += 2; 2686 break; 2687 case ALGORITHM_PARITY_N: 2688 pd_idx = data_disks; 2689 qd_idx = data_disks + 1; 2690 break; 2691 2692 case ALGORITHM_ROTATING_ZERO_RESTART: 2693 /* Exactly the same as RIGHT_ASYMMETRIC, but or 2694 * of blocks for computing Q is different. 
2695 */ 2696 pd_idx = sector_div(stripe2, raid_disks); 2697 qd_idx = pd_idx + 1; 2698 if (pd_idx == raid_disks-1) { 2699 (*dd_idx)++; /* Q D D D P */ 2700 qd_idx = 0; 2701 } else if (*dd_idx >= pd_idx) 2702 (*dd_idx) += 2; /* D D P Q D */ 2703 ddf_layout = 1; 2704 break; 2705 2706 case ALGORITHM_ROTATING_N_RESTART: 2707 /* Same a left_asymmetric, by first stripe is 2708 * D D D P Q rather than 2709 * Q D D D P 2710 */ 2711 stripe2 += 1; 2712 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2713 qd_idx = pd_idx + 1; 2714 if (pd_idx == raid_disks-1) { 2715 (*dd_idx)++; /* Q D D D P */ 2716 qd_idx = 0; 2717 } else if (*dd_idx >= pd_idx) 2718 (*dd_idx) += 2; /* D D P Q D */ 2719 ddf_layout = 1; 2720 break; 2721 2722 case ALGORITHM_ROTATING_N_CONTINUE: 2723 /* Same as left_symmetric but Q is before P */ 2724 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2725 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2726 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2727 ddf_layout = 1; 2728 break; 2729 2730 case ALGORITHM_LEFT_ASYMMETRIC_6: 2731 /* RAID5 left_asymmetric, with Q on last device */ 2732 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2733 if (*dd_idx >= pd_idx) 2734 (*dd_idx)++; 2735 qd_idx = raid_disks - 1; 2736 break; 2737 2738 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2739 pd_idx = sector_div(stripe2, raid_disks-1); 2740 if (*dd_idx >= pd_idx) 2741 (*dd_idx)++; 2742 qd_idx = raid_disks - 1; 2743 break; 2744 2745 case ALGORITHM_LEFT_SYMMETRIC_6: 2746 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2747 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2748 qd_idx = raid_disks - 1; 2749 break; 2750 2751 case ALGORITHM_RIGHT_SYMMETRIC_6: 2752 pd_idx = sector_div(stripe2, raid_disks-1); 2753 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2754 qd_idx = raid_disks - 1; 2755 break; 2756 2757 case ALGORITHM_PARITY_0_6: 2758 pd_idx = 0; 2759 (*dd_idx)++; 2760 qd_idx = raid_disks - 1; 2761 break; 2762 2763 default: 2764 BUG(); 2765 } 2766 break; 2767 } 2768 2769 if (sh) { 2770 sh->pd_idx = pd_idx; 2771 sh->qd_idx = qd_idx; 2772 sh->ddf_layout = ddf_layout; 2773 } 2774 /* 2775 * Finally, compute the new sector number 2776 */ 2777 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2778 return new_sector; 2779 } 2780 2781 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2782 { 2783 struct r5conf *conf = sh->raid_conf; 2784 int raid_disks = sh->disks; 2785 int data_disks = raid_disks - conf->max_degraded; 2786 sector_t new_sector = sh->sector, check; 2787 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2788 : conf->chunk_sectors; 2789 int algorithm = previous ? 
2780 2781 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2782 { 2783 struct r5conf *conf = sh->raid_conf; 2784 int raid_disks = sh->disks; 2785 int data_disks = raid_disks - conf->max_degraded; 2786 sector_t new_sector = sh->sector, check; 2787 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2788 : conf->chunk_sectors; 2789 int algorithm = previous ? conf->prev_algo 2790 : conf->algorithm; 2791 sector_t stripe; 2792 int chunk_offset; 2793 sector_t chunk_number; 2794 int dummy1, dd_idx = i; 2795 sector_t r_sector; 2796 struct stripe_head sh2; 2797 2798 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2799 stripe = new_sector; 2800 2801 if (i == sh->pd_idx) 2802 return 0; 2803 switch(conf->level) { 2804 case 4: break; 2805 case 5: 2806 switch (algorithm) { 2807 case ALGORITHM_LEFT_ASYMMETRIC: 2808 case ALGORITHM_RIGHT_ASYMMETRIC: 2809 if (i > sh->pd_idx) 2810 i--; 2811 break; 2812 case ALGORITHM_LEFT_SYMMETRIC: 2813 case ALGORITHM_RIGHT_SYMMETRIC: 2814 if (i < sh->pd_idx) 2815 i += raid_disks; 2816 i -= (sh->pd_idx + 1); 2817 break; 2818 case ALGORITHM_PARITY_0: 2819 i -= 1; 2820 break; 2821 case ALGORITHM_PARITY_N: 2822 break; 2823 default: 2824 BUG(); 2825 } 2826 break; 2827 case 6: 2828 if (i == sh->qd_idx) 2829 return 0; /* It is the Q disk */ 2830 switch (algorithm) { 2831 case ALGORITHM_LEFT_ASYMMETRIC: 2832 case ALGORITHM_RIGHT_ASYMMETRIC: 2833 case ALGORITHM_ROTATING_ZERO_RESTART: 2834 case ALGORITHM_ROTATING_N_RESTART: 2835 if (sh->pd_idx == raid_disks-1) 2836 i--; /* Q D D D P */ 2837 else if (i > sh->pd_idx) 2838 i -= 2; /* D D P Q D */ 2839 break; 2840 case ALGORITHM_LEFT_SYMMETRIC: 2841 case ALGORITHM_RIGHT_SYMMETRIC: 2842 if (sh->pd_idx == raid_disks-1) 2843 i--; /* Q D D D P */ 2844 else { 2845 /* D D P Q D */ 2846 if (i < sh->pd_idx) 2847 i += raid_disks; 2848 i -= (sh->pd_idx + 2); 2849 } 2850 break; 2851 case ALGORITHM_PARITY_0: 2852 i -= 2; 2853 break; 2854 case ALGORITHM_PARITY_N: 2855 break; 2856 case ALGORITHM_ROTATING_N_CONTINUE: 2857 /* Like left_symmetric, but P is before Q */ 2858 if (sh->pd_idx == 0) 2859 i--; /* P D D D Q */ 2860 else { 2861 /* D D Q P D */ 2862 if (i < sh->pd_idx) 2863 i += raid_disks; 2864 i -= (sh->pd_idx + 1); 2865 } 2866 break; 2867 case ALGORITHM_LEFT_ASYMMETRIC_6: 2868 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2869 if (i > sh->pd_idx) 2870 i--; 2871 break; 2872 case ALGORITHM_LEFT_SYMMETRIC_6: 2873 case ALGORITHM_RIGHT_SYMMETRIC_6: 2874 if (i < sh->pd_idx) 2875 i += data_disks + 1; 2876 i -= (sh->pd_idx + 1); 2877 break; 2878 case ALGORITHM_PARITY_0_6: 2879 i -= 1; 2880 break; 2881 default: 2882 BUG(); 2883 } 2884 break; 2885 } 2886 2887 chunk_number = stripe * data_disks + i; 2888 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2889 2890 check = raid5_compute_sector(conf, r_sector, 2891 previous, &dummy1, &sh2); 2892 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2893 || sh2.qd_idx != sh->qd_idx) { 2894 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 2895 mdname(conf->mddev)); 2896 return 0; 2897 } 2898 return r_sector; 2899 } 2900 2901 /* 2902 * There are cases where we want handle_stripe_dirtying() and 2903 * schedule_reconstruction() to delay towrite to some dev of a stripe. 2904 * 2905 * This function checks whether we want to delay the towrite. Specifically, 2906 * we delay the towrite when: 2907 * 2908 * 1. degraded stripe has a non-overwrite to the missing dev, AND this 2909 * stripe has data in journal (for other devices). 2910 * 2911 * In this case, when reading data for the non-overwrite dev, it is 2912 * necessary to handle complex rmw of write back cache (prexor with 2913 * orig_page, and xor with page). To keep read path simple, we would 2914 * like to flush data in journal to RAID disks first, so complex rmw 2915 * is handled in the write path (handle_stripe_dirtying).
2916 * 2917 */ 2918 static inline bool delay_towrite(struct r5dev *dev, 2919 struct stripe_head_state *s) 2920 { 2921 return !test_bit(R5_OVERWRITE, &dev->flags) && 2922 !test_bit(R5_Insync, &dev->flags) && s->injournal; 2923 } 2924 2925 static void 2926 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2927 int rcw, int expand) 2928 { 2929 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 2930 struct r5conf *conf = sh->raid_conf; 2931 int level = conf->level; 2932 2933 if (rcw) { 2934 /* 2935 * In some cases, handle_stripe_dirtying initially decided to 2936 * run rmw and allocates extra page for prexor. However, rcw is 2937 * cheaper later on. We need to free the extra page now, 2938 * because we won't be able to do that in ops_complete_prexor(). 2939 */ 2940 r5c_release_extra_page(sh); 2941 2942 for (i = disks; i--; ) { 2943 struct r5dev *dev = &sh->dev[i]; 2944 2945 if (dev->towrite && !delay_towrite(dev, s)) { 2946 set_bit(R5_LOCKED, &dev->flags); 2947 set_bit(R5_Wantdrain, &dev->flags); 2948 if (!expand) 2949 clear_bit(R5_UPTODATE, &dev->flags); 2950 s->locked++; 2951 } else if (test_bit(R5_InJournal, &dev->flags)) { 2952 set_bit(R5_LOCKED, &dev->flags); 2953 s->locked++; 2954 } 2955 } 2956 /* if we are not expanding this is a proper write request, and 2957 * there will be bios with new data to be drained into the 2958 * stripe cache 2959 */ 2960 if (!expand) { 2961 if (!s->locked) 2962 /* False alarm, nothing to do */ 2963 return; 2964 sh->reconstruct_state = reconstruct_state_drain_run; 2965 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2966 } else 2967 sh->reconstruct_state = reconstruct_state_run; 2968 2969 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2970 2971 if (s->locked + conf->max_degraded == disks) 2972 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2973 atomic_inc(&conf->pending_full_writes); 2974 } else { 2975 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2976 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2977 BUG_ON(level == 6 && 2978 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 2979 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 2980 2981 for (i = disks; i--; ) { 2982 struct r5dev *dev = &sh->dev[i]; 2983 if (i == pd_idx || i == qd_idx) 2984 continue; 2985 2986 if (dev->towrite && 2987 (test_bit(R5_UPTODATE, &dev->flags) || 2988 test_bit(R5_Wantcompute, &dev->flags))) { 2989 set_bit(R5_Wantdrain, &dev->flags); 2990 set_bit(R5_LOCKED, &dev->flags); 2991 clear_bit(R5_UPTODATE, &dev->flags); 2992 s->locked++; 2993 } else if (test_bit(R5_InJournal, &dev->flags)) { 2994 set_bit(R5_LOCKED, &dev->flags); 2995 s->locked++; 2996 } 2997 } 2998 if (!s->locked) 2999 /* False alarm - nothing to do */ 3000 return; 3001 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 3002 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 3003 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 3004 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 3005 } 3006 3007 /* keep the parity disk(s) locked while asynchronous operations 3008 * are in flight 3009 */ 3010 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 3011 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3012 s->locked++; 3013 3014 if (level == 6) { 3015 int qd_idx = sh->qd_idx; 3016 struct r5dev *dev = &sh->dev[qd_idx]; 3017 3018 set_bit(R5_LOCKED, &dev->flags); 3019 clear_bit(R5_UPTODATE, &dev->flags); 3020 s->locked++; 3021 } 3022 3023 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 3024 __func__, (unsigned long long)sh->sector, 3025 s->locked, 
s->ops_request); 3026 } 3027 3028 /* 3029 * Each stripe/dev can have one or more bion attached. 3030 * toread/towrite point to the first in a chain. 3031 * The bi_next chain must be in order. 3032 */ 3033 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 3034 int forwrite, int previous) 3035 { 3036 struct bio **bip; 3037 struct r5conf *conf = sh->raid_conf; 3038 int firstwrite=0; 3039 3040 pr_debug("adding bi b#%llu to stripe s#%llu\n", 3041 (unsigned long long)bi->bi_iter.bi_sector, 3042 (unsigned long long)sh->sector); 3043 3044 /* 3045 * If several bios share a stripe, the bio bi_phys_segments acts as a 3046 * reference count to avoid races. The reference count should already be 3047 * increased before this function is called (for example, in 3048 * raid5_make_request()), so other bios sharing this stripe will not free the 3049 * stripe. If a stripe is owned by just one bio, the stripe lock will 3050 * protect it. 3051 */ 3052 spin_lock_irq(&sh->stripe_lock); 3053 /* Don't allow new IO added to stripes in batch list */ 3054 if (sh->batch_head) 3055 goto overlap; 3056 if (forwrite) { 3057 bip = &sh->dev[dd_idx].towrite; 3058 if (*bip == NULL) 3059 firstwrite = 1; 3060 } else 3061 bip = &sh->dev[dd_idx].toread; 3062 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3063 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3064 goto overlap; 3065 bip = & (*bip)->bi_next; 3066 } 3067 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3068 goto overlap; 3069 3070 if (!forwrite || previous) 3071 clear_bit(STRIPE_BATCH_READY, &sh->state); 3072 3073 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3074 if (*bip) 3075 bi->bi_next = *bip; 3076 *bip = bi; 3077 raid5_inc_bi_active_stripes(bi); 3078 3079 if (forwrite) { 3080 /* check if page is covered */ 3081 sector_t sector = sh->dev[dd_idx].sector; 3082 for (bi=sh->dev[dd_idx].towrite; 3083 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3084 bi && bi->bi_iter.bi_sector <= sector; 3085 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3086 if (bio_end_sector(bi) >= sector) 3087 sector = bio_end_sector(bi); 3088 } 3089 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3090 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3091 sh->overwrite_disks++; 3092 } 3093 3094 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3095 (unsigned long long)(*bip)->bi_iter.bi_sector, 3096 (unsigned long long)sh->sector, dd_idx); 3097 3098 if (conf->mddev->bitmap && firstwrite) { 3099 /* Cannot hold spinlock over bitmap_startwrite, 3100 * but must ensure this isn't added to a batch until 3101 * we have added to the bitmap and set bm_seq. 3102 * So set STRIPE_BITMAP_PENDING to prevent 3103 * batching. 3104 * If multiple add_stripe_bio() calls race here they 3105 * must all set STRIPE_BITMAP_PENDING. So only the first one 3106 * to complete "bitmap_startwrite" gets to set 3107 * STRIPE_BIT_DELAY. This is important as once a stripe 3108 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3109 * any more.
3110 */ 3111 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3112 spin_unlock_irq(&sh->stripe_lock); 3113 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3114 STRIPE_SECTORS, 0); 3115 spin_lock_irq(&sh->stripe_lock); 3116 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3117 if (!sh->batch_head) { 3118 sh->bm_seq = conf->seq_flush+1; 3119 set_bit(STRIPE_BIT_DELAY, &sh->state); 3120 } 3121 } 3122 spin_unlock_irq(&sh->stripe_lock); 3123 3124 if (stripe_can_batch(sh)) 3125 stripe_add_to_batch_list(conf, sh); 3126 return 1; 3127 3128 overlap: 3129 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3130 spin_unlock_irq(&sh->stripe_lock); 3131 return 0; 3132 } 3133 3134 static void end_reshape(struct r5conf *conf); 3135 3136 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3137 struct stripe_head *sh) 3138 { 3139 int sectors_per_chunk = 3140 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3141 int dd_idx; 3142 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3143 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 3144 3145 raid5_compute_sector(conf, 3146 stripe * (disks - conf->max_degraded) 3147 *sectors_per_chunk + chunk_offset, 3148 previous, 3149 &dd_idx, sh); 3150 } 3151 3152 static void 3153 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3154 struct stripe_head_state *s, int disks, 3155 struct bio_list *return_bi) 3156 { 3157 int i; 3158 BUG_ON(sh->batch_head); 3159 for (i = disks; i--; ) { 3160 struct bio *bi; 3161 int bitmap_end = 0; 3162 3163 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3164 struct md_rdev *rdev; 3165 rcu_read_lock(); 3166 rdev = rcu_dereference(conf->disks[i].rdev); 3167 if (rdev && test_bit(In_sync, &rdev->flags) && 3168 !test_bit(Faulty, &rdev->flags)) 3169 atomic_inc(&rdev->nr_pending); 3170 else 3171 rdev = NULL; 3172 rcu_read_unlock(); 3173 if (rdev) { 3174 if (!rdev_set_badblocks( 3175 rdev, 3176 sh->sector, 3177 STRIPE_SECTORS, 0)) 3178 md_error(conf->mddev, rdev); 3179 rdev_dec_pending(rdev, conf->mddev); 3180 } 3181 } 3182 spin_lock_irq(&sh->stripe_lock); 3183 /* fail all writes first */ 3184 bi = sh->dev[i].towrite; 3185 sh->dev[i].towrite = NULL; 3186 sh->overwrite_disks = 0; 3187 spin_unlock_irq(&sh->stripe_lock); 3188 if (bi) 3189 bitmap_end = 1; 3190 3191 r5l_stripe_write_finished(sh); 3192 3193 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3194 wake_up(&conf->wait_for_overlap); 3195 3196 while (bi && bi->bi_iter.bi_sector < 3197 sh->dev[i].sector + STRIPE_SECTORS) { 3198 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3199 3200 bi->bi_error = -EIO; 3201 if (!raid5_dec_bi_active_stripes(bi)) { 3202 md_write_end(conf->mddev); 3203 bio_list_add(return_bi, bi); 3204 } 3205 bi = nextbi; 3206 } 3207 if (bitmap_end) 3208 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3209 STRIPE_SECTORS, 0, 0); 3210 bitmap_end = 0; 3211 /* and fail all 'written' */ 3212 bi = sh->dev[i].written; 3213 sh->dev[i].written = NULL; 3214 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3215 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3216 sh->dev[i].page = sh->dev[i].orig_page; 3217 } 3218 3219 if (bi) bitmap_end = 1; 3220 while (bi && bi->bi_iter.bi_sector < 3221 sh->dev[i].sector + STRIPE_SECTORS) { 3222 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3223 3224 bi->bi_error = -EIO; 3225 if (!raid5_dec_bi_active_stripes(bi)) { 3226 md_write_end(conf->mddev); 3227 bio_list_add(return_bi, bi); 3228 } 3229 bi = bi2; 3230 } 3231 3232 /* fail any reads if this device 
is non-operational and 3233 * the data has not reached the cache yet. 3234 */ 3235 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3236 s->failed > conf->max_degraded && 3237 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3238 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3239 spin_lock_irq(&sh->stripe_lock); 3240 bi = sh->dev[i].toread; 3241 sh->dev[i].toread = NULL; 3242 spin_unlock_irq(&sh->stripe_lock); 3243 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3244 wake_up(&conf->wait_for_overlap); 3245 if (bi) 3246 s->to_read--; 3247 while (bi && bi->bi_iter.bi_sector < 3248 sh->dev[i].sector + STRIPE_SECTORS) { 3249 struct bio *nextbi = 3250 r5_next_bio(bi, sh->dev[i].sector); 3251 3252 bi->bi_error = -EIO; 3253 if (!raid5_dec_bi_active_stripes(bi)) 3254 bio_list_add(return_bi, bi); 3255 bi = nextbi; 3256 } 3257 } 3258 if (bitmap_end) 3259 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3260 STRIPE_SECTORS, 0, 0); 3261 /* If we were in the middle of a write the parity block might 3262 * still be locked - so just clear all R5_LOCKED flags 3263 */ 3264 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3265 } 3266 s->to_write = 0; 3267 s->written = 0; 3268 3269 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3270 if (atomic_dec_and_test(&conf->pending_full_writes)) 3271 md_wakeup_thread(conf->mddev->thread); 3272 } 3273 3274 static void 3275 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3276 struct stripe_head_state *s) 3277 { 3278 int abort = 0; 3279 int i; 3280 3281 BUG_ON(sh->batch_head); 3282 clear_bit(STRIPE_SYNCING, &sh->state); 3283 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3284 wake_up(&conf->wait_for_overlap); 3285 s->syncing = 0; 3286 s->replacing = 0; 3287 /* There is nothing more to do for sync/check/repair. 3288 * Don't even need to abort as that is handled elsewhere 3289 * if needed, and not always wanted e.g. if there is a known 3290 * bad block here. 
* For recover/replace we need to record a bad block on all 3292 * non-sync devices, or abort the recovery 3293 */ 3294 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3295 /* During recovery devices cannot be removed, so 3296 * locking and refcounting of rdevs is not needed 3297 */ 3298 rcu_read_lock(); 3299 for (i = 0; i < conf->raid_disks; i++) { 3300 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3301 if (rdev 3302 && !test_bit(Faulty, &rdev->flags) 3303 && !test_bit(In_sync, &rdev->flags) 3304 && !rdev_set_badblocks(rdev, sh->sector, 3305 STRIPE_SECTORS, 0)) 3306 abort = 1; 3307 rdev = rcu_dereference(conf->disks[i].replacement); 3308 if (rdev 3309 && !test_bit(Faulty, &rdev->flags) 3310 && !test_bit(In_sync, &rdev->flags) 3311 && !rdev_set_badblocks(rdev, sh->sector, 3312 STRIPE_SECTORS, 0)) 3313 abort = 1; 3314 } 3315 rcu_read_unlock(); 3316 if (abort) 3317 conf->recovery_disabled = 3318 conf->mddev->recovery_disabled; 3319 } 3320 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3321 } 3322 3323 static int want_replace(struct stripe_head *sh, int disk_idx) 3324 { 3325 struct md_rdev *rdev; 3326 int rv = 0; 3327 3328 rcu_read_lock(); 3329 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3330 if (rdev 3331 && !test_bit(Faulty, &rdev->flags) 3332 && !test_bit(In_sync, &rdev->flags) 3333 && (rdev->recovery_offset <= sh->sector 3334 || rdev->mddev->recovery_cp <= sh->sector)) 3335 rv = 1; 3336 rcu_read_unlock(); 3337 return rv; 3338 } 3339 3340 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3341 int disk_idx, int disks) 3342 { 3343 struct r5dev *dev = &sh->dev[disk_idx]; 3344 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3345 &sh->dev[s->failed_num[1]] }; 3346 int i; 3347 3348 3349 if (test_bit(R5_LOCKED, &dev->flags) || 3350 test_bit(R5_UPTODATE, &dev->flags)) 3351 /* No point reading this as we already have it or have 3352 * decided to get it. 3353 */ 3354 return 0; 3355 3356 if (dev->toread || 3357 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3358 /* We need this block to directly satisfy a request */ 3359 return 1; 3360 3361 if (s->syncing || s->expanding || 3362 (s->replacing && want_replace(sh, disk_idx))) 3363 /* When syncing, or expanding we read everything. 3364 * When replacing, we need the replaced block. 3365 */ 3366 return 1; 3367 3368 if ((s->failed >= 1 && fdev[0]->toread) || 3369 (s->failed >= 2 && fdev[1]->toread)) 3370 /* If we want to read from a failed device, then 3371 * we need to actually read every other device. 3372 */ 3373 return 1; 3374 3375 /* Sometimes neither read-modify-write nor reconstruct-write 3376 * cycles can work. In those cases we read every block we 3377 * can. Then the parity-update is certain to have enough to 3378 * work with. 3379 * This can only be a problem when we need to write something, 3380 * and some device has failed. If either of those tests 3381 * fail we need look no further. 3382 */ 3383 if (!s->failed || !s->to_write) 3384 return 0; 3385 3386 if (test_bit(R5_Insync, &dev->flags) && 3387 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3388 /* Pre-reads are not permitted until after a short delay 3389 * to gather multiple requests. However if this 3390 * device is not Insync, the block could only be computed 3391 * and there is no need to delay that.
3392 */ 3393 return 0; 3394 3395 for (i = 0; i < s->failed && i < 2; i++) { 3396 if (fdev[i]->towrite && 3397 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3398 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3399 /* If we have a partial write to a failed 3400 * device, then we will need to reconstruct 3401 * the content of that device, so all other 3402 * devices must be read. 3403 */ 3404 return 1; 3405 } 3406 3407 /* If we are forced to do a reconstruct-write, either because 3408 * the current RAID6 implementation only supports that, or 3409 * or because parity cannot be trusted and we are currently 3410 * recovering it, there is extra need to be careful. 3411 * If one of the devices that we would need to read, because 3412 * it is not being overwritten (and maybe not written at all) 3413 * is missing/faulty, then we need to read everything we can. 3414 */ 3415 if (sh->raid_conf->level != 6 && 3416 sh->sector < sh->raid_conf->mddev->recovery_cp) 3417 /* reconstruct-write isn't being forced */ 3418 return 0; 3419 for (i = 0; i < s->failed && i < 2; i++) { 3420 if (s->failed_num[i] != sh->pd_idx && 3421 s->failed_num[i] != sh->qd_idx && 3422 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3423 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3424 return 1; 3425 } 3426 3427 return 0; 3428 } 3429 3430 /* fetch_block - checks the given member device to see if its data needs 3431 * to be read or computed to satisfy a request. 3432 * 3433 * Returns 1 when no more member devices need to be checked, otherwise returns 3434 * 0 to tell the loop in handle_stripe_fill to continue 3435 */ 3436 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3437 int disk_idx, int disks) 3438 { 3439 struct r5dev *dev = &sh->dev[disk_idx]; 3440 3441 /* is the data in this block needed, and can we get it? */ 3442 if (need_this_block(sh, s, disk_idx, disks)) { 3443 /* we would like to get this block, possibly by computing it, 3444 * otherwise read it if the backing disk is insync 3445 */ 3446 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3447 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3448 BUG_ON(sh->batch_head); 3449 if ((s->uptodate == disks - 1) && 3450 (s->failed && (disk_idx == s->failed_num[0] || 3451 disk_idx == s->failed_num[1]))) { 3452 /* have disk failed, and we're requested to fetch it; 3453 * do compute it 3454 */ 3455 pr_debug("Computing stripe %llu block %d\n", 3456 (unsigned long long)sh->sector, disk_idx); 3457 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3458 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3459 set_bit(R5_Wantcompute, &dev->flags); 3460 sh->ops.target = disk_idx; 3461 sh->ops.target2 = -1; /* no 2nd target */ 3462 s->req_compute = 1; 3463 /* Careful: from this point on 'uptodate' is in the eye 3464 * of raid_run_ops which services 'compute' operations 3465 * before writes. R5_Wantcompute flags a block that will 3466 * be R5_UPTODATE by the time it is needed for a 3467 * subsequent operation. 
3468 */ 3469 s->uptodate++; 3470 return 1; 3471 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3472 /* Computing 2-failure is *very* expensive; only 3473 * do it if failed >= 2 3474 */ 3475 int other; 3476 for (other = disks; other--; ) { 3477 if (other == disk_idx) 3478 continue; 3479 if (!test_bit(R5_UPTODATE, 3480 &sh->dev[other].flags)) 3481 break; 3482 } 3483 BUG_ON(other < 0); 3484 pr_debug("Computing stripe %llu blocks %d,%d\n", 3485 (unsigned long long)sh->sector, 3486 disk_idx, other); 3487 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3488 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3489 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3490 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3491 sh->ops.target = disk_idx; 3492 sh->ops.target2 = other; 3493 s->uptodate += 2; 3494 s->req_compute = 1; 3495 return 1; 3496 } else if (test_bit(R5_Insync, &dev->flags)) { 3497 set_bit(R5_LOCKED, &dev->flags); 3498 set_bit(R5_Wantread, &dev->flags); 3499 s->locked++; 3500 pr_debug("Reading block %d (sync=%d)\n", 3501 disk_idx, s->syncing); 3502 } 3503 } 3504 3505 return 0; 3506 } 3507 3508 /** 3509 * handle_stripe_fill - read or compute data to satisfy pending requests. 3510 */ 3511 static void handle_stripe_fill(struct stripe_head *sh, 3512 struct stripe_head_state *s, 3513 int disks) 3514 { 3515 int i; 3516 3517 /* look for blocks to read/compute, skip this if a compute 3518 * is already in flight, or if the stripe contents are in the 3519 * midst of changing due to a write 3520 */ 3521 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3522 !sh->reconstruct_state) { 3523 3524 /* 3525 * For degraded stripe with data in journal, do not handle 3526 * read requests yet, instead, flush the stripe to raid 3527 * disks first, this avoids handling complex rmw of write 3528 * back cache (prexor with orig_page, and then xor with 3529 * page) in the read path 3530 */ 3531 if (s->injournal && s->failed) { 3532 if (test_bit(STRIPE_R5C_CACHING, &sh->state)) 3533 r5c_make_stripe_write_out(sh); 3534 goto out; 3535 } 3536 3537 for (i = disks; i--; ) 3538 if (fetch_block(sh, s, i, disks)) 3539 break; 3540 } 3541 out: 3542 set_bit(STRIPE_HANDLE, &sh->state); 3543 } 3544 3545 static void break_stripe_batch_list(struct stripe_head *head_sh, 3546 unsigned long handle_flags); 3547 /* handle_stripe_clean_event 3548 * any written block on an uptodate or failed drive can be returned. 3549 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3550 * never LOCKED, so we don't need to test 'failed' directly. 
3551 */ 3552 static void handle_stripe_clean_event(struct r5conf *conf, 3553 struct stripe_head *sh, int disks, struct bio_list *return_bi) 3554 { 3555 int i; 3556 struct r5dev *dev; 3557 int discard_pending = 0; 3558 struct stripe_head *head_sh = sh; 3559 bool do_endio = false; 3560 3561 for (i = disks; i--; ) 3562 if (sh->dev[i].written) { 3563 dev = &sh->dev[i]; 3564 if (!test_bit(R5_LOCKED, &dev->flags) && 3565 (test_bit(R5_UPTODATE, &dev->flags) || 3566 test_bit(R5_Discard, &dev->flags) || 3567 test_bit(R5_SkipCopy, &dev->flags))) { 3568 /* We can return any write requests */ 3569 struct bio *wbi, *wbi2; 3570 pr_debug("Return write for disc %d\n", i); 3571 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3572 clear_bit(R5_UPTODATE, &dev->flags); 3573 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3574 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3575 } 3576 do_endio = true; 3577 3578 returnbi: 3579 dev->page = dev->orig_page; 3580 wbi = dev->written; 3581 dev->written = NULL; 3582 while (wbi && wbi->bi_iter.bi_sector < 3583 dev->sector + STRIPE_SECTORS) { 3584 wbi2 = r5_next_bio(wbi, dev->sector); 3585 if (!raid5_dec_bi_active_stripes(wbi)) { 3586 md_write_end(conf->mddev); 3587 bio_list_add(return_bi, wbi); 3588 } 3589 wbi = wbi2; 3590 } 3591 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3592 STRIPE_SECTORS, 3593 !test_bit(STRIPE_DEGRADED, &sh->state), 3594 0); 3595 if (head_sh->batch_head) { 3596 sh = list_first_entry(&sh->batch_list, 3597 struct stripe_head, 3598 batch_list); 3599 if (sh != head_sh) { 3600 dev = &sh->dev[i]; 3601 goto returnbi; 3602 } 3603 } 3604 sh = head_sh; 3605 dev = &sh->dev[i]; 3606 } else if (test_bit(R5_Discard, &dev->flags)) 3607 discard_pending = 1; 3608 } 3609 3610 r5l_stripe_write_finished(sh); 3611 3612 if (!discard_pending && 3613 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3614 int hash; 3615 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3616 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3617 if (sh->qd_idx >= 0) { 3618 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3619 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3620 } 3621 /* now that discard is done we can proceed with any sync */ 3622 clear_bit(STRIPE_DISCARD, &sh->state); 3623 /* 3624 * SCSI discard will change some bio fields and the stripe has 3625 * no updated data, so remove it from hash list and the stripe 3626 * will be reinitialized 3627 */ 3628 unhash: 3629 hash = sh->hash_lock_index; 3630 spin_lock_irq(conf->hash_locks + hash); 3631 remove_hash(sh); 3632 spin_unlock_irq(conf->hash_locks + hash); 3633 if (head_sh->batch_head) { 3634 sh = list_first_entry(&sh->batch_list, 3635 struct stripe_head, batch_list); 3636 if (sh != head_sh) 3637 goto unhash; 3638 } 3639 sh = head_sh; 3640 3641 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3642 set_bit(STRIPE_HANDLE, &sh->state); 3643 3644 } 3645 3646 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3647 if (atomic_dec_and_test(&conf->pending_full_writes)) 3648 md_wakeup_thread(conf->mddev->thread); 3649 3650 if (head_sh->batch_head && do_endio) 3651 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3652 } 3653 3654 /* 3655 * For RMW in write back cache, we need extra page in prexor to store the 3656 * old data. This page is stored in dev->orig_page. 3657 * 3658 * This function checks whether we have data for prexor. 
The exact logic 3659 * is: 3660 * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3661 */ 3662 static inline bool uptodate_for_rmw(struct r5dev *dev) 3663 { 3664 return (test_bit(R5_UPTODATE, &dev->flags)) && 3665 (!test_bit(R5_InJournal, &dev->flags) || 3666 test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3667 } 3668 3669 static int handle_stripe_dirtying(struct r5conf *conf, 3670 struct stripe_head *sh, 3671 struct stripe_head_state *s, 3672 int disks) 3673 { 3674 int rmw = 0, rcw = 0, i; 3675 sector_t recovery_cp = conf->mddev->recovery_cp; 3676 3677 /* Check whether resync is now happening or should start. 3678 * If yes, then the array is dirty (after unclean shutdown or 3679 * initial creation), so parity in some stripes might be inconsistent. 3680 * In this case, we need to always do reconstruct-write, to ensure 3681 * that in case of drive failure or read-error correction, we 3682 * generate correct data from the parity. 3683 */ 3684 if (conf->rmw_level == PARITY_DISABLE_RMW || 3685 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3686 s->failed == 0)) { 3687 /* Calculate the real rcw later - for now make it 3688 * look like rcw is cheaper 3689 */ 3690 rcw = 1; rmw = 2; 3691 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3692 conf->rmw_level, (unsigned long long)recovery_cp, 3693 (unsigned long long)sh->sector); 3694 } else for (i = disks; i--; ) { 3695 /* would I have to read this buffer for read_modify_write */ 3696 struct r5dev *dev = &sh->dev[i]; 3697 if (((dev->towrite && !delay_towrite(dev, s)) || 3698 i == sh->pd_idx || i == sh->qd_idx || 3699 test_bit(R5_InJournal, &dev->flags)) && 3700 !test_bit(R5_LOCKED, &dev->flags) && 3701 !(uptodate_for_rmw(dev) || 3702 test_bit(R5_Wantcompute, &dev->flags))) { 3703 if (test_bit(R5_Insync, &dev->flags)) 3704 rmw++; 3705 else 3706 rmw += 2*disks; /* cannot read it */ 3707 } 3708 /* Would I have to read this buffer for reconstruct_write */ 3709 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3710 i != sh->pd_idx && i != sh->qd_idx && 3711 !test_bit(R5_LOCKED, &dev->flags) && 3712 !(test_bit(R5_UPTODATE, &dev->flags) || 3713 test_bit(R5_Wantcompute, &dev->flags))) { 3714 if (test_bit(R5_Insync, &dev->flags)) 3715 rcw++; 3716 else 3717 rcw += 2*disks; 3718 } 3719 } 3720 3721 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 3722 (unsigned long long)sh->sector, rmw, rcw); 3723 set_bit(STRIPE_HANDLE, &sh->state); 3724 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3725 /* prefer read-modify-write, but need to get some data */ 3726 if (conf->mddev->queue) 3727 blk_add_trace_msg(conf->mddev->queue, 3728 "raid5 rmw %llu %d", 3729 (unsigned long long)sh->sector, rmw); 3730 for (i = disks; i--; ) { 3731 struct r5dev *dev = &sh->dev[i]; 3732 if (test_bit(R5_InJournal, &dev->flags) && 3733 dev->page == dev->orig_page && 3734 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3735 /* alloc page for prexor */ 3736 struct page *p = alloc_page(GFP_NOIO); 3737 3738 if (p) { 3739 dev->orig_page = p; 3740 continue; 3741 } 3742 3743 /* 3744 * alloc_page() failed, try use 3745 * disk_info->extra_page 3746 */ 3747 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3748 &conf->cache_state)) { 3749 r5c_use_extra_page(sh); 3750 break; 3751 } 3752 3753 /* extra_page in use, add to delayed_list */ 3754 set_bit(STRIPE_DELAYED, &sh->state); 3755 s->waiting_extra_page = 1; 3756 return -EAGAIN; 3757 } 3758 } 3759 3760 for (i = disks; i--; ) { 3761 struct r5dev *dev = &sh->dev[i]; 3762 if (((dev->towrite && 
!delay_towrite(dev, s)) || 3763 i == sh->pd_idx || i == sh->qd_idx || 3764 test_bit(R5_InJournal, &dev->flags)) && 3765 !test_bit(R5_LOCKED, &dev->flags) && 3766 !(uptodate_for_rmw(dev) || 3767 test_bit(R5_Wantcompute, &dev->flags)) && 3768 test_bit(R5_Insync, &dev->flags)) { 3769 if (test_bit(STRIPE_PREREAD_ACTIVE, 3770 &sh->state)) { 3771 pr_debug("Read_old block %d for r-m-w\n", 3772 i); 3773 set_bit(R5_LOCKED, &dev->flags); 3774 set_bit(R5_Wantread, &dev->flags); 3775 s->locked++; 3776 } else { 3777 set_bit(STRIPE_DELAYED, &sh->state); 3778 set_bit(STRIPE_HANDLE, &sh->state); 3779 } 3780 } 3781 } 3782 } 3783 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3784 /* want reconstruct write, but need to get some data */ 3785 int qread = 0; 3786 rcw = 0; 3787 for (i = disks; i--; ) { 3788 struct r5dev *dev = &sh->dev[i]; 3789 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3790 i != sh->pd_idx && i != sh->qd_idx && 3791 !test_bit(R5_LOCKED, &dev->flags) && 3792 !(test_bit(R5_UPTODATE, &dev->flags) || 3793 test_bit(R5_Wantcompute, &dev->flags))) { 3794 rcw++; 3795 if (test_bit(R5_Insync, &dev->flags) && 3796 test_bit(STRIPE_PREREAD_ACTIVE, 3797 &sh->state)) { 3798 pr_debug("Read_old block %d for Reconstruct\n", 3799 i); 3800 set_bit(R5_LOCKED, &dev->flags); 3801 set_bit(R5_Wantread, &dev->flags); 3802 s->locked++; 3803 qread++; 3804 } else { 3805 set_bit(STRIPE_DELAYED, &sh->state); 3806 set_bit(STRIPE_HANDLE, &sh->state); 3807 } 3808 } 3809 } 3810 if (rcw && conf->mddev->queue) 3811 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3812 (unsigned long long)sh->sector, 3813 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3814 } 3815 3816 if (rcw > disks && rmw > disks && 3817 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3818 set_bit(STRIPE_DELAYED, &sh->state); 3819 3820 /* now if nothing is locked, and if we have enough data, 3821 * we can start a write request 3822 */ 3823 /* since handle_stripe can be called at any time we need to handle the 3824 * case where a compute block operation has been submitted and then a 3825 * subsequent call wants to start a write request. raid_run_ops only 3826 * handles the case where compute block and reconstruct are requested 3827 * simultaneously. If this is not the case then new writes need to be 3828 * held off until the compute completes.
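 *
 * As an illustrative aside (a sketch, not a helper this driver defines),
 * the rmw/rcw costing computed earlier in this function reduces to: a
 * block that needs reading costs one read when its device is in sync,
 * and a prohibitive 2*disks when it is not, so either strategy loses the
 * comparison as soon as it would have to touch an unreadable device:
 *
 *	static int read_cost(bool needs_read, bool in_sync, int disks)
 *	{
 *		if (!needs_read)
 *			return 0;
 *		return in_sync ? 1 : 2 * disks;
 *	}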
3829 */ 3830 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3831 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3832 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3833 schedule_reconstruction(sh, s, rcw == 0, 0); 3834 return 0; 3835 } 3836 3837 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3838 struct stripe_head_state *s, int disks) 3839 { 3840 struct r5dev *dev = NULL; 3841 3842 BUG_ON(sh->batch_head); 3843 set_bit(STRIPE_HANDLE, &sh->state); 3844 3845 switch (sh->check_state) { 3846 case check_state_idle: 3847 /* start a new check operation if there are no failures */ 3848 if (s->failed == 0) { 3849 BUG_ON(s->uptodate != disks); 3850 sh->check_state = check_state_run; 3851 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3852 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3853 s->uptodate--; 3854 break; 3855 } 3856 dev = &sh->dev[s->failed_num[0]]; 3857 /* fall through */ 3858 case check_state_compute_result: 3859 sh->check_state = check_state_idle; 3860 if (!dev) 3861 dev = &sh->dev[sh->pd_idx]; 3862 3863 /* check that a write has not made the stripe insync */ 3864 if (test_bit(STRIPE_INSYNC, &sh->state)) 3865 break; 3866 3867 /* either failed parity check, or recovery is happening */ 3868 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3869 BUG_ON(s->uptodate != disks); 3870 3871 set_bit(R5_LOCKED, &dev->flags); 3872 s->locked++; 3873 set_bit(R5_Wantwrite, &dev->flags); 3874 3875 clear_bit(STRIPE_DEGRADED, &sh->state); 3876 set_bit(STRIPE_INSYNC, &sh->state); 3877 break; 3878 case check_state_run: 3879 break; /* we will be called again upon completion */ 3880 case check_state_check_result: 3881 sh->check_state = check_state_idle; 3882 3883 /* if a failure occurred during the check operation, leave 3884 * STRIPE_INSYNC not set and let the stripe be handled again 3885 */ 3886 if (s->failed) 3887 break; 3888 3889 /* handle a successful check operation, if parity is correct 3890 * we are done. Otherwise update the mismatch count and repair 3891 * parity if !MD_RECOVERY_CHECK 3892 */ 3893 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 3894 /* parity is correct (on disc, 3895 * not in buffer any more) 3896 */ 3897 set_bit(STRIPE_INSYNC, &sh->state); 3898 else { 3899 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3900 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3901 /* don't try to repair!! */ 3902 set_bit(STRIPE_INSYNC, &sh->state); 3903 else { 3904 sh->check_state = check_state_compute_run; 3905 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3906 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3907 set_bit(R5_Wantcompute, 3908 &sh->dev[sh->pd_idx].flags); 3909 sh->ops.target = sh->pd_idx; 3910 sh->ops.target2 = -1; 3911 s->uptodate++; 3912 } 3913 } 3914 break; 3915 case check_state_compute_run: 3916 break; 3917 default: 3918 pr_err("%s: unknown check_state: %d sector: %llu\n", 3919 __func__, sh->check_state, 3920 (unsigned long long) sh->sector); 3921 BUG(); 3922 } 3923 } 3924 3925 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3926 struct stripe_head_state *s, 3927 int disks) 3928 { 3929 int pd_idx = sh->pd_idx; 3930 int qd_idx = sh->qd_idx; 3931 struct r5dev *dev; 3932 3933 BUG_ON(sh->batch_head); 3934 set_bit(STRIPE_HANDLE, &sh->state); 3935 3936 BUG_ON(s->failed > 2); 3937 3938 /* Want to check and possibly repair P and Q. 
3939 * However there could be one 'failed' device, in which 3940 * case we can only check one of them, possibly using the 3941 * other to generate missing data 3942 */ 3943 3944 switch (sh->check_state) { 3945 case check_state_idle: 3946 /* start a new check operation if there are < 2 failures */ 3947 if (s->failed == s->q_failed) { 3948 /* The only possible failed device holds Q, so it 3949 * makes sense to check P (If anything else were failed, 3950 * we would have used P to recreate it). 3951 */ 3952 sh->check_state = check_state_run; 3953 } 3954 if (!s->q_failed && s->failed < 2) { 3955 /* Q is not failed, and we didn't use it to generate 3956 * anything, so it makes sense to check it 3957 */ 3958 if (sh->check_state == check_state_run) 3959 sh->check_state = check_state_run_pq; 3960 else 3961 sh->check_state = check_state_run_q; 3962 } 3963 3964 /* discard potentially stale zero_sum_result */ 3965 sh->ops.zero_sum_result = 0; 3966 3967 if (sh->check_state == check_state_run) { 3968 /* async_xor_zero_sum destroys the contents of P */ 3969 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3970 s->uptodate--; 3971 } 3972 if (sh->check_state >= check_state_run && 3973 sh->check_state <= check_state_run_pq) { 3974 /* async_syndrome_zero_sum preserves P and Q, so 3975 * no need to mark them !uptodate here 3976 */ 3977 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3978 break; 3979 } 3980 3981 /* we have 2-disk failure */ 3982 BUG_ON(s->failed != 2); 3983 /* fall through */ 3984 case check_state_compute_result: 3985 sh->check_state = check_state_idle; 3986 3987 /* check that a write has not made the stripe insync */ 3988 if (test_bit(STRIPE_INSYNC, &sh->state)) 3989 break; 3990 3991 /* now write out any block on a failed drive, 3992 * or P or Q if they were recomputed 3993 */ 3994 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3995 if (s->failed == 2) { 3996 dev = &sh->dev[s->failed_num[1]]; 3997 s->locked++; 3998 set_bit(R5_LOCKED, &dev->flags); 3999 set_bit(R5_Wantwrite, &dev->flags); 4000 } 4001 if (s->failed >= 1) { 4002 dev = &sh->dev[s->failed_num[0]]; 4003 s->locked++; 4004 set_bit(R5_LOCKED, &dev->flags); 4005 set_bit(R5_Wantwrite, &dev->flags); 4006 } 4007 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4008 dev = &sh->dev[pd_idx]; 4009 s->locked++; 4010 set_bit(R5_LOCKED, &dev->flags); 4011 set_bit(R5_Wantwrite, &dev->flags); 4012 } 4013 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4014 dev = &sh->dev[qd_idx]; 4015 s->locked++; 4016 set_bit(R5_LOCKED, &dev->flags); 4017 set_bit(R5_Wantwrite, &dev->flags); 4018 } 4019 clear_bit(STRIPE_DEGRADED, &sh->state); 4020 4021 set_bit(STRIPE_INSYNC, &sh->state); 4022 break; 4023 case check_state_run: 4024 case check_state_run_q: 4025 case check_state_run_pq: 4026 break; /* we will be called again upon completion */ 4027 case check_state_check_result: 4028 sh->check_state = check_state_idle; 4029 4030 /* handle a successful check operation, if parity is correct 4031 * we are done. 
Otherwise update the mismatch count and repair 4032 * parity if !MD_RECOVERY_CHECK 4033 */ 4034 if (sh->ops.zero_sum_result == 0) { 4035 /* both parities are correct */ 4036 if (!s->failed) 4037 set_bit(STRIPE_INSYNC, &sh->state); 4038 else { 4039 /* in contrast to the raid5 case we can validate 4040 * parity, but still have a failure to write 4041 * back 4042 */ 4043 sh->check_state = check_state_compute_result; 4044 /* Returning at this point means that we may go 4045 * off and bring p and/or q uptodate again so 4046 * we make sure to check zero_sum_result again 4047 * to verify if p or q need writeback 4048 */ 4049 } 4050 } else { 4051 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 4052 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 4053 /* don't try to repair!! */ 4054 set_bit(STRIPE_INSYNC, &sh->state); 4055 else { 4056 int *target = &sh->ops.target; 4057 4058 sh->ops.target = -1; 4059 sh->ops.target2 = -1; 4060 sh->check_state = check_state_compute_run; 4061 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 4062 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 4063 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 4064 set_bit(R5_Wantcompute, 4065 &sh->dev[pd_idx].flags); 4066 *target = pd_idx; 4067 target = &sh->ops.target2; 4068 s->uptodate++; 4069 } 4070 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4071 set_bit(R5_Wantcompute, 4072 &sh->dev[qd_idx].flags); 4073 *target = qd_idx; 4074 s->uptodate++; 4075 } 4076 } 4077 } 4078 break; 4079 case check_state_compute_run: 4080 break; 4081 default: 4082 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4083 __func__, sh->check_state, 4084 (unsigned long long) sh->sector); 4085 BUG(); 4086 } 4087 } 4088 4089 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4090 { 4091 int i; 4092 4093 /* We have read all the blocks in this stripe and now we need to 4094 * copy some of them into a target stripe for expand. 4095 */ 4096 struct dma_async_tx_descriptor *tx = NULL; 4097 BUG_ON(sh->batch_head); 4098 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4099 for (i = 0; i < sh->disks; i++) 4100 if (i != sh->pd_idx && i != sh->qd_idx) { 4101 int dd_idx, j; 4102 struct stripe_head *sh2; 4103 struct async_submit_ctl submit; 4104 4105 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4106 sector_t s = raid5_compute_sector(conf, bn, 0, 4107 &dd_idx, NULL); 4108 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4109 if (sh2 == NULL) 4110 /* so far only the early blocks of this stripe 4111 * have been requested. 
When later blocks 4112 * get requested, we will try again 4113 */ 4114 continue; 4115 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4116 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4117 /* must have already done this block */ 4118 raid5_release_stripe(sh2); 4119 continue; 4120 } 4121 4122 /* place all the copies on one channel */ 4123 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4124 tx = async_memcpy(sh2->dev[dd_idx].page, 4125 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4126 &submit); 4127 4128 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4129 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4130 for (j = 0; j < conf->raid_disks; j++) 4131 if (j != sh2->pd_idx && 4132 j != sh2->qd_idx && 4133 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4134 break; 4135 if (j == conf->raid_disks) { 4136 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4137 set_bit(STRIPE_HANDLE, &sh2->state); 4138 } 4139 raid5_release_stripe(sh2); 4140 4141 } 4142 /* done submitting copies, wait for them to complete */ 4143 async_tx_quiesce(&tx); 4144 } 4145 4146 /* 4147 * handle_stripe - do things to a stripe. 4148 * 4149 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4150 * state of various bits to see what needs to be done. 4151 * Possible results: 4152 * return some read requests which now have data 4153 * return some write requests which are safely on storage 4154 * schedule a read on some buffers 4155 * schedule a write of some buffers 4156 * return confirmation of parity correctness 4157 * 4158 */ 4159 4160 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4161 { 4162 struct r5conf *conf = sh->raid_conf; 4163 int disks = sh->disks; 4164 struct r5dev *dev; 4165 int i; 4166 int do_recovery = 0; 4167 4168 memset(s, 0, sizeof(*s)); 4169 4170 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4171 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4172 s->failed_num[0] = -1; 4173 s->failed_num[1] = -1; 4174 s->log_failed = r5l_log_disk_error(conf); 4175 4176 /* Now to look around and see what can be done */ 4177 rcu_read_lock(); 4178 for (i=disks; i--; ) { 4179 struct md_rdev *rdev; 4180 sector_t first_bad; 4181 int bad_sectors; 4182 int is_bad = 0; 4183 4184 dev = &sh->dev[i]; 4185 4186 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4187 i, dev->flags, 4188 dev->toread, dev->towrite, dev->written); 4189 /* maybe we can reply to a read 4190 * 4191 * new wantfill requests are only permitted while 4192 * ops_complete_biofill is guaranteed to be inactive 4193 */ 4194 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4195 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4196 set_bit(R5_Wantfill, &dev->flags); 4197 4198 /* now count some things */ 4199 if (test_bit(R5_LOCKED, &dev->flags)) 4200 s->locked++; 4201 if (test_bit(R5_UPTODATE, &dev->flags)) 4202 s->uptodate++; 4203 if (test_bit(R5_Wantcompute, &dev->flags)) { 4204 s->compute++; 4205 BUG_ON(s->compute > 2); 4206 } 4207 4208 if (test_bit(R5_Wantfill, &dev->flags)) 4209 s->to_fill++; 4210 else if (dev->toread) 4211 s->to_read++; 4212 if (dev->towrite) { 4213 s->to_write++; 4214 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4215 s->non_overwrite++; 4216 } 4217 if (dev->written) 4218 s->written++; 4219 /* Prefer to use the replacement for reads, but only 4220 * if it is recovered enough and has no bad blocks. 
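 *
 * (Worked example, illustrative only: with STRIPE_SECTORS == 8, i.e.
 * 4 KiB stripe pages, a stripe at sector 1024 may be read from the
 * replacement only when rdev->recovery_offset >= 1024 + 8, meaning
 * recovery has passed the whole stripe, and is_badblock() reports no
 * bad range inside it.)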
4221 */ 4222 rdev = rcu_dereference(conf->disks[i].replacement); 4223 if (rdev && !test_bit(Faulty, &rdev->flags) && 4224 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4225 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4226 &first_bad, &bad_sectors)) 4227 set_bit(R5_ReadRepl, &dev->flags); 4228 else { 4229 if (rdev && !test_bit(Faulty, &rdev->flags)) 4230 set_bit(R5_NeedReplace, &dev->flags); 4231 else 4232 clear_bit(R5_NeedReplace, &dev->flags); 4233 rdev = rcu_dereference(conf->disks[i].rdev); 4234 clear_bit(R5_ReadRepl, &dev->flags); 4235 } 4236 if (rdev && test_bit(Faulty, &rdev->flags)) 4237 rdev = NULL; 4238 if (rdev) { 4239 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4240 &first_bad, &bad_sectors); 4241 if (s->blocked_rdev == NULL 4242 && (test_bit(Blocked, &rdev->flags) 4243 || is_bad < 0)) { 4244 if (is_bad < 0) 4245 set_bit(BlockedBadBlocks, 4246 &rdev->flags); 4247 s->blocked_rdev = rdev; 4248 atomic_inc(&rdev->nr_pending); 4249 } 4250 } 4251 clear_bit(R5_Insync, &dev->flags); 4252 if (!rdev) 4253 /* Not in-sync */; 4254 else if (is_bad) { 4255 /* also not in-sync */ 4256 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4257 test_bit(R5_UPTODATE, &dev->flags)) { 4258 /* treat as in-sync, but with a read error 4259 * which we can now try to correct 4260 */ 4261 set_bit(R5_Insync, &dev->flags); 4262 set_bit(R5_ReadError, &dev->flags); 4263 } 4264 } else if (test_bit(In_sync, &rdev->flags)) 4265 set_bit(R5_Insync, &dev->flags); 4266 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4267 /* in sync if before recovery_offset */ 4268 set_bit(R5_Insync, &dev->flags); 4269 else if (test_bit(R5_UPTODATE, &dev->flags) && 4270 test_bit(R5_Expanded, &dev->flags)) 4271 /* If we've reshaped into here, we assume it is Insync. 4272 * We will shortly update recovery_offset to make 4273 * it official. 
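 *
 * Condensed as a sketch (not the driver's own code; the branch ordering
 * above matters, so the boolean form below is only approximate, and
 * 'uptodate', 'expanded', 'is_bad' and 'write_error_seen' are shorthand
 * for the corresponding flag tests):
 *
 *	in_sync = rdev &&
 *		(test_bit(In_sync, &rdev->flags) ||
 *		 sh->sector + STRIPE_SECTORS <= rdev->recovery_offset ||
 *		 (uptodate && expanded) ||
 *		 (is_bad && uptodate && !write_error_seen));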
4274 */ 4275 set_bit(R5_Insync, &dev->flags); 4276 4277 if (test_bit(R5_WriteError, &dev->flags)) { 4278 /* This flag does not apply to '.replacement', 4279 * only to '.rdev', so make sure to check that */ 4280 struct md_rdev *rdev2 = rcu_dereference( 4281 conf->disks[i].rdev); 4282 if (rdev2 == rdev) 4283 clear_bit(R5_Insync, &dev->flags); 4284 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4285 s->handle_bad_blocks = 1; 4286 atomic_inc(&rdev2->nr_pending); 4287 } else 4288 clear_bit(R5_WriteError, &dev->flags); 4289 } 4290 if (test_bit(R5_MadeGood, &dev->flags)) { 4291 /* This flag does not apply to '.replacement', 4292 * only to '.rdev', so make sure to check that */ 4293 struct md_rdev *rdev2 = rcu_dereference( 4294 conf->disks[i].rdev); 4295 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4296 s->handle_bad_blocks = 1; 4297 atomic_inc(&rdev2->nr_pending); 4298 } else 4299 clear_bit(R5_MadeGood, &dev->flags); 4300 } 4301 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4302 struct md_rdev *rdev2 = rcu_dereference( 4303 conf->disks[i].replacement); 4304 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4305 s->handle_bad_blocks = 1; 4306 atomic_inc(&rdev2->nr_pending); 4307 } else 4308 clear_bit(R5_MadeGoodRepl, &dev->flags); 4309 } 4310 if (!test_bit(R5_Insync, &dev->flags)) { 4311 /* The ReadError flag will just be confusing now */ 4312 clear_bit(R5_ReadError, &dev->flags); 4313 clear_bit(R5_ReWrite, &dev->flags); 4314 } 4315 if (test_bit(R5_ReadError, &dev->flags)) 4316 clear_bit(R5_Insync, &dev->flags); 4317 if (!test_bit(R5_Insync, &dev->flags)) { 4318 if (s->failed < 2) 4319 s->failed_num[s->failed] = i; 4320 s->failed++; 4321 if (rdev && !test_bit(Faulty, &rdev->flags)) 4322 do_recovery = 1; 4323 } 4324 4325 if (test_bit(R5_InJournal, &dev->flags)) 4326 s->injournal++; 4327 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4328 s->just_cached++; 4329 } 4330 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4331 /* If there is a failed device being replaced, 4332 * we must be recovering. 4333 * else if we are after recovery_cp, we must be syncing 4334 * else if MD_RECOVERY_REQUESTED is set, we are also syncing. 4335 * else we can only be replacing 4336 * sync and recovery both need to read all devices, and so 4337 * use the same flag. 4338 */ 4339 if (do_recovery || 4340 sh->sector >= conf->mddev->recovery_cp || 4341 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4342 s->syncing = 1; 4343 else 4344 s->replacing = 1; 4345 } 4346 rcu_read_unlock(); 4347 } 4348 4349 static int clear_batch_ready(struct stripe_head *sh) 4350 { 4351 /* Return '1' if this is a member of a batch, or 4352 * '0' if it is a lone stripe or a head which can now be 4353 * handled. 4354 */ 4355 struct stripe_head *tmp; 4356 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4357 return (sh->batch_head && sh->batch_head != sh); 4358 spin_lock(&sh->stripe_lock); 4359 if (!sh->batch_head) { 4360 spin_unlock(&sh->stripe_lock); 4361 return 0; 4362 } 4363 4364 /* 4365 * this stripe could have been added to a batch list before we 4366 * checked BATCH_READY; if so, skip it 4367 */ 4368 if (sh->batch_head != sh) { 4369 spin_unlock(&sh->stripe_lock); 4370 return 1; 4371 } 4372 spin_lock(&sh->batch_lock); 4373 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4374 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4375 spin_unlock(&sh->batch_lock); 4376 spin_unlock(&sh->stripe_lock); 4377 4378 /* 4379 * BATCH_READY is cleared, no new stripes can be added.
4380 * batch_list can be accessed without lock 4381 */ 4382 return 0; 4383 } 4384 4385 static void break_stripe_batch_list(struct stripe_head *head_sh, 4386 unsigned long handle_flags) 4387 { 4388 struct stripe_head *sh, *next; 4389 int i; 4390 int do_wakeup = 0; 4391 4392 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4393 4394 list_del_init(&sh->batch_list); 4395 4396 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4397 (1 << STRIPE_SYNCING) | 4398 (1 << STRIPE_REPLACED) | 4399 (1 << STRIPE_DELAYED) | 4400 (1 << STRIPE_BIT_DELAY) | 4401 (1 << STRIPE_FULL_WRITE) | 4402 (1 << STRIPE_BIOFILL_RUN) | 4403 (1 << STRIPE_COMPUTE_RUN) | 4404 (1 << STRIPE_OPS_REQ_PENDING) | 4405 (1 << STRIPE_DISCARD) | 4406 (1 << STRIPE_BATCH_READY) | 4407 (1 << STRIPE_BATCH_ERR) | 4408 (1 << STRIPE_BITMAP_PENDING)), 4409 "stripe state: %lx\n", sh->state); 4410 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4411 (1 << STRIPE_REPLACED)), 4412 "head stripe state: %lx\n", head_sh->state); 4413 4414 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4415 (1 << STRIPE_PREREAD_ACTIVE) | 4416 (1 << STRIPE_DEGRADED)), 4417 head_sh->state & (1 << STRIPE_INSYNC)); 4418 4419 sh->check_state = head_sh->check_state; 4420 sh->reconstruct_state = head_sh->reconstruct_state; 4421 for (i = 0; i < sh->disks; i++) { 4422 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4423 do_wakeup = 1; 4424 sh->dev[i].flags = head_sh->dev[i].flags & 4425 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4426 } 4427 spin_lock_irq(&sh->stripe_lock); 4428 sh->batch_head = NULL; 4429 spin_unlock_irq(&sh->stripe_lock); 4430 if (handle_flags == 0 || 4431 sh->state & handle_flags) 4432 set_bit(STRIPE_HANDLE, &sh->state); 4433 raid5_release_stripe(sh); 4434 } 4435 spin_lock_irq(&head_sh->stripe_lock); 4436 head_sh->batch_head = NULL; 4437 spin_unlock_irq(&head_sh->stripe_lock); 4438 for (i = 0; i < head_sh->disks; i++) 4439 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4440 do_wakeup = 1; 4441 if (head_sh->state & handle_flags) 4442 set_bit(STRIPE_HANDLE, &head_sh->state); 4443 4444 if (do_wakeup) 4445 wake_up(&head_sh->raid_conf->wait_for_overlap); 4446 } 4447 4448 static void handle_stripe(struct stripe_head *sh) 4449 { 4450 struct stripe_head_state s; 4451 struct r5conf *conf = sh->raid_conf; 4452 int i; 4453 int prexor; 4454 int disks = sh->disks; 4455 struct r5dev *pdev, *qdev; 4456 4457 clear_bit(STRIPE_HANDLE, &sh->state); 4458 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4459 /* already being handled, ensure it gets handled 4460 * again when current action finishes */ 4461 set_bit(STRIPE_HANDLE, &sh->state); 4462 return; 4463 } 4464 4465 if (clear_batch_ready(sh)) { 4466 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4467 return; 4468 } 4469 4470 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4471 break_stripe_batch_list(sh, 0); 4472 4473 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4474 spin_lock(&sh->stripe_lock); 4475 /* Cannot process 'sync' concurrently with 'discard' */ 4476 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4477 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4478 set_bit(STRIPE_SYNCING, &sh->state); 4479 clear_bit(STRIPE_INSYNC, &sh->state); 4480 clear_bit(STRIPE_REPLACED, &sh->state); 4481 } 4482 spin_unlock(&sh->stripe_lock); 4483 } 4484 clear_bit(STRIPE_DELAYED, &sh->state); 4485 4486 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4487 "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n", 4488 (unsigned long
long)sh->sector, sh->state, 4489 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4490 sh->check_state, sh->reconstruct_state); 4491 4492 analyse_stripe(sh, &s); 4493 4494 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4495 goto finish; 4496 4497 if (s.handle_bad_blocks) { 4498 set_bit(STRIPE_HANDLE, &sh->state); 4499 goto finish; 4500 } 4501 4502 if (unlikely(s.blocked_rdev)) { 4503 if (s.syncing || s.expanding || s.expanded || 4504 s.replacing || s.to_write || s.written) { 4505 set_bit(STRIPE_HANDLE, &sh->state); 4506 goto finish; 4507 } 4508 /* There is nothing for the blocked_rdev to block */ 4509 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4510 s.blocked_rdev = NULL; 4511 } 4512 4513 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4514 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4515 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4516 } 4517 4518 pr_debug("locked=%d uptodate=%d to_read=%d" 4519 " to_write=%d failed=%d failed_num=%d,%d\n", 4520 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4521 s.failed_num[0], s.failed_num[1]); 4522 /* check if the array has lost more than max_degraded devices and, 4523 * if so, some requests might need to be failed. 4524 */ 4525 if (s.failed > conf->max_degraded || s.log_failed) { 4526 sh->check_state = 0; 4527 sh->reconstruct_state = 0; 4528 break_stripe_batch_list(sh, 0); 4529 if (s.to_read+s.to_write+s.written) 4530 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 4531 if (s.syncing + s.replacing) 4532 handle_failed_sync(conf, sh, &s); 4533 } 4534 4535 /* Now we check to see if any write operations have recently 4536 * completed 4537 */ 4538 prexor = 0; 4539 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4540 prexor = 1; 4541 if (sh->reconstruct_state == reconstruct_state_drain_result || 4542 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4543 sh->reconstruct_state = reconstruct_state_idle; 4544 4545 /* All the 'written' buffers and the parity block are ready to 4546 * be written back to disk 4547 */ 4548 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4549 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4550 BUG_ON(sh->qd_idx >= 0 && 4551 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4552 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4553 for (i = disks; i--; ) { 4554 struct r5dev *dev = &sh->dev[i]; 4555 if (test_bit(R5_LOCKED, &dev->flags) && 4556 (i == sh->pd_idx || i == sh->qd_idx || 4557 dev->written || test_bit(R5_InJournal, 4558 &dev->flags))) { 4559 pr_debug("Writing block %d\n", i); 4560 set_bit(R5_Wantwrite, &dev->flags); 4561 if (prexor) 4562 continue; 4563 if (s.failed > 1) 4564 continue; 4565 if (!test_bit(R5_Insync, &dev->flags) || 4566 ((i == sh->pd_idx || i == sh->qd_idx) && 4567 s.failed == 0)) 4568 set_bit(STRIPE_INSYNC, &sh->state); 4569 } 4570 } 4571 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4572 s.dec_preread_active = 1; 4573 } 4574 4575 /* 4576 * might be able to return some write requests if the parity blocks 4577 * are safe, or on a failed drive 4578 */ 4579 pdev = &sh->dev[sh->pd_idx]; 4580 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4581 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4582 qdev = &sh->dev[sh->qd_idx]; 4583 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4584 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4585 || conf->level < 6; 4586 4587 if (s.written && 4588 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4589 && !test_bit(R5_LOCKED, 
&pdev->flags) 4590 && (test_bit(R5_UPTODATE, &pdev->flags) || 4591 test_bit(R5_Discard, &pdev->flags))))) && 4592 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4593 && !test_bit(R5_LOCKED, &qdev->flags) 4594 && (test_bit(R5_UPTODATE, &qdev->flags) || 4595 test_bit(R5_Discard, &qdev->flags)))))) 4596 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 4597 4598 if (s.just_cached) 4599 r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); 4600 r5l_stripe_write_finished(sh); 4601 4602 /* Now we might consider reading some blocks, either to check/generate 4603 * parity, or to satisfy requests 4604 * or to load a block that is being partially written. 4605 */ 4606 if (s.to_read || s.non_overwrite 4607 || (conf->level == 6 && s.to_write && s.failed) 4608 || (s.syncing && (s.uptodate + s.compute < disks)) 4609 || s.replacing 4610 || s.expanding) 4611 handle_stripe_fill(sh, &s, disks); 4612 4613 /* 4614 * When the stripe finishes full journal write cycle (write to journal 4615 * and raid disk), this is the clean up procedure so it is ready for 4616 * next operation. 4617 */ 4618 r5c_finish_stripe_write_out(conf, sh, &s); 4619 4620 /* 4621 * Now to consider new write requests, cache write back and what else, 4622 * if anything should be read. We do not handle new writes when: 4623 * 1/ A 'write' operation (copy+xor) is already in flight. 4624 * 2/ A 'check' operation is in flight, as it may clobber the parity 4625 * block. 4626 * 3/ A r5c cache log write is in flight. 4627 */ 4628 4629 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4630 if (!r5c_is_writeback(conf->log)) { 4631 if (s.to_write) 4632 handle_stripe_dirtying(conf, sh, &s, disks); 4633 } else { /* write back cache */ 4634 int ret = 0; 4635 4636 /* First, try handle writes in caching phase */ 4637 if (s.to_write) 4638 ret = r5c_try_caching_write(conf, sh, &s, 4639 disks); 4640 /* 4641 * If caching phase failed: ret == -EAGAIN 4642 * OR 4643 * stripe under reclaim: !caching && injournal 4644 * 4645 * fall back to handle_stripe_dirtying() 4646 */ 4647 if (ret == -EAGAIN || 4648 /* stripe under reclaim: !caching && injournal */ 4649 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4650 s.injournal > 0)) { 4651 ret = handle_stripe_dirtying(conf, sh, &s, 4652 disks); 4653 if (ret == -EAGAIN) 4654 goto finish; 4655 } 4656 } 4657 } 4658 4659 /* maybe we need to check and possibly fix the parity for this stripe 4660 * Any reads will already have been scheduled, so we just see if enough 4661 * data is available. The parity check is held off while parity 4662 * dependent operations are in flight. 
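 *
 * Sketched as a boolean (illustrative only, not extra driver logic): a
 * check is (re)entered either because one is already in progress, or
 * because a sync pass has quiesced the stripe without proving it
 * consistent:
 *
 *	bool run_check = sh->check_state ||
 *		(s.syncing && s.locked == 0 &&
 *		 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
 *		 !test_bit(STRIPE_INSYNC, &sh->state));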
4663 */ 4664 if (sh->check_state || 4665 (s.syncing && s.locked == 0 && 4666 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4667 !test_bit(STRIPE_INSYNC, &sh->state))) { 4668 if (conf->level == 6) 4669 handle_parity_checks6(conf, sh, &s, disks); 4670 else 4671 handle_parity_checks5(conf, sh, &s, disks); 4672 } 4673 4674 if ((s.replacing || s.syncing) && s.locked == 0 4675 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4676 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4677 /* Write out to replacement devices where possible */ 4678 for (i = 0; i < conf->raid_disks; i++) 4679 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4680 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4681 set_bit(R5_WantReplace, &sh->dev[i].flags); 4682 set_bit(R5_LOCKED, &sh->dev[i].flags); 4683 s.locked++; 4684 } 4685 if (s.replacing) 4686 set_bit(STRIPE_INSYNC, &sh->state); 4687 set_bit(STRIPE_REPLACED, &sh->state); 4688 } 4689 if ((s.syncing || s.replacing) && s.locked == 0 && 4690 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4691 test_bit(STRIPE_INSYNC, &sh->state)) { 4692 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4693 clear_bit(STRIPE_SYNCING, &sh->state); 4694 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4695 wake_up(&conf->wait_for_overlap); 4696 } 4697 4698 /* If the failed drives are just a ReadError, then we might need 4699 * to progress the repair/check process 4700 */ 4701 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4702 for (i = 0; i < s.failed; i++) { 4703 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4704 if (test_bit(R5_ReadError, &dev->flags) 4705 && !test_bit(R5_LOCKED, &dev->flags) 4706 && test_bit(R5_UPTODATE, &dev->flags) 4707 ) { 4708 if (!test_bit(R5_ReWrite, &dev->flags)) { 4709 set_bit(R5_Wantwrite, &dev->flags); 4710 set_bit(R5_ReWrite, &dev->flags); 4711 set_bit(R5_LOCKED, &dev->flags); 4712 s.locked++; 4713 } else { 4714 /* let's read it back */ 4715 set_bit(R5_Wantread, &dev->flags); 4716 set_bit(R5_LOCKED, &dev->flags); 4717 s.locked++; 4718 } 4719 } 4720 } 4721 4722 /* Finish reconstruct operations initiated by the expansion process */ 4723 if (sh->reconstruct_state == reconstruct_state_result) { 4724 struct stripe_head *sh_src 4725 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4726 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4727 /* sh cannot be written until sh_src has been read. 
4728 * so arrange for sh to be delayed a little 4729 */ 4730 set_bit(STRIPE_DELAYED, &sh->state); 4731 set_bit(STRIPE_HANDLE, &sh->state); 4732 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4733 &sh_src->state)) 4734 atomic_inc(&conf->preread_active_stripes); 4735 raid5_release_stripe(sh_src); 4736 goto finish; 4737 } 4738 if (sh_src) 4739 raid5_release_stripe(sh_src); 4740 4741 sh->reconstruct_state = reconstruct_state_idle; 4742 clear_bit(STRIPE_EXPANDING, &sh->state); 4743 for (i = conf->raid_disks; i--; ) { 4744 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4745 set_bit(R5_LOCKED, &sh->dev[i].flags); 4746 s.locked++; 4747 } 4748 } 4749 4750 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4751 !sh->reconstruct_state) { 4752 /* Need to write out all blocks after computing parity */ 4753 sh->disks = conf->raid_disks; 4754 stripe_set_idx(sh->sector, conf, 0, sh); 4755 schedule_reconstruction(sh, &s, 1, 1); 4756 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4757 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4758 atomic_dec(&conf->reshape_stripes); 4759 wake_up(&conf->wait_for_overlap); 4760 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4761 } 4762 4763 if (s.expanding && s.locked == 0 && 4764 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4765 handle_stripe_expansion(conf, sh); 4766 4767 finish: 4768 /* wait for this device to become unblocked */ 4769 if (unlikely(s.blocked_rdev)) { 4770 if (conf->mddev->external) 4771 md_wait_for_blocked_rdev(s.blocked_rdev, 4772 conf->mddev); 4773 else 4774 /* Internal metadata will immediately 4775 * be written by raid5d, so we don't 4776 * need to wait here. 4777 */ 4778 rdev_dec_pending(s.blocked_rdev, 4779 conf->mddev); 4780 } 4781 4782 if (s.handle_bad_blocks) 4783 for (i = disks; i--; ) { 4784 struct md_rdev *rdev; 4785 struct r5dev *dev = &sh->dev[i]; 4786 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4787 /* We own a safe reference to the rdev */ 4788 rdev = conf->disks[i].rdev; 4789 if (!rdev_set_badblocks(rdev, sh->sector, 4790 STRIPE_SECTORS, 0)) 4791 md_error(conf->mddev, rdev); 4792 rdev_dec_pending(rdev, conf->mddev); 4793 } 4794 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4795 rdev = conf->disks[i].rdev; 4796 rdev_clear_badblocks(rdev, sh->sector, 4797 STRIPE_SECTORS, 0); 4798 rdev_dec_pending(rdev, conf->mddev); 4799 } 4800 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 4801 rdev = conf->disks[i].replacement; 4802 if (!rdev) 4803 /* rdev has been moved down */ 4804 rdev = conf->disks[i].rdev; 4805 rdev_clear_badblocks(rdev, sh->sector, 4806 STRIPE_SECTORS, 0); 4807 rdev_dec_pending(rdev, conf->mddev); 4808 } 4809 } 4810 4811 if (s.ops_request) 4812 raid_run_ops(sh, s.ops_request); 4813 4814 ops_run_io(sh, &s); 4815 4816 if (s.dec_preread_active) { 4817 /* We delay this until after ops_run_io so that if make_request 4818 * is waiting on a flush, it won't continue until the writes 4819 * have actually been submitted.
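 *
 * Worked example (illustrative numbers): with IO_THRESHOLD == 1, a drop
 * of preread_active_stripes from 1 to 0 makes the test below true and
 * wakes raid5d, which can then promote stripes off the delayed_list:
 *
 *	atomic_dec(&count);				(1 -> 0)
 *	if (atomic_read(&count) < IO_THRESHOLD)		(0 < 1: wake)
 *		md_wakeup_thread(conf->mddev->thread);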
4820 */ 4821 atomic_dec(&conf->preread_active_stripes); 4822 if (atomic_read(&conf->preread_active_stripes) < 4823 IO_THRESHOLD) 4824 md_wakeup_thread(conf->mddev->thread); 4825 } 4826 4827 if (!bio_list_empty(&s.return_bi)) { 4828 if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4829 spin_lock_irq(&conf->device_lock); 4830 bio_list_merge(&conf->return_bi, &s.return_bi); 4831 spin_unlock_irq(&conf->device_lock); 4832 md_wakeup_thread(conf->mddev->thread); 4833 } else 4834 return_io(&s.return_bi); 4835 } 4836 4837 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4838 } 4839 4840 static void raid5_activate_delayed(struct r5conf *conf) 4841 { 4842 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 4843 while (!list_empty(&conf->delayed_list)) { 4844 struct list_head *l = conf->delayed_list.next; 4845 struct stripe_head *sh; 4846 sh = list_entry(l, struct stripe_head, lru); 4847 list_del_init(l); 4848 clear_bit(STRIPE_DELAYED, &sh->state); 4849 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4850 atomic_inc(&conf->preread_active_stripes); 4851 list_add_tail(&sh->lru, &conf->hold_list); 4852 raid5_wakeup_stripe_thread(sh); 4853 } 4854 } 4855 } 4856 4857 static void activate_bit_delay(struct r5conf *conf, 4858 struct list_head *temp_inactive_list) 4859 { 4860 /* device_lock is held */ 4861 struct list_head head; 4862 list_add(&head, &conf->bitmap_list); 4863 list_del_init(&conf->bitmap_list); 4864 while (!list_empty(&head)) { 4865 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4866 int hash; 4867 list_del_init(&sh->lru); 4868 atomic_inc(&sh->count); 4869 hash = sh->hash_lock_index; 4870 __release_stripe(conf, sh, &temp_inactive_list[hash]); 4871 } 4872 } 4873 4874 static int raid5_congested(struct mddev *mddev, int bits) 4875 { 4876 struct r5conf *conf = mddev->private; 4877 4878 /* No difference between reads and writes. Just check 4879 * how busy the stripe_cache is 4880 */ 4881 4882 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 4883 return 1; 4884 4885 /* Also checks whether there is pressure on r5cache log space */ 4886 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 4887 return 1; 4888 if (conf->quiesce) 4889 return 1; 4890 if (atomic_read(&conf->empty_inactive_list_nr)) 4891 return 1; 4892 4893 return 0; 4894 } 4895 4896 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 4897 { 4898 struct r5conf *conf = mddev->private; 4899 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 4900 unsigned int chunk_sectors; 4901 unsigned int bio_sectors = bio_sectors(bio); 4902 4903 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 4904 return chunk_sectors >= 4905 ((sector & (chunk_sectors - 1)) + bio_sectors); 4906 } 4907 4908 /* 4909 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 4910 * later sampled by raid5d. 
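 *
 * The list discipline, sketched (bi_next links the LIFO; illustrative
 * only, locking elided):
 *
 *	bi->bi_next = head;		(push from irq context, O(1))
 *	head = bi;
 *	...
 *	bi = head;			(pop in raid5d, also O(1))
 *	head = bi->bi_next;
 *	bi->bi_next = NULL;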
4911 */ 4912 static void add_bio_to_retry(struct bio *bi, struct r5conf *conf) 4913 { 4914 unsigned long flags; 4915 4916 spin_lock_irqsave(&conf->device_lock, flags); 4917 4918 bi->bi_next = conf->retry_read_aligned_list; 4919 conf->retry_read_aligned_list = bi; 4920 4921 spin_unlock_irqrestore(&conf->device_lock, flags); 4922 md_wakeup_thread(conf->mddev->thread); 4923 } 4924 4925 static struct bio *remove_bio_from_retry(struct r5conf *conf) 4926 { 4927 struct bio *bi; 4928 4929 bi = conf->retry_read_aligned; 4930 if (bi) { 4931 conf->retry_read_aligned = NULL; 4932 return bi; 4933 } 4934 bi = conf->retry_read_aligned_list; 4935 if (bi) { 4936 conf->retry_read_aligned_list = bi->bi_next; 4937 bi->bi_next = NULL; 4938 /* 4939 * this sets the active stripe count to 1 and the processed 4940 * stripe count to zero (upper 8 bits) 4941 */ 4942 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 4943 } 4944 4945 return bi; 4946 } 4947 4948 /* 4949 * The "raid5_align_endio" should check if the read succeeded and if it 4950 * did, call bio_endio on the original bio (having bio_put the new bio 4951 * first). 4952 * If the read failed, the original bio is queued for a retry instead. 4953 */ 4954 static void raid5_align_endio(struct bio *bi) 4955 { 4956 struct bio *raid_bi = bi->bi_private; 4957 struct mddev *mddev; 4958 struct r5conf *conf; 4959 struct md_rdev *rdev; 4960 int error = bi->bi_error; 4961 4962 bio_put(bi); 4963 4964 rdev = (void*)raid_bi->bi_next; 4965 raid_bi->bi_next = NULL; 4966 mddev = rdev->mddev; 4967 conf = mddev->private; 4968 4969 rdev_dec_pending(rdev, conf->mddev); 4970 4971 if (!error) { 4972 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 4973 raid_bi, 0); 4974 bio_endio(raid_bi); 4975 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4976 wake_up(&conf->wait_for_quiescent); 4977 return; 4978 } 4979 4980 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 4981 4982 add_bio_to_retry(raid_bi, conf); 4983 } 4984 4985 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 4986 { 4987 struct r5conf *conf = mddev->private; 4988 int dd_idx; 4989 struct bio *align_bi; 4990 struct md_rdev *rdev; 4991 sector_t end_sector; 4992 4993 if (!in_chunk_boundary(mddev, raid_bio)) { 4994 pr_debug("%s: non aligned\n", __func__); 4995 return 0; 4996 } 4997 /* 4998 * use bio_clone_mddev to make a copy of the bio 4999 */ 5000 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 5001 if (!align_bi) 5002 return 0; 5003 /* 5004 * set bi_end_io to a new function, and set bi_private to the 5005 * original bio.
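 *
 * For orientation (a sketch of the completion path defined above, not
 * new driver code), raid5_align_endio() unwinds the clone as:
 *
 *	raid_bi = bi->bi_private;	(recover the original bio)
 *	bio_put(bi);			(drop the clone)
 *	bio_endio(raid_bi);		(finish it on success)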
5006 */ 5007 align_bi->bi_end_io = raid5_align_endio; 5008 align_bi->bi_private = raid_bio; 5009 /* 5010 * compute position 5011 */ 5012 align_bi->bi_iter.bi_sector = 5013 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 5014 0, &dd_idx, NULL); 5015 5016 end_sector = bio_end_sector(align_bi); 5017 rcu_read_lock(); 5018 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 5019 if (!rdev || test_bit(Faulty, &rdev->flags) || 5020 rdev->recovery_offset < end_sector) { 5021 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 5022 if (rdev && 5023 (test_bit(Faulty, &rdev->flags) || 5024 !(test_bit(In_sync, &rdev->flags) || 5025 rdev->recovery_offset >= end_sector))) 5026 rdev = NULL; 5027 } 5028 if (rdev) { 5029 sector_t first_bad; 5030 int bad_sectors; 5031 5032 atomic_inc(&rdev->nr_pending); 5033 rcu_read_unlock(); 5034 raid_bio->bi_next = (void*)rdev; 5035 align_bi->bi_bdev = rdev->bdev; 5036 bio_clear_flag(align_bi, BIO_SEG_VALID); 5037 5038 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 5039 bio_sectors(align_bi), 5040 &first_bad, &bad_sectors)) { 5041 bio_put(align_bi); 5042 rdev_dec_pending(rdev, mddev); 5043 return 0; 5044 } 5045 5046 /* No reshape active, so we can trust rdev->data_offset */ 5047 align_bi->bi_iter.bi_sector += rdev->data_offset; 5048 5049 spin_lock_irq(&conf->device_lock); 5050 wait_event_lock_irq(conf->wait_for_quiescent, 5051 conf->quiesce == 0, 5052 conf->device_lock); 5053 atomic_inc(&conf->active_aligned_reads); 5054 spin_unlock_irq(&conf->device_lock); 5055 5056 if (mddev->gendisk) 5057 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 5058 align_bi, disk_devt(mddev->gendisk), 5059 raid_bio->bi_iter.bi_sector); 5060 generic_make_request(align_bi); 5061 return 1; 5062 } else { 5063 rcu_read_unlock(); 5064 bio_put(align_bi); 5065 return 0; 5066 } 5067 } 5068 5069 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5070 { 5071 struct bio *split; 5072 5073 do { 5074 sector_t sector = raid_bio->bi_iter.bi_sector; 5075 unsigned chunk_sects = mddev->chunk_sectors; 5076 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5077 5078 if (sectors < bio_sectors(raid_bio)) { 5079 split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); 5080 bio_chain(split, raid_bio); 5081 } else 5082 split = raid_bio; 5083 5084 if (!raid5_read_one_chunk(mddev, split)) { 5085 if (split != raid_bio) 5086 generic_make_request(raid_bio); 5087 return split; 5088 } 5089 } while (split != raid_bio); 5090 5091 return NULL; 5092 } 5093 5094 /* __get_priority_stripe - get the next stripe to process 5095 * 5096 * Full stripe writes are allowed to pass preread active stripes up until 5097 * the bypass_threshold is exceeded. In general the bypass_count 5098 * increments when the handle_list is handled before the hold_list; however, it 5099 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5100 * stripe with in flight i/o. The bypass_count will be reset when the 5101 * head of the hold_list has changed, i.e. the head was promoted to the 5102 * handle_list. 
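 *
 * Worked example (illustrative): with bypass_threshold == 8, the head of
 * the hold_list is only serviced once bypass_count has exceeded 8, or as
 * soon as no full-stripe writes are pending at all:
 *
 *	bool service_hold = (bypass_threshold &&
 *			     bypass_count > bypass_threshold) ||
 *			    atomic_read(&pending_full_writes) == 0;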
5103 */ 5104 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5105 { 5106 struct stripe_head *sh = NULL, *tmp; 5107 struct list_head *handle_list = NULL; 5108 struct r5worker_group *wg = NULL; 5109 5110 if (conf->worker_cnt_per_group == 0) { 5111 handle_list = &conf->handle_list; 5112 } else if (group != ANY_GROUP) { 5113 handle_list = &conf->worker_groups[group].handle_list; 5114 wg = &conf->worker_groups[group]; 5115 } else { 5116 int i; 5117 for (i = 0; i < conf->group_cnt; i++) { 5118 handle_list = &conf->worker_groups[i].handle_list; 5119 wg = &conf->worker_groups[i]; 5120 if (!list_empty(handle_list)) 5121 break; 5122 } 5123 } 5124 5125 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5126 __func__, 5127 list_empty(handle_list) ? "empty" : "busy", 5128 list_empty(&conf->hold_list) ? "empty" : "busy", 5129 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5130 5131 if (!list_empty(handle_list)) { 5132 sh = list_entry(handle_list->next, typeof(*sh), lru); 5133 5134 if (list_empty(&conf->hold_list)) 5135 conf->bypass_count = 0; 5136 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5137 if (conf->hold_list.next == conf->last_hold) 5138 conf->bypass_count++; 5139 else { 5140 conf->last_hold = conf->hold_list.next; 5141 conf->bypass_count -= conf->bypass_threshold; 5142 if (conf->bypass_count < 0) 5143 conf->bypass_count = 0; 5144 } 5145 } 5146 } else if (!list_empty(&conf->hold_list) && 5147 ((conf->bypass_threshold && 5148 conf->bypass_count > conf->bypass_threshold) || 5149 atomic_read(&conf->pending_full_writes) == 0)) { 5150 5151 list_for_each_entry(tmp, &conf->hold_list, lru) { 5152 if (conf->worker_cnt_per_group == 0 || 5153 group == ANY_GROUP || 5154 !cpu_online(tmp->cpu) || 5155 cpu_to_group(tmp->cpu) == group) { 5156 sh = tmp; 5157 break; 5158 } 5159 } 5160 5161 if (sh) { 5162 conf->bypass_count -= conf->bypass_threshold; 5163 if (conf->bypass_count < 0) 5164 conf->bypass_count = 0; 5165 } 5166 wg = NULL; 5167 } 5168 5169 if (!sh) 5170 return NULL; 5171 5172 if (wg) { 5173 wg->stripes_cnt--; 5174 sh->group = NULL; 5175 } 5176 list_del_init(&sh->lru); 5177 BUG_ON(atomic_inc_return(&sh->count) != 1); 5178 return sh; 5179 } 5180 5181 struct raid5_plug_cb { 5182 struct blk_plug_cb cb; 5183 struct list_head list; 5184 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5185 }; 5186 5187 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5188 { 5189 struct raid5_plug_cb *cb = container_of( 5190 blk_cb, struct raid5_plug_cb, cb); 5191 struct stripe_head *sh; 5192 struct mddev *mddev = cb->cb.data; 5193 struct r5conf *conf = mddev->private; 5194 int cnt = 0; 5195 int hash; 5196 5197 if (cb->list.next && !list_empty(&cb->list)) { 5198 spin_lock_irq(&conf->device_lock); 5199 while (!list_empty(&cb->list)) { 5200 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5201 list_del_init(&sh->lru); 5202 /* 5203 * avoid a race where release_stripe_plug() sees 5204 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5205 * is still in our list 5206 */ 5207 smp_mb__before_atomic(); 5208 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5209 /* 5210 * STRIPE_ON_RELEASE_LIST could be set here.
In that 5211 * case, the count is always > 1 here 5212 */ 5213 hash = sh->hash_lock_index; 5214 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5215 cnt++; 5216 } 5217 spin_unlock_irq(&conf->device_lock); 5218 } 5219 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5220 NR_STRIPE_HASH_LOCKS); 5221 if (mddev->queue) 5222 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5223 kfree(cb); 5224 } 5225 5226 static void release_stripe_plug(struct mddev *mddev, 5227 struct stripe_head *sh) 5228 { 5229 struct blk_plug_cb *blk_cb = blk_check_plugged( 5230 raid5_unplug, mddev, 5231 sizeof(struct raid5_plug_cb)); 5232 struct raid5_plug_cb *cb; 5233 5234 if (!blk_cb) { 5235 raid5_release_stripe(sh); 5236 return; 5237 } 5238 5239 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5240 5241 if (cb->list.next == NULL) { 5242 int i; 5243 INIT_LIST_HEAD(&cb->list); 5244 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5245 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5246 } 5247 5248 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5249 list_add_tail(&sh->lru, &cb->list); 5250 else 5251 raid5_release_stripe(sh); 5252 } 5253 5254 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5255 { 5256 struct r5conf *conf = mddev->private; 5257 sector_t logical_sector, last_sector; 5258 struct stripe_head *sh; 5259 int remaining; 5260 int stripe_sectors; 5261 5262 if (mddev->reshape_position != MaxSector) 5263 /* Skip discard while reshape is happening */ 5264 return; 5265 5266 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5267 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5268 5269 bi->bi_next = NULL; 5270 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5271 5272 stripe_sectors = conf->chunk_sectors * 5273 (conf->raid_disks - conf->max_degraded); 5274 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5275 stripe_sectors); 5276 sector_div(last_sector, stripe_sectors); 5277 5278 logical_sector *= conf->chunk_sectors; 5279 last_sector *= conf->chunk_sectors; 5280 5281 for (; logical_sector < last_sector; 5282 logical_sector += STRIPE_SECTORS) { 5283 DEFINE_WAIT(w); 5284 int d; 5285 again: 5286 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5287 prepare_to_wait(&conf->wait_for_overlap, &w, 5288 TASK_UNINTERRUPTIBLE); 5289 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5290 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5291 raid5_release_stripe(sh); 5292 schedule(); 5293 goto again; 5294 } 5295 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5296 spin_lock_irq(&sh->stripe_lock); 5297 for (d = 0; d < conf->raid_disks; d++) { 5298 if (d == sh->pd_idx || d == sh->qd_idx) 5299 continue; 5300 if (sh->dev[d].towrite || sh->dev[d].toread) { 5301 set_bit(R5_Overlap, &sh->dev[d].flags); 5302 spin_unlock_irq(&sh->stripe_lock); 5303 raid5_release_stripe(sh); 5304 schedule(); 5305 goto again; 5306 } 5307 } 5308 set_bit(STRIPE_DISCARD, &sh->state); 5309 finish_wait(&conf->wait_for_overlap, &w); 5310 sh->overwrite_disks = 0; 5311 for (d = 0; d < conf->raid_disks; d++) { 5312 if (d == sh->pd_idx || d == sh->qd_idx) 5313 continue; 5314 sh->dev[d].towrite = bi; 5315 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5316 raid5_inc_bi_active_stripes(bi); 5317 sh->overwrite_disks++; 5318 } 5319 spin_unlock_irq(&sh->stripe_lock); 5320 if (conf->mddev->bitmap) { 5321 for (d = 0; 5322 d < conf->raid_disks - conf->max_degraded; 5323 d++) 5324 bitmap_startwrite(mddev->bitmap, 5325 sh->sector, 5326 STRIPE_SECTORS, 5327 0); 
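			/*
			 * Illustrative note, not driver logic: if seq_flush
			 * is currently 7, the assignment below stamps this
			 * stripe into bitmap batch 8; its writes then stay
			 * delayed (STRIPE_BIT_DELAY) until that batch of
			 * bitmap updates has reached disk.
			 */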
5328 sh->bm_seq = conf->seq_flush + 1; 5329 set_bit(STRIPE_BIT_DELAY, &sh->state); 5330 } 5331 5332 set_bit(STRIPE_HANDLE, &sh->state); 5333 clear_bit(STRIPE_DELAYED, &sh->state); 5334 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5335 atomic_inc(&conf->preread_active_stripes); 5336 release_stripe_plug(mddev, sh); 5337 } 5338 5339 remaining = raid5_dec_bi_active_stripes(bi); 5340 if (remaining == 0) { 5341 md_write_end(mddev); 5342 bio_endio(bi); 5343 } 5344 } 5345 5346 static void raid5_make_request(struct mddev *mddev, struct bio * bi) 5347 { 5348 struct r5conf *conf = mddev->private; 5349 int dd_idx; 5350 sector_t new_sector; 5351 sector_t logical_sector, last_sector; 5352 struct stripe_head *sh; 5353 const int rw = bio_data_dir(bi); 5354 int remaining; 5355 DEFINE_WAIT(w); 5356 bool do_prepare; 5357 bool do_flush = false; 5358 5359 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5360 int ret = r5l_handle_flush_request(conf->log, bi); 5361 5362 if (ret == 0) 5363 return; 5364 if (ret == -ENODEV) { 5365 md_flush_request(mddev, bi); 5366 return; 5367 } 5368 /* ret == -EAGAIN, fallback */ 5369 /* 5370 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5371 * we need to flush journal device 5372 */ 5373 do_flush = bi->bi_opf & REQ_PREFLUSH; 5374 } 5375 5376 md_write_start(mddev, bi); 5377 5378 /* 5379 * If array is degraded, better not do chunk aligned read because 5380 * later we might have to read it again in order to reconstruct 5381 * data on failed drives. 5382 */ 5383 if (rw == READ && mddev->degraded == 0 && 5384 !r5c_is_writeback(conf->log) && 5385 mddev->reshape_position == MaxSector) { 5386 bi = chunk_aligned_read(mddev, bi); 5387 if (!bi) 5388 return; 5389 } 5390 5391 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5392 make_discard_request(mddev, bi); 5393 return; 5394 } 5395 5396 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5397 last_sector = bio_end_sector(bi); 5398 bi->bi_next = NULL; 5399 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5400 5401 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5402 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5403 int previous; 5404 int seq; 5405 5406 do_prepare = false; 5407 retry: 5408 seq = read_seqcount_begin(&conf->gen_lock); 5409 previous = 0; 5410 if (do_prepare) 5411 prepare_to_wait(&conf->wait_for_overlap, &w, 5412 TASK_UNINTERRUPTIBLE); 5413 if (unlikely(conf->reshape_progress != MaxSector)) { 5414 /* spinlock is needed as reshape_progress may be 5415 * 64bit on a 32bit platform, and so it might be 5416 * possible to see a half-updated value 5417 * Of course reshape_progress could change after 5418 * the lock is dropped, so once we get a reference 5419 * to the stripe that we think it is, we will have 5420 * to check again. 5421 */ 5422 spin_lock_irq(&conf->device_lock); 5423 if (mddev->reshape_backwards 5424 ? logical_sector < conf->reshape_progress 5425 : logical_sector >= conf->reshape_progress) { 5426 previous = 1; 5427 } else { 5428 if (mddev->reshape_backwards 5429 ? 
logical_sector < conf->reshape_safe 5430 : logical_sector >= conf->reshape_safe) { 5431 spin_unlock_irq(&conf->device_lock); 5432 schedule(); 5433 do_prepare = true; 5434 goto retry; 5435 } 5436 } 5437 spin_unlock_irq(&conf->device_lock); 5438 } 5439 5440 new_sector = raid5_compute_sector(conf, logical_sector, 5441 previous, 5442 &dd_idx, NULL); 5443 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5444 (unsigned long long)new_sector, 5445 (unsigned long long)logical_sector); 5446 5447 sh = raid5_get_active_stripe(conf, new_sector, previous, 5448 (bi->bi_opf & REQ_RAHEAD), 0); 5449 if (sh) { 5450 if (unlikely(previous)) { 5451 /* expansion might have moved on while waiting for a 5452 * stripe, so we must do the range check again. 5453 * Expansion could still move past after this 5454 * test, but as we are holding a reference to 5455 * 'sh', we know that if that happens, 5456 * STRIPE_EXPANDING will get set and the expansion 5457 * won't proceed until we finish with the stripe. 5458 */ 5459 int must_retry = 0; 5460 spin_lock_irq(&conf->device_lock); 5461 if (mddev->reshape_backwards 5462 ? logical_sector >= conf->reshape_progress 5463 : logical_sector < conf->reshape_progress) 5464 /* mismatch, need to try again */ 5465 must_retry = 1; 5466 spin_unlock_irq(&conf->device_lock); 5467 if (must_retry) { 5468 raid5_release_stripe(sh); 5469 schedule(); 5470 do_prepare = true; 5471 goto retry; 5472 } 5473 } 5474 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5475 /* Might have got the wrong stripe_head 5476 * by accident 5477 */ 5478 raid5_release_stripe(sh); 5479 goto retry; 5480 } 5481 5482 if (rw == WRITE && 5483 logical_sector >= mddev->suspend_lo && 5484 logical_sector < mddev->suspend_hi) { 5485 raid5_release_stripe(sh); 5486 /* As the suspend_* range is controlled by 5487 * userspace, we want an interruptible 5488 * wait. 5489 */ 5490 flush_signals(current); 5491 prepare_to_wait(&conf->wait_for_overlap, 5492 &w, TASK_INTERRUPTIBLE); 5493 if (logical_sector >= mddev->suspend_lo && 5494 logical_sector < mddev->suspend_hi) { 5495 schedule(); 5496 do_prepare = true; 5497 } 5498 goto retry; 5499 } 5500 5501 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5502 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5503 /* Stripe is busy expanding or 5504 * add failed due to overlap. 
Flush everything 5505 * and wait a while 5506 */ 5507 md_wakeup_thread(mddev->thread); 5508 raid5_release_stripe(sh); 5509 schedule(); 5510 do_prepare = true; 5511 goto retry; 5512 } 5513 if (do_flush) { 5514 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5515 /* we only need flush for one stripe */ 5516 do_flush = false; 5517 } 5518 5519 set_bit(STRIPE_HANDLE, &sh->state); 5520 clear_bit(STRIPE_DELAYED, &sh->state); 5521 if ((!sh->batch_head || sh == sh->batch_head) && 5522 (bi->bi_opf & REQ_SYNC) && 5523 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5524 atomic_inc(&conf->preread_active_stripes); 5525 release_stripe_plug(mddev, sh); 5526 } else { 5527 /* cannot get stripe for read-ahead, just give-up */ 5528 bi->bi_error = -EIO; 5529 break; 5530 } 5531 } 5532 finish_wait(&conf->wait_for_overlap, &w); 5533 5534 remaining = raid5_dec_bi_active_stripes(bi); 5535 if (remaining == 0) { 5536 5537 if ( rw == WRITE ) 5538 md_write_end(mddev); 5539 5540 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 5541 bi, 0); 5542 bio_endio(bi); 5543 } 5544 } 5545 5546 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5547 5548 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5549 { 5550 /* reshaping is quite different to recovery/resync so it is 5551 * handled quite separately ... here. 5552 * 5553 * On each call to sync_request, we gather one chunk worth of 5554 * destination stripes and flag them as expanding. 5555 * Then we find all the source stripes and request reads. 5556 * As the reads complete, handle_stripe will copy the data 5557 * into the destination stripe and release that stripe. 5558 */ 5559 struct r5conf *conf = mddev->private; 5560 struct stripe_head *sh; 5561 sector_t first_sector, last_sector; 5562 int raid_disks = conf->previous_raid_disks; 5563 int data_disks = raid_disks - conf->max_degraded; 5564 int new_data_disks = conf->raid_disks - conf->max_degraded; 5565 int i; 5566 int dd_idx; 5567 sector_t writepos, readpos, safepos; 5568 sector_t stripe_addr; 5569 int reshape_sectors; 5570 struct list_head stripes; 5571 sector_t retn; 5572 5573 if (sector_nr == 0) { 5574 /* If restarting in the middle, skip the initial sectors */ 5575 if (mddev->reshape_backwards && 5576 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5577 sector_nr = raid5_size(mddev, 0, 0) 5578 - conf->reshape_progress; 5579 } else if (mddev->reshape_backwards && 5580 conf->reshape_progress == MaxSector) { 5581 /* shouldn't happen, but just in case, finish up.*/ 5582 sector_nr = MaxSector; 5583 } else if (!mddev->reshape_backwards && 5584 conf->reshape_progress > 0) 5585 sector_nr = conf->reshape_progress; 5586 sector_div(sector_nr, new_data_disks); 5587 if (sector_nr) { 5588 mddev->curr_resync_completed = sector_nr; 5589 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5590 *skipped = 1; 5591 retn = sector_nr; 5592 goto finish; 5593 } 5594 } 5595 5596 /* We need to process a full chunk at a time. 5597 * If old and new chunk sizes differ, we need to process the 5598 * largest of these 5599 */ 5600 5601 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5602 5603 /* We update the metadata at least every 10 seconds, or when 5604 * the data about to be copied would over-write the source of 5605 * the data at the front of the range. i.e. 
one new_stripe 5606 * along from reshape_progress, mapped through the new layout, 5607 * lands after where reshape_safe maps through the old layout. 5608 */ 5609 writepos = conf->reshape_progress; 5610 sector_div(writepos, new_data_disks); 5611 readpos = conf->reshape_progress; 5612 sector_div(readpos, data_disks); 5613 safepos = conf->reshape_safe; 5614 sector_div(safepos, data_disks); 5615 if (mddev->reshape_backwards) { 5616 BUG_ON(writepos < reshape_sectors); 5617 writepos -= reshape_sectors; 5618 readpos += reshape_sectors; 5619 safepos += reshape_sectors; 5620 } else { 5621 writepos += reshape_sectors; 5622 /* readpos and safepos are worst-case calculations. 5623 * A negative number is overly pessimistic, and causes 5624 * obvious problems for unsigned storage. So clip to 0. 5625 */ 5626 readpos -= min_t(sector_t, reshape_sectors, readpos); 5627 safepos -= min_t(sector_t, reshape_sectors, safepos); 5628 } 5629 5630 /* Having calculated 'writepos', use it 5631 * to set 'stripe_addr', which is where we will write to. 5632 */ 5633 if (mddev->reshape_backwards) { 5634 BUG_ON(conf->reshape_progress == 0); 5635 stripe_addr = writepos; 5636 BUG_ON((mddev->dev_sectors & 5637 ~((sector_t)reshape_sectors - 1)) 5638 - reshape_sectors - stripe_addr 5639 != sector_nr); 5640 } else { 5641 BUG_ON(writepos != sector_nr + reshape_sectors); 5642 stripe_addr = sector_nr; 5643 } 5644 5645 /* 'writepos' is the most advanced device address we might write. 5646 * 'readpos' is the least advanced device address we might read. 5647 * 'safepos' is the least address recorded in the metadata as having 5648 * been reshaped. 5649 * If there is a min_offset_diff, these are adjusted either by 5650 * increasing the safepos/readpos if diff is negative, or 5651 * increasing writepos if diff is positive. 5652 * If 'readpos' is then behind 'writepos', there is no way that we can 5653 * ensure safety in the face of a crash - that must be done by userspace 5654 * making a backup of the data. So in that case there is no particular 5655 * rush to update metadata. 5656 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5657 * update the metadata to advance 'safepos' to match 'readpos' so that 5658 * we can be safe in the event of a crash. 5659 * So we insist on updating metadata if safepos is behind writepos and 5660 * readpos is beyond writepos. 5661 * In any case, update the metadata every 10 seconds. 5662 * Maybe that number should be configurable, but I'm not sure it is 5663 * worth it.... maybe it could be a multiple of safemode_delay??? 5664 */ 5665 if (conf->min_offset_diff < 0) { 5666 safepos += -conf->min_offset_diff; 5667 readpos += -conf->min_offset_diff; 5668 } else 5669 writepos += conf->min_offset_diff; 5670 5671 if ((mddev->reshape_backwards 5672 ? (safepos > writepos && readpos < writepos) 5673 : (safepos < writepos && readpos > writepos)) || 5674 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5675 /* Cannot proceed until we've updated the superblock... 
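* For instance (illustrative numbers only, ignoring min_offset_diff): growing 3 -> 4 data disks with reshape_sectors == 128 and reshape_progress == 12288 gives writepos = 12288/4 + 128 = 3200 and readpos = 12288/3 - 128 = 3968; if the metadata still records reshape_safe == 9216 then safepos = 9216/3 - 128 = 2944, so safepos < writepos && readpos > writepos and we must checkpoint here.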
*/ 5676 wait_event(conf->wait_for_overlap, 5677 atomic_read(&conf->reshape_stripes)==0 5678 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5679 if (atomic_read(&conf->reshape_stripes) != 0) 5680 return 0; 5681 mddev->reshape_position = conf->reshape_progress; 5682 mddev->curr_resync_completed = sector_nr; 5683 conf->reshape_checkpoint = jiffies; 5684 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5685 md_wakeup_thread(mddev->thread); 5686 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5687 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5688 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5689 return 0; 5690 spin_lock_irq(&conf->device_lock); 5691 conf->reshape_safe = mddev->reshape_position; 5692 spin_unlock_irq(&conf->device_lock); 5693 wake_up(&conf->wait_for_overlap); 5694 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5695 } 5696 5697 INIT_LIST_HEAD(&stripes); 5698 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5699 int j; 5700 int skipped_disk = 0; 5701 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5702 set_bit(STRIPE_EXPANDING, &sh->state); 5703 atomic_inc(&conf->reshape_stripes); 5704 /* If any of this stripe is beyond the end of the old 5705 * array, then we need to zero those blocks 5706 */ 5707 for (j=sh->disks; j--;) { 5708 sector_t s; 5709 if (j == sh->pd_idx) 5710 continue; 5711 if (conf->level == 6 && 5712 j == sh->qd_idx) 5713 continue; 5714 s = raid5_compute_blocknr(sh, j, 0); 5715 if (s < raid5_size(mddev, 0, 0)) { 5716 skipped_disk = 1; 5717 continue; 5718 } 5719 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5720 set_bit(R5_Expanded, &sh->dev[j].flags); 5721 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5722 } 5723 if (!skipped_disk) { 5724 set_bit(STRIPE_EXPAND_READY, &sh->state); 5725 set_bit(STRIPE_HANDLE, &sh->state); 5726 } 5727 list_add(&sh->lru, &stripes); 5728 } 5729 spin_lock_irq(&conf->device_lock); 5730 if (mddev->reshape_backwards) 5731 conf->reshape_progress -= reshape_sectors * new_data_disks; 5732 else 5733 conf->reshape_progress += reshape_sectors * new_data_disks; 5734 spin_unlock_irq(&conf->device_lock); 5735 /* Ok, those stripes are ready. We can start scheduling 5736 * reads on the source stripes. 5737 * The source stripes are determined by mapping the first and last 5738 * block on the destination stripes. 5739 */ 5740 first_sector = 5741 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5742 1, &dd_idx, NULL); 5743 last_sector = 5744 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5745 * new_data_disks - 1), 5746 1, &dd_idx, NULL); 5747 if (last_sector >= mddev->dev_sectors) 5748 last_sector = mddev->dev_sectors - 1; 5749 while (first_sector <= last_sector) { 5750 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5751 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5752 set_bit(STRIPE_HANDLE, &sh->state); 5753 raid5_release_stripe(sh); 5754 first_sector += STRIPE_SECTORS; 5755 } 5756 /* Now that the sources are clearly marked, we can release 5757 * the destination stripes 5758 */ 5759 while (!list_empty(&stripes)) { 5760 sh = list_entry(stripes.next, struct stripe_head, lru); 5761 list_del_init(&sh->lru); 5762 raid5_release_stripe(sh); 5763 } 5764 /* If this takes us to the resync_max point where we have to pause, 5765 * then we need to write out the superblock. 
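* (Illustrative: with curr_resync_completed == 1000 and resync_max == 2000, reaching sector_nr == 1600 gives (1600-1000)*2 >= 1000, i.e. more than half of the remaining window is covered, so the checkpoint below is taken before md_do_sync can pause us.)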
5766 */ 5767 sector_nr += reshape_sectors; 5768 retn = reshape_sectors; 5769 finish: 5770 if (mddev->curr_resync_completed > mddev->resync_max || 5771 (sector_nr - mddev->curr_resync_completed) * 2 5772 >= mddev->resync_max - mddev->curr_resync_completed) { 5773 /* Cannot proceed until we've updated the superblock... */ 5774 wait_event(conf->wait_for_overlap, 5775 atomic_read(&conf->reshape_stripes) == 0 5776 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5777 if (atomic_read(&conf->reshape_stripes) != 0) 5778 goto ret; 5779 mddev->reshape_position = conf->reshape_progress; 5780 mddev->curr_resync_completed = sector_nr; 5781 conf->reshape_checkpoint = jiffies; 5782 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5783 md_wakeup_thread(mddev->thread); 5784 wait_event(mddev->sb_wait, 5785 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) 5786 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5787 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5788 goto ret; 5789 spin_lock_irq(&conf->device_lock); 5790 conf->reshape_safe = mddev->reshape_position; 5791 spin_unlock_irq(&conf->device_lock); 5792 wake_up(&conf->wait_for_overlap); 5793 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5794 } 5795 ret: 5796 return retn; 5797 } 5798 5799 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr, 5800 int *skipped) 5801 { 5802 struct r5conf *conf = mddev->private; 5803 struct stripe_head *sh; 5804 sector_t max_sector = mddev->dev_sectors; 5805 sector_t sync_blocks; 5806 int still_degraded = 0; 5807 int i; 5808 5809 if (sector_nr >= max_sector) { 5810 /* just being told to finish up .. nothing much to do */ 5811 5812 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 5813 end_reshape(conf); 5814 return 0; 5815 } 5816 5817 if (mddev->curr_resync < max_sector) /* aborted */ 5818 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 5819 &sync_blocks, 1); 5820 else /* completed sync */ 5821 conf->fullsync = 0; 5822 bitmap_close_sync(mddev->bitmap); 5823 5824 return 0; 5825 } 5826 5827 /* Allow raid5_quiesce to complete */ 5828 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 5829 5830 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5831 return reshape_request(mddev, sector_nr, skipped); 5832 5833 /* No need to check resync_max as we never do more than one 5834 * stripe, and as resync_max will always be on a chunk boundary, 5835 * if the check in md_do_sync didn't fire, there is no chance 5836 * of overstepping resync_max here 5837 */ 5838 5839 /* if there are too many failed drives and we are trying 5840 * to resync, then assert that we are finished, because there is 5841 * nothing we can do. 
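* (e.g. a raid5 that has lost one device: with no redundancy left a parity resync cannot repair anything, so the remaining range is simply reported as done.)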
5842 */ 5843 if (mddev->degraded >= conf->max_degraded && 5844 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5845 sector_t rv = mddev->dev_sectors - sector_nr; 5846 *skipped = 1; 5847 return rv; 5848 } 5849 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 5850 !conf->fullsync && 5851 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 5852 sync_blocks >= STRIPE_SECTORS) { 5853 /* we can skip this block, and probably more */ 5854 sync_blocks /= STRIPE_SECTORS; 5855 *skipped = 1; 5856 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 5857 } 5858 5859 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 5860 5861 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); 5862 if (sh == NULL) { 5863 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); 5864 /* make sure we don't swamp the stripe cache if someone else 5865 * is trying to get access 5866 */ 5867 schedule_timeout_uninterruptible(1); 5868 } 5869 /* Need to check if array will still be degraded after recovery/resync 5870 * Note in case of > 1 drive failures it's possible we're rebuilding 5871 * one drive while leaving another faulty drive in the array. 5872 */ 5873 rcu_read_lock(); 5874 for (i = 0; i < conf->raid_disks; i++) { 5875 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); 5876 5877 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 5878 still_degraded = 1; 5879 } 5880 rcu_read_unlock(); 5881 5882 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 5883 5884 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 5885 set_bit(STRIPE_HANDLE, &sh->state); 5886 5887 raid5_release_stripe(sh); 5888 5889 return STRIPE_SECTORS; 5890 } 5891 5892 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) 5893 { 5894 /* We may not be able to submit a whole bio at once as there 5895 * may not be enough stripe_heads available. 5896 * We cannot pre-allocate enough stripe_heads as we may need 5897 * more than exist in the cache (if we allow ever larger chunks). 5898 * So we do one stripe head at a time and record in 5899 * ->bi_hw_segments how many have been done. 5900 * 5901 * We *know* that this entire raid_bio is in one chunk, so 5902 * there will be only one 'dd_idx' and we need only one call to raid5_compute_sector. 
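* (Illustrative: with 64KiB chunks and 4KiB stripe pages - STRIPE_SECTORS == 8 - a fully aligned chunk-sized read spans 16 stripe_heads, all sharing one dd_idx; scnt counts how many of them were already handled on a previous attempt.)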
5903 */ 5904 struct stripe_head *sh; 5905 int dd_idx; 5906 sector_t sector, logical_sector, last_sector; 5907 int scnt = 0; 5908 int remaining; 5909 int handled = 0; 5910 5911 logical_sector = raid_bio->bi_iter.bi_sector & 5912 ~((sector_t)STRIPE_SECTORS-1); 5913 sector = raid5_compute_sector(conf, logical_sector, 5914 0, &dd_idx, NULL); 5915 last_sector = bio_end_sector(raid_bio); 5916 5917 for (; logical_sector < last_sector; 5918 logical_sector += STRIPE_SECTORS, 5919 sector += STRIPE_SECTORS, 5920 scnt++) { 5921 5922 if (scnt < raid5_bi_processed_stripes(raid_bio)) 5923 /* already done this stripe */ 5924 continue; 5925 5926 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 5927 5928 if (!sh) { 5929 /* failed to get a stripe - must wait */ 5930 raid5_set_bi_processed_stripes(raid_bio, scnt); 5931 conf->retry_read_aligned = raid_bio; 5932 return handled; 5933 } 5934 5935 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 5936 raid5_release_stripe(sh); 5937 raid5_set_bi_processed_stripes(raid_bio, scnt); 5938 conf->retry_read_aligned = raid_bio; 5939 return handled; 5940 } 5941 5942 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 5943 handle_stripe(sh); 5944 raid5_release_stripe(sh); 5945 handled++; 5946 } 5947 remaining = raid5_dec_bi_active_stripes(raid_bio); 5948 if (remaining == 0) { 5949 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 5950 raid_bio, 0); 5951 bio_endio(raid_bio); 5952 } 5953 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5954 wake_up(&conf->wait_for_quiescent); 5955 return handled; 5956 } 5957 5958 static int handle_active_stripes(struct r5conf *conf, int group, 5959 struct r5worker *worker, 5960 struct list_head *temp_inactive_list) 5961 { 5962 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5963 int i, batch_size = 0, hash; 5964 bool release_inactive = false; 5965 5966 while (batch_size < MAX_STRIPE_BATCH && 5967 (sh = __get_priority_stripe(conf, group)) != NULL) 5968 batch[batch_size++] = sh; 5969 5970 if (batch_size == 0) { 5971 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5972 if (!list_empty(temp_inactive_list + i)) 5973 break; 5974 if (i == NR_STRIPE_HASH_LOCKS) { 5975 spin_unlock_irq(&conf->device_lock); 5976 r5l_flush_stripe_to_raid(conf->log); 5977 spin_lock_irq(&conf->device_lock); 5978 return batch_size; 5979 } 5980 release_inactive = true; 5981 } 5982 spin_unlock_irq(&conf->device_lock); 5983 5984 release_inactive_stripe_list(conf, temp_inactive_list, 5985 NR_STRIPE_HASH_LOCKS); 5986 5987 r5l_flush_stripe_to_raid(conf->log); 5988 if (release_inactive) { 5989 spin_lock_irq(&conf->device_lock); 5990 return 0; 5991 } 5992 5993 for (i = 0; i < batch_size; i++) 5994 handle_stripe(batch[i]); 5995 r5l_write_stripe_run(conf->log); 5996 5997 cond_resched(); 5998 5999 spin_lock_irq(&conf->device_lock); 6000 for (i = 0; i < batch_size; i++) { 6001 hash = batch[i]->hash_lock_index; 6002 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 6003 } 6004 return batch_size; 6005 } 6006 6007 static void raid5_do_work(struct work_struct *work) 6008 { 6009 struct r5worker *worker = container_of(work, struct r5worker, work); 6010 struct r5worker_group *group = worker->group; 6011 struct r5conf *conf = group->conf; 6012 int group_id = group - conf->worker_groups; 6013 int handled; 6014 struct blk_plug plug; 6015 6016 pr_debug("+++ raid5worker active\n"); 6017 6018 blk_start_plug(&plug); 6019 handled = 0; 6020 spin_lock_irq(&conf->device_lock); 6021 while (1) { 6022 int batch_size, released; 6023 6024 released = release_stripe_list(conf, 
worker->temp_inactive_list); 6025 6026 batch_size = handle_active_stripes(conf, group_id, worker, 6027 worker->temp_inactive_list); 6028 worker->working = false; 6029 if (!batch_size && !released) 6030 break; 6031 handled += batch_size; 6032 } 6033 pr_debug("%d stripes handled\n", handled); 6034 6035 spin_unlock_irq(&conf->device_lock); 6036 blk_finish_plug(&plug); 6037 6038 pr_debug("--- raid5worker inactive\n"); 6039 } 6040 6041 /* 6042 * This is our raid5 kernel thread. 6043 * 6044 * We scan the hash table for stripes which can be handled now. 6045 * During the scan, completed stripes are saved for us by the interrupt 6046 * handler, so that they will not have to wait for our next wakeup. 6047 */ 6048 static void raid5d(struct md_thread *thread) 6049 { 6050 struct mddev *mddev = thread->mddev; 6051 struct r5conf *conf = mddev->private; 6052 int handled; 6053 struct blk_plug plug; 6054 6055 pr_debug("+++ raid5d active\n"); 6056 6057 md_check_recovery(mddev); 6058 6059 if (!bio_list_empty(&conf->return_bi) && 6060 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 6061 struct bio_list tmp = BIO_EMPTY_LIST; 6062 spin_lock_irq(&conf->device_lock); 6063 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 6064 bio_list_merge(&tmp, &conf->return_bi); 6065 bio_list_init(&conf->return_bi); 6066 } 6067 spin_unlock_irq(&conf->device_lock); 6068 return_io(&tmp); 6069 } 6070 6071 blk_start_plug(&plug); 6072 handled = 0; 6073 spin_lock_irq(&conf->device_lock); 6074 while (1) { 6075 struct bio *bio; 6076 int batch_size, released; 6077 6078 released = release_stripe_list(conf, conf->temp_inactive_list); 6079 if (released) 6080 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6081 6082 if ( 6083 !list_empty(&conf->bitmap_list)) { 6084 /* Now is a good time to flush some bitmap updates */ 6085 conf->seq_flush++; 6086 spin_unlock_irq(&conf->device_lock); 6087 bitmap_unplug(mddev->bitmap); 6088 spin_lock_irq(&conf->device_lock); 6089 conf->seq_write = conf->seq_flush; 6090 activate_bit_delay(conf, conf->temp_inactive_list); 6091 } 6092 raid5_activate_delayed(conf); 6093 6094 while ((bio = remove_bio_from_retry(conf))) { 6095 int ok; 6096 spin_unlock_irq(&conf->device_lock); 6097 ok = retry_aligned_read(conf, bio); 6098 spin_lock_irq(&conf->device_lock); 6099 if (!ok) 6100 break; 6101 handled++; 6102 } 6103 6104 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6105 conf->temp_inactive_list); 6106 if (!batch_size && !released) 6107 break; 6108 handled += batch_size; 6109 6110 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6111 spin_unlock_irq(&conf->device_lock); 6112 md_check_recovery(mddev); 6113 spin_lock_irq(&conf->device_lock); 6114 } 6115 } 6116 pr_debug("%d stripes handled\n", handled); 6117 6118 spin_unlock_irq(&conf->device_lock); 6119 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6120 mutex_trylock(&conf->cache_size_mutex)) { 6121 grow_one_stripe(conf, __GFP_NOWARN); 6122 /* Set flag even if allocation failed. 
This helps 6123 * slow down allocation requests when mem is short 6124 */ 6125 set_bit(R5_DID_ALLOC, &conf->cache_state); 6126 mutex_unlock(&conf->cache_size_mutex); 6127 } 6128 6129 r5l_flush_stripe_to_raid(conf->log); 6130 6131 async_tx_issue_pending_all(); 6132 blk_finish_plug(&plug); 6133 6134 pr_debug("--- raid5d inactive\n"); 6135 } 6136 6137 static ssize_t 6138 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6139 { 6140 struct r5conf *conf; 6141 int ret = 0; 6142 spin_lock(&mddev->lock); 6143 conf = mddev->private; 6144 if (conf) 6145 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6146 spin_unlock(&mddev->lock); 6147 return ret; 6148 } 6149 6150 int 6151 raid5_set_cache_size(struct mddev *mddev, int size) 6152 { 6153 struct r5conf *conf = mddev->private; 6154 int err; 6155 6156 if (size <= 16 || size > 32768) 6157 return -EINVAL; 6158 6159 conf->min_nr_stripes = size; 6160 mutex_lock(&conf->cache_size_mutex); 6161 while (size < conf->max_nr_stripes && 6162 drop_one_stripe(conf)) 6163 ; 6164 mutex_unlock(&conf->cache_size_mutex); 6165 6166 6167 err = md_allow_write(mddev); 6168 if (err) 6169 return err; 6170 6171 mutex_lock(&conf->cache_size_mutex); 6172 while (size > conf->max_nr_stripes) 6173 if (!grow_one_stripe(conf, GFP_KERNEL)) 6174 break; 6175 mutex_unlock(&conf->cache_size_mutex); 6176 6177 return 0; 6178 } 6179 EXPORT_SYMBOL(raid5_set_cache_size); 6180 6181 static ssize_t 6182 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6183 { 6184 struct r5conf *conf; 6185 unsigned long new; 6186 int err; 6187 6188 if (len >= PAGE_SIZE) 6189 return -EINVAL; 6190 if (kstrtoul(page, 10, &new)) 6191 return -EINVAL; 6192 err = mddev_lock(mddev); 6193 if (err) 6194 return err; 6195 conf = mddev->private; 6196 if (!conf) 6197 err = -ENODEV; 6198 else 6199 err = raid5_set_cache_size(mddev, new); 6200 mddev_unlock(mddev); 6201 6202 return err ?: len; 6203 } 6204 6205 static struct md_sysfs_entry 6206 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6207 raid5_show_stripe_cache_size, 6208 raid5_store_stripe_cache_size); 6209 6210 static ssize_t 6211 raid5_show_rmw_level(struct mddev *mddev, char *page) 6212 { 6213 struct r5conf *conf = mddev->private; 6214 if (conf) 6215 return sprintf(page, "%d\n", conf->rmw_level); 6216 else 6217 return 0; 6218 } 6219 6220 static ssize_t 6221 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6222 { 6223 struct r5conf *conf = mddev->private; 6224 unsigned long new; 6225 6226 if (!conf) 6227 return -ENODEV; 6228 6229 if (len >= PAGE_SIZE) 6230 return -EINVAL; 6231 6232 if (kstrtoul(page, 10, &new)) 6233 return -EINVAL; 6234 6235 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6236 return -EINVAL; 6237 6238 if (new != PARITY_DISABLE_RMW && 6239 new != PARITY_ENABLE_RMW && 6240 new != PARITY_PREFER_RMW) 6241 return -EINVAL; 6242 6243 conf->rmw_level = new; 6244 return len; 6245 } 6246 6247 static struct md_sysfs_entry 6248 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6249 raid5_show_rmw_level, 6250 raid5_store_rmw_level); 6251 6252 6253 static ssize_t 6254 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6255 { 6256 struct r5conf *conf; 6257 int ret = 0; 6258 spin_lock(&mddev->lock); 6259 conf = mddev->private; 6260 if (conf) 6261 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6262 spin_unlock(&mddev->lock); 6263 return ret; 6264 } 6265 6266 static ssize_t 6267 raid5_store_preread_threshold(struct mddev *mddev, const char *page, 
size_t len) 6268 { 6269 struct r5conf *conf; 6270 unsigned long new; 6271 int err; 6272 6273 if (len >= PAGE_SIZE) 6274 return -EINVAL; 6275 if (kstrtoul(page, 10, &new)) 6276 return -EINVAL; 6277 6278 err = mddev_lock(mddev); 6279 if (err) 6280 return err; 6281 conf = mddev->private; 6282 if (!conf) 6283 err = -ENODEV; 6284 else if (new > conf->min_nr_stripes) 6285 err = -EINVAL; 6286 else 6287 conf->bypass_threshold = new; 6288 mddev_unlock(mddev); 6289 return err ?: len; 6290 } 6291 6292 static struct md_sysfs_entry 6293 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6294 S_IRUGO | S_IWUSR, 6295 raid5_show_preread_threshold, 6296 raid5_store_preread_threshold); 6297 6298 static ssize_t 6299 raid5_show_skip_copy(struct mddev *mddev, char *page) 6300 { 6301 struct r5conf *conf; 6302 int ret = 0; 6303 spin_lock(&mddev->lock); 6304 conf = mddev->private; 6305 if (conf) 6306 ret = sprintf(page, "%d\n", conf->skip_copy); 6307 spin_unlock(&mddev->lock); 6308 return ret; 6309 } 6310 6311 static ssize_t 6312 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6313 { 6314 struct r5conf *conf; 6315 unsigned long new; 6316 int err; 6317 6318 if (len >= PAGE_SIZE) 6319 return -EINVAL; 6320 if (kstrtoul(page, 10, &new)) 6321 return -EINVAL; 6322 new = !!new; 6323 6324 err = mddev_lock(mddev); 6325 if (err) 6326 return err; 6327 conf = mddev->private; 6328 if (!conf) 6329 err = -ENODEV; 6330 else if (new != conf->skip_copy) { 6331 mddev_suspend(mddev); 6332 conf->skip_copy = new; 6333 if (new) 6334 mddev->queue->backing_dev_info->capabilities |= 6335 BDI_CAP_STABLE_WRITES; 6336 else 6337 mddev->queue->backing_dev_info->capabilities &= 6338 ~BDI_CAP_STABLE_WRITES; 6339 mddev_resume(mddev); 6340 } 6341 mddev_unlock(mddev); 6342 return err ?: len; 6343 } 6344 6345 static struct md_sysfs_entry 6346 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6347 raid5_show_skip_copy, 6348 raid5_store_skip_copy); 6349 6350 static ssize_t 6351 stripe_cache_active_show(struct mddev *mddev, char *page) 6352 { 6353 struct r5conf *conf = mddev->private; 6354 if (conf) 6355 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6356 else 6357 return 0; 6358 } 6359 6360 static struct md_sysfs_entry 6361 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6362 6363 static ssize_t 6364 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6365 { 6366 struct r5conf *conf; 6367 int ret = 0; 6368 spin_lock(&mddev->lock); 6369 conf = mddev->private; 6370 if (conf) 6371 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6372 spin_unlock(&mddev->lock); 6373 return ret; 6374 } 6375 6376 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6377 int *group_cnt, 6378 int *worker_cnt_per_group, 6379 struct r5worker_group **worker_groups); 6380 static ssize_t 6381 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6382 { 6383 struct r5conf *conf; 6384 unsigned long new; 6385 int err; 6386 struct r5worker_group *new_groups, *old_groups; 6387 int group_cnt, worker_cnt_per_group; 6388 6389 if (len >= PAGE_SIZE) 6390 return -EINVAL; 6391 if (kstrtoul(page, 10, &new)) 6392 return -EINVAL; 6393 6394 err = mddev_lock(mddev); 6395 if (err) 6396 return err; 6397 conf = mddev->private; 6398 if (!conf) 6399 err = -ENODEV; 6400 else if (new != conf->worker_cnt_per_group) { 6401 mddev_suspend(mddev); 6402 6403 old_groups = conf->worker_groups; 6404 if (old_groups) 6405 flush_workqueue(raid5_wq); 6406 6407 err = alloc_thread_groups(conf, 
new, 6408 &group_cnt, &worker_cnt_per_group, 6409 &new_groups); 6410 if (!err) { 6411 spin_lock_irq(&conf->device_lock); 6412 conf->group_cnt = group_cnt; 6413 conf->worker_cnt_per_group = worker_cnt_per_group; 6414 conf->worker_groups = new_groups; 6415 spin_unlock_irq(&conf->device_lock); 6416 6417 if (old_groups) 6418 kfree(old_groups[0].workers); 6419 kfree(old_groups); 6420 } 6421 mddev_resume(mddev); 6422 } 6423 mddev_unlock(mddev); 6424 6425 return err ?: len; 6426 } 6427 6428 static struct md_sysfs_entry 6429 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6430 raid5_show_group_thread_cnt, 6431 raid5_store_group_thread_cnt); 6432 6433 static struct attribute *raid5_attrs[] = { 6434 &raid5_stripecache_size.attr, 6435 &raid5_stripecache_active.attr, 6436 &raid5_preread_bypass_threshold.attr, 6437 &raid5_group_thread_cnt.attr, 6438 &raid5_skip_copy.attr, 6439 &raid5_rmw_level.attr, 6440 &r5c_journal_mode.attr, 6441 NULL, 6442 }; 6443 static struct attribute_group raid5_attrs_group = { 6444 .name = NULL, 6445 .attrs = raid5_attrs, 6446 }; 6447 6448 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6449 int *group_cnt, 6450 int *worker_cnt_per_group, 6451 struct r5worker_group **worker_groups) 6452 { 6453 int i, j, k; 6454 ssize_t size; 6455 struct r5worker *workers; 6456 6457 *worker_cnt_per_group = cnt; 6458 if (cnt == 0) { 6459 *group_cnt = 0; 6460 *worker_groups = NULL; 6461 return 0; 6462 } 6463 *group_cnt = num_possible_nodes(); 6464 size = sizeof(struct r5worker) * cnt; 6465 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6466 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6467 *group_cnt, GFP_NOIO); 6468 if (!*worker_groups || !workers) { 6469 kfree(workers); 6470 kfree(*worker_groups); 6471 return -ENOMEM; 6472 } 6473 6474 for (i = 0; i < *group_cnt; i++) { 6475 struct r5worker_group *group; 6476 6477 group = &(*worker_groups)[i]; 6478 INIT_LIST_HEAD(&group->handle_list); 6479 group->conf = conf; 6480 group->workers = workers + i * cnt; 6481 6482 for (j = 0; j < cnt; j++) { 6483 struct r5worker *worker = group->workers + j; 6484 worker->group = group; 6485 INIT_WORK(&worker->work, raid5_do_work); 6486 6487 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6488 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6489 } 6490 } 6491 6492 return 0; 6493 } 6494 6495 static void free_thread_groups(struct r5conf *conf) 6496 { 6497 if (conf->worker_groups) 6498 kfree(conf->worker_groups[0].workers); 6499 kfree(conf->worker_groups); 6500 conf->worker_groups = NULL; 6501 } 6502 6503 static sector_t 6504 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6505 { 6506 struct r5conf *conf = mddev->private; 6507 6508 if (!sectors) 6509 sectors = mddev->dev_sectors; 6510 if (!raid_disks) 6511 /* size is defined by the smallest of previous and new size */ 6512 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6513 6514 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6515 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6516 return sectors * (raid_disks - conf->max_degraded); 6517 } 6518 6519 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6520 { 6521 safe_put_page(percpu->spare_page); 6522 if (percpu->scribble) 6523 flex_array_free(percpu->scribble); 6524 percpu->spare_page = NULL; 6525 percpu->scribble = NULL; 6526 } 6527 6528 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6529 { 6530 if (conf->level == 6 && !percpu->spare_page) 6531 percpu->spare_page = 
alloc_page(GFP_KERNEL); 6532 if (!percpu->scribble) 6533 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6534 conf->previous_raid_disks), 6535 max(conf->chunk_sectors, 6536 conf->prev_chunk_sectors) 6537 / STRIPE_SECTORS, 6538 GFP_KERNEL); 6539 6540 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6541 free_scratch_buffer(conf, percpu); 6542 return -ENOMEM; 6543 } 6544 6545 return 0; 6546 } 6547 6548 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6549 { 6550 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6551 6552 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6553 return 0; 6554 } 6555 6556 static void raid5_free_percpu(struct r5conf *conf) 6557 { 6558 if (!conf->percpu) 6559 return; 6560 6561 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6562 free_percpu(conf->percpu); 6563 } 6564 6565 static void free_conf(struct r5conf *conf) 6566 { 6567 int i; 6568 6569 if (conf->log) 6570 r5l_exit_log(conf->log); 6571 if (conf->shrinker.nr_deferred) 6572 unregister_shrinker(&conf->shrinker); 6573 6574 free_thread_groups(conf); 6575 shrink_stripes(conf); 6576 raid5_free_percpu(conf); 6577 for (i = 0; i < conf->pool_size; i++) 6578 if (conf->disks[i].extra_page) 6579 put_page(conf->disks[i].extra_page); 6580 kfree(conf->disks); 6581 kfree(conf->stripe_hashtbl); 6582 kfree(conf); 6583 } 6584 6585 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6586 { 6587 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6588 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6589 6590 if (alloc_scratch_buffer(conf, percpu)) { 6591 pr_warn("%s: failed memory allocation for cpu%u\n", 6592 __func__, cpu); 6593 return -ENOMEM; 6594 } 6595 return 0; 6596 } 6597 6598 static int raid5_alloc_percpu(struct r5conf *conf) 6599 { 6600 int err = 0; 6601 6602 conf->percpu = alloc_percpu(struct raid5_percpu); 6603 if (!conf->percpu) 6604 return -ENOMEM; 6605 6606 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6607 if (!err) { 6608 conf->scribble_disks = max(conf->raid_disks, 6609 conf->previous_raid_disks); 6610 conf->scribble_sectors = max(conf->chunk_sectors, 6611 conf->prev_chunk_sectors); 6612 } 6613 return err; 6614 } 6615 6616 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6617 struct shrink_control *sc) 6618 { 6619 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6620 unsigned long ret = SHRINK_STOP; 6621 6622 if (mutex_trylock(&conf->cache_size_mutex)) { 6623 ret= 0; 6624 while (ret < sc->nr_to_scan && 6625 conf->max_nr_stripes > conf->min_nr_stripes) { 6626 if (drop_one_stripe(conf) == 0) { 6627 ret = SHRINK_STOP; 6628 break; 6629 } 6630 ret++; 6631 } 6632 mutex_unlock(&conf->cache_size_mutex); 6633 } 6634 return ret; 6635 } 6636 6637 static unsigned long raid5_cache_count(struct shrinker *shrink, 6638 struct shrink_control *sc) 6639 { 6640 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6641 6642 if (conf->max_nr_stripes < conf->min_nr_stripes) 6643 /* unlikely, but not impossible */ 6644 return 0; 6645 return conf->max_nr_stripes - conf->min_nr_stripes; 6646 } 6647 6648 static struct r5conf *setup_conf(struct mddev *mddev) 6649 { 6650 struct r5conf *conf; 6651 int raid_disk, memory, max_disks; 6652 struct md_rdev *rdev; 6653 struct disk_info *disk; 6654 char pers_name[6]; 6655 int i; 6656 int group_cnt, worker_cnt_per_group; 6657 struct r5worker_group *new_group; 6658 6659 if 
(mddev->new_level != 5 6660 && mddev->new_level != 4 6661 && mddev->new_level != 6) { 6662 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6663 mdname(mddev), mddev->new_level); 6664 return ERR_PTR(-EIO); 6665 } 6666 if ((mddev->new_level == 5 6667 && !algorithm_valid_raid5(mddev->new_layout)) || 6668 (mddev->new_level == 6 6669 && !algorithm_valid_raid6(mddev->new_layout))) { 6670 pr_warn("md/raid:%s: layout %d not supported\n", 6671 mdname(mddev), mddev->new_layout); 6672 return ERR_PTR(-EIO); 6673 } 6674 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6675 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6676 mdname(mddev), mddev->raid_disks); 6677 return ERR_PTR(-EINVAL); 6678 } 6679 6680 if (!mddev->new_chunk_sectors || 6681 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6682 !is_power_of_2(mddev->new_chunk_sectors)) { 6683 pr_warn("md/raid:%s: invalid chunk size %d\n", 6684 mdname(mddev), mddev->new_chunk_sectors << 9); 6685 return ERR_PTR(-EINVAL); 6686 } 6687 6688 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6689 if (conf == NULL) 6690 goto abort; 6691 /* Don't enable multi-threading by default */ 6692 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6693 &new_group)) { 6694 conf->group_cnt = group_cnt; 6695 conf->worker_cnt_per_group = worker_cnt_per_group; 6696 conf->worker_groups = new_group; 6697 } else 6698 goto abort; 6699 spin_lock_init(&conf->device_lock); 6700 seqcount_init(&conf->gen_lock); 6701 mutex_init(&conf->cache_size_mutex); 6702 init_waitqueue_head(&conf->wait_for_quiescent); 6703 init_waitqueue_head(&conf->wait_for_stripe); 6704 init_waitqueue_head(&conf->wait_for_overlap); 6705 INIT_LIST_HEAD(&conf->handle_list); 6706 INIT_LIST_HEAD(&conf->hold_list); 6707 INIT_LIST_HEAD(&conf->delayed_list); 6708 INIT_LIST_HEAD(&conf->bitmap_list); 6709 bio_list_init(&conf->return_bi); 6710 init_llist_head(&conf->released_stripes); 6711 atomic_set(&conf->active_stripes, 0); 6712 atomic_set(&conf->preread_active_stripes, 0); 6713 atomic_set(&conf->active_aligned_reads, 0); 6714 conf->bypass_threshold = BYPASS_THRESHOLD; 6715 conf->recovery_disabled = mddev->recovery_disabled - 1; 6716 6717 conf->raid_disks = mddev->raid_disks; 6718 if (mddev->reshape_position == MaxSector) 6719 conf->previous_raid_disks = mddev->raid_disks; 6720 else 6721 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6722 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6723 6724 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6725 GFP_KERNEL); 6726 6727 if (!conf->disks) 6728 goto abort; 6729 6730 for (i = 0; i < max_disks; i++) { 6731 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6732 if (!conf->disks[i].extra_page) 6733 goto abort; 6734 } 6735 6736 conf->mddev = mddev; 6737 6738 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6739 goto abort; 6740 6741 /* We init hash_locks[0] separately so that it can be used 6742 * as the reference lock in the spin_lock_nest_lock() call 6743 * in lock_all_device_hash_locks_irq in order to convince 6744 * lockdep that we know what we are doing. 
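* (Without the nest annotation, lockdep would treat taking all NR_STRIPE_HASH_LOCKS locks as recursive acquisition of a single lock class and complain.)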
6745 */ 6746 spin_lock_init(conf->hash_locks); 6747 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6748 spin_lock_init(conf->hash_locks + i); 6749 6750 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6751 INIT_LIST_HEAD(conf->inactive_list + i); 6752 6753 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6754 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6755 6756 atomic_set(&conf->r5c_cached_full_stripes, 0); 6757 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6758 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6759 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6760 6761 conf->level = mddev->new_level; 6762 conf->chunk_sectors = mddev->new_chunk_sectors; 6763 if (raid5_alloc_percpu(conf) != 0) 6764 goto abort; 6765 6766 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6767 6768 rdev_for_each(rdev, mddev) { 6769 raid_disk = rdev->raid_disk; 6770 if (raid_disk >= max_disks 6771 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 6772 continue; 6773 disk = conf->disks + raid_disk; 6774 6775 if (test_bit(Replacement, &rdev->flags)) { 6776 if (disk->replacement) 6777 goto abort; 6778 disk->replacement = rdev; 6779 } else { 6780 if (disk->rdev) 6781 goto abort; 6782 disk->rdev = rdev; 6783 } 6784 6785 if (test_bit(In_sync, &rdev->flags)) { 6786 char b[BDEVNAME_SIZE]; 6787 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 6788 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 6789 } else if (rdev->saved_raid_disk != raid_disk) 6790 /* Cannot rely on bitmap to complete recovery */ 6791 conf->fullsync = 1; 6792 } 6793 6794 conf->level = mddev->new_level; 6795 if (conf->level == 6) { 6796 conf->max_degraded = 2; 6797 if (raid6_call.xor_syndrome) 6798 conf->rmw_level = PARITY_ENABLE_RMW; 6799 else 6800 conf->rmw_level = PARITY_DISABLE_RMW; 6801 } else { 6802 conf->max_degraded = 1; 6803 conf->rmw_level = PARITY_ENABLE_RMW; 6804 } 6805 conf->algorithm = mddev->new_layout; 6806 conf->reshape_progress = mddev->reshape_position; 6807 if (conf->reshape_progress != MaxSector) { 6808 conf->prev_chunk_sectors = mddev->chunk_sectors; 6809 conf->prev_algo = mddev->layout; 6810 } else { 6811 conf->prev_chunk_sectors = conf->chunk_sectors; 6812 conf->prev_algo = conf->algorithm; 6813 } 6814 6815 conf->min_nr_stripes = NR_STRIPES; 6816 if (mddev->reshape_position != MaxSector) { 6817 int stripes = max_t(int, 6818 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 6819 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 6820 conf->min_nr_stripes = max(NR_STRIPES, stripes); 6821 if (conf->min_nr_stripes != NR_STRIPES) 6822 pr_info("md/raid:%s: force stripe size %d for reshape\n", 6823 mdname(mddev), conf->min_nr_stripes); 6824 } 6825 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 6826 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 6827 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 6828 if (grow_stripes(conf, conf->min_nr_stripes)) { 6829 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 6830 mdname(mddev), memory); 6831 goto abort; 6832 } else 6833 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 6834 /* 6835 * Losing a stripe head costs more than the time to refill it, 6836 * as it reduces the queue depth and so can hurt throughput. 6837 * So set it rather large, scaled by number of devices. 
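* (Purely as an illustration: DEFAULT_SEEKS is 2, so an 8-device array gets shrinker.seeks == 2 * 8 * 4 == 64, telling the VM that a cached stripe is roughly 32 times as expensive to recreate as an ordinary page-cache page.)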
6838 */ 6839 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 6840 conf->shrinker.scan_objects = raid5_cache_scan; 6841 conf->shrinker.count_objects = raid5_cache_count; 6842 conf->shrinker.batch = 128; 6843 conf->shrinker.flags = 0; 6844 if (register_shrinker(&conf->shrinker)) { 6845 pr_warn("md/raid:%s: couldn't register shrinker.\n", 6846 mdname(mddev)); 6847 goto abort; 6848 } 6849 6850 sprintf(pers_name, "raid%d", mddev->new_level); 6851 conf->thread = md_register_thread(raid5d, mddev, pers_name); 6852 if (!conf->thread) { 6853 pr_warn("md/raid:%s: couldn't allocate thread.\n", 6854 mdname(mddev)); 6855 goto abort; 6856 } 6857 6858 return conf; 6859 6860 abort: 6861 if (conf) { 6862 free_conf(conf); 6863 return ERR_PTR(-EIO); 6864 } else 6865 return ERR_PTR(-ENOMEM); 6866 } 6867 6868 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 6869 { 6870 switch (algo) { 6871 case ALGORITHM_PARITY_0: 6872 if (raid_disk < max_degraded) 6873 return 1; 6874 break; 6875 case ALGORITHM_PARITY_N: 6876 if (raid_disk >= raid_disks - max_degraded) 6877 return 1; 6878 break; 6879 case ALGORITHM_PARITY_0_6: 6880 if (raid_disk == 0 || 6881 raid_disk == raid_disks - 1) 6882 return 1; 6883 break; 6884 case ALGORITHM_LEFT_ASYMMETRIC_6: 6885 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6886 case ALGORITHM_LEFT_SYMMETRIC_6: 6887 case ALGORITHM_RIGHT_SYMMETRIC_6: 6888 if (raid_disk == raid_disks - 1) 6889 return 1; 6890 } 6891 return 0; 6892 } 6893 6894 static int raid5_run(struct mddev *mddev) 6895 { 6896 struct r5conf *conf; 6897 int working_disks = 0; 6898 int dirty_parity_disks = 0; 6899 struct md_rdev *rdev; 6900 struct md_rdev *journal_dev = NULL; 6901 sector_t reshape_offset = 0; 6902 int i; 6903 long long min_offset_diff = 0; 6904 int first = 1; 6905 6906 if (mddev->recovery_cp != MaxSector) 6907 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 6908 mdname(mddev)); 6909 6910 rdev_for_each(rdev, mddev) { 6911 long long diff; 6912 6913 if (test_bit(Journal, &rdev->flags)) { 6914 journal_dev = rdev; 6915 continue; 6916 } 6917 if (rdev->raid_disk < 0) 6918 continue; 6919 diff = (rdev->new_data_offset - rdev->data_offset); 6920 if (first) { 6921 min_offset_diff = diff; 6922 first = 0; 6923 } else if (mddev->reshape_backwards && 6924 diff < min_offset_diff) 6925 min_offset_diff = diff; 6926 else if (!mddev->reshape_backwards && 6927 diff > min_offset_diff) 6928 min_offset_diff = diff; 6929 } 6930 6931 if (mddev->reshape_position != MaxSector) { 6932 /* Check that we can continue the reshape. 6933 * Difficulties arise if the stripe we would write to 6934 * next is at or after the stripe we would read from next. 6935 * For a reshape that changes the number of devices, this 6936 * is only possible for a very short time, and mdadm makes 6937 * sure that time appears to have passed before assembling 6938 * the array. So we fail if that time hasn't passed. 6939 * For a reshape that keeps the number of devices the same, 6940 * mdadm must be monitoring the reshape and keeping the 6941 * critical areas read-only and backed up. It will start 6942 * the array in read-only mode, so we check for that. 6943 */ 6944 sector_t here_new, here_old; 6945 int old_disks; 6946 int max_degraded = (mddev->level == 6 ? 
2 : 1); 6947 int chunk_sectors; 6948 int new_data_disks; 6949 6950 if (journal_dev) { 6951 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 6952 mdname(mddev)); 6953 return -EINVAL; 6954 } 6955 6956 if (mddev->new_level != mddev->level) { 6957 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 6958 mdname(mddev)); 6959 return -EINVAL; 6960 } 6961 old_disks = mddev->raid_disks - mddev->delta_disks; 6962 /* reshape_position must be on a new-stripe boundary, and one 6963 * further up in new geometry must map after here in old 6964 * geometry. 6965 * If the chunk sizes are different, then as we perform reshape 6966 * in units of the largest of the two, reshape_position needs 6967 * to be a multiple of the largest chunk size times new data disks. 6968 */ 6969 here_new = mddev->reshape_position; 6970 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 6971 new_data_disks = mddev->raid_disks - max_degraded; 6972 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 6973 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 6974 mdname(mddev)); 6975 return -EINVAL; 6976 } 6977 reshape_offset = here_new * chunk_sectors; 6978 /* here_new is the stripe we will write to */ 6979 here_old = mddev->reshape_position; 6980 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 6981 /* here_old is the first stripe that we might need to read 6982 * from */ 6983 if (mddev->delta_disks == 0) { 6984 /* We cannot be sure it is safe to start an in-place 6985 * reshape. It is only safe if user-space is monitoring 6986 * and taking constant backups. 6987 * mdadm always starts a situation like this in 6988 * readonly mode so it can take control before 6989 * allowing any writes. So just check for that. 6990 */ 6991 if (abs(min_offset_diff) >= mddev->chunk_sectors && 6992 abs(min_offset_diff) >= mddev->new_chunk_sectors) 6993 /* not really in-place - so OK */; 6994 else if (mddev->ro == 0) { 6995 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 6996 mdname(mddev)); 6997 return -EINVAL; 6998 } 6999 } else if (mddev->reshape_backwards 7000 ? 
(here_new * chunk_sectors + min_offset_diff <= 7001 here_old * chunk_sectors) 7002 : (here_new * chunk_sectors >= 7003 here_old * chunk_sectors + (-min_offset_diff))) { 7004 /* Reading from the same stripe as writing to - bad */ 7005 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 7006 mdname(mddev)); 7007 return -EINVAL; 7008 } 7009 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 7010 /* OK, we should be able to continue; */ 7011 } else { 7012 BUG_ON(mddev->level != mddev->new_level); 7013 BUG_ON(mddev->layout != mddev->new_layout); 7014 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 7015 BUG_ON(mddev->delta_disks != 0); 7016 } 7017 7018 if (mddev->private == NULL) 7019 conf = setup_conf(mddev); 7020 else 7021 conf = mddev->private; 7022 7023 if (IS_ERR(conf)) 7024 return PTR_ERR(conf); 7025 7026 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 7027 if (!journal_dev) { 7028 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 7029 mdname(mddev)); 7030 mddev->ro = 1; 7031 set_disk_ro(mddev->gendisk, 1); 7032 } else if (mddev->recovery_cp == MaxSector) 7033 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 7034 } 7035 7036 conf->min_offset_diff = min_offset_diff; 7037 mddev->thread = conf->thread; 7038 conf->thread = NULL; 7039 mddev->private = conf; 7040 7041 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 7042 i++) { 7043 rdev = conf->disks[i].rdev; 7044 if (!rdev && conf->disks[i].replacement) { 7045 /* The replacement is all we have yet */ 7046 rdev = conf->disks[i].replacement; 7047 conf->disks[i].replacement = NULL; 7048 clear_bit(Replacement, &rdev->flags); 7049 conf->disks[i].rdev = rdev; 7050 } 7051 if (!rdev) 7052 continue; 7053 if (conf->disks[i].replacement && 7054 conf->reshape_progress != MaxSector) { 7055 /* replacements and reshape simply do not mix. */ 7056 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 7057 goto abort; 7058 } 7059 if (test_bit(In_sync, &rdev->flags)) { 7060 working_disks++; 7061 continue; 7062 } 7063 /* This disk is not fully in-sync. However if it 7064 * just stored parity (beyond the recovery_offset), 7065 * then we don't need to be concerned about the 7066 * array being dirty. 7067 * When reshape goes 'backwards', we never have 7068 * partially completed devices, so we only need 7069 * to worry about reshape going forwards. 7070 */ 7071 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7072 if (mddev->major_version == 0 && 7073 mddev->minor_version > 90) 7074 rdev->recovery_offset = reshape_offset; 7075 7076 if (rdev->recovery_offset < reshape_offset) { 7077 /* We need to check old and new layout */ 7078 if (!only_parity(rdev->raid_disk, 7079 conf->algorithm, 7080 conf->raid_disks, 7081 conf->max_degraded)) 7082 continue; 7083 } 7084 if (!only_parity(rdev->raid_disk, 7085 conf->prev_algo, 7086 conf->previous_raid_disks, 7087 conf->max_degraded)) 7088 continue; 7089 dirty_parity_disks++; 7090 } 7091 7092 /* 7093 * 0 for a fully functional array, 1 or 2 for a degraded array. 
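* (max_degraded was set in setup_conf: 1 for raid4/5, 2 for raid6; anything beyond that is caught by has_failed() below.)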
7094 */ 7095 mddev->degraded = raid5_calc_degraded(conf); 7096 7097 if (has_failed(conf)) { 7098 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7099 mdname(mddev), mddev->degraded, conf->raid_disks); 7100 goto abort; 7101 } 7102 7103 /* device size must be a multiple of chunk size */ 7104 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7105 mddev->resync_max_sectors = mddev->dev_sectors; 7106 7107 if (mddev->degraded > dirty_parity_disks && 7108 mddev->recovery_cp != MaxSector) { 7109 if (mddev->ok_start_degraded) 7110 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7111 mdname(mddev)); 7112 else { 7113 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7114 mdname(mddev)); 7115 goto abort; 7116 } 7117 } 7118 7119 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7120 mdname(mddev), conf->level, 7121 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7122 mddev->new_layout); 7123 7124 print_raid5_conf(conf); 7125 7126 if (conf->reshape_progress != MaxSector) { 7127 conf->reshape_safe = conf->reshape_progress; 7128 atomic_set(&conf->reshape_stripes, 0); 7129 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7130 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7131 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7132 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7133 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7134 "reshape"); 7135 } 7136 7137 /* Ok, everything is just fine now */ 7138 if (mddev->to_remove == &raid5_attrs_group) 7139 mddev->to_remove = NULL; 7140 else if (mddev->kobj.sd && 7141 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7142 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7143 mdname(mddev)); 7144 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7145 7146 if (mddev->queue) { 7147 int chunk_size; 7148 bool discard_supported = true; 7149 /* read-ahead size must cover two whole stripes, which 7150 * is 2 * (data disks) * chunk size, where 'data disks' is 7151 * the number of raid devices minus the parity devices 7152 */ 7153 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7154 int stripe = data_disks * 7155 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7156 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) 7157 mddev->queue->backing_dev_info->ra_pages = 2 * stripe; 7158 7159 chunk_size = mddev->chunk_sectors << 9; 7160 blk_queue_io_min(mddev->queue, chunk_size); 7161 blk_queue_io_opt(mddev->queue, chunk_size * 7162 (conf->raid_disks - conf->max_degraded)); 7163 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7164 /* 7165 * We can only discard a whole stripe. 
It doesn't make sense to 7166 * discard the data disks but write the parity disk 7167 */ 7168 stripe = stripe * PAGE_SIZE; 7169 /* Round up to power of 2, as discard handling 7170 * currently assumes that (e.g. a 192KiB stripe rounds up to 256KiB) */ 7171 while ((stripe-1) & stripe) 7172 stripe = (stripe | (stripe-1)) + 1; 7173 mddev->queue->limits.discard_alignment = stripe; 7174 mddev->queue->limits.discard_granularity = stripe; 7175 7176 /* 7177 * We use a 16-bit counter of active stripes in bi_phys_segments 7178 * (minus one for over-loaded initialization) 7179 */ 7180 blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); 7181 blk_queue_max_discard_sectors(mddev->queue, 7182 0xfffe * STRIPE_SECTORS); 7183 7184 /* 7185 * unaligned part of discard request will be ignored, so can't 7186 * guarantee discard_zeroes_data 7187 */ 7188 mddev->queue->limits.discard_zeroes_data = 0; 7189 7190 blk_queue_max_write_same_sectors(mddev->queue, 0); 7191 7192 rdev_for_each(rdev, mddev) { 7193 disk_stack_limits(mddev->gendisk, rdev->bdev, 7194 rdev->data_offset << 9); 7195 disk_stack_limits(mddev->gendisk, rdev->bdev, 7196 rdev->new_data_offset << 9); 7197 /* 7198 * discard_zeroes_data is required, otherwise data 7199 * could be lost. Consider a scenario: discard a stripe 7200 * (the stripe could be inconsistent if 7201 * discard_zeroes_data is 0); write one disk of the 7202 * stripe (the stripe could be inconsistent again 7203 * depending on which disks are used to calculate 7204 * parity); the disk is broken; the stripe data of this 7205 * disk is lost. 7206 */ 7207 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 7208 !bdev_get_queue(rdev->bdev)-> 7209 limits.discard_zeroes_data) 7210 discard_supported = false; 7211 /* Unfortunately, discard_zeroes_data is not currently 7212 * a guarantee - just a hint. So we only allow DISCARD 7213 * if the sysadmin has confirmed that only safe devices 7214 * are in use by setting a module parameter. 
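* (Typically set when loading the module, e.g. 'modprobe raid456 devices_handle_discard_safely=Y', or on the kernel command line as 'raid456.devices_handle_discard_safely=Y' - shown purely as an illustration.)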
7215 */ 7216 if (!devices_handle_discard_safely) { 7217 if (discard_supported) { 7218 pr_info("md/raid456: discard support disabled due to uncertainty.\n"); 7219 pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); 7220 } 7221 discard_supported = false; 7222 } 7223 } 7224 7225 if (discard_supported && 7226 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7227 mddev->queue->limits.discard_granularity >= stripe) 7228 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7229 mddev->queue); 7230 else 7231 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7232 mddev->queue); 7233 7234 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7235 } 7236 7237 if (journal_dev) { 7238 char b[BDEVNAME_SIZE]; 7239 7240 pr_debug("md/raid:%s: using device %s as journal\n", 7241 mdname(mddev), bdevname(journal_dev->bdev, b)); 7242 if (r5l_init_log(conf, journal_dev)) 7243 goto abort; 7244 } 7245 7246 return 0; 7247 abort: 7248 md_unregister_thread(&mddev->thread); 7249 print_raid5_conf(conf); 7250 free_conf(conf); 7251 mddev->private = NULL; 7252 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7253 return -EIO; 7254 } 7255 7256 static void raid5_free(struct mddev *mddev, void *priv) 7257 { 7258 struct r5conf *conf = priv; 7259 7260 free_conf(conf); 7261 mddev->to_remove = &raid5_attrs_group; 7262 } 7263 7264 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7265 { 7266 struct r5conf *conf = mddev->private; 7267 int i; 7268 7269 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7270 conf->chunk_sectors / 2, mddev->layout); 7271 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7272 rcu_read_lock(); 7273 for (i = 0; i < conf->raid_disks; i++) { 7274 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7275 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 7276 } 7277 rcu_read_unlock(); 7278 seq_printf (seq, "]"); 7279 } 7280 7281 static void print_raid5_conf (struct r5conf *conf) 7282 { 7283 int i; 7284 struct disk_info *tmp; 7285 7286 pr_debug("RAID conf printout:\n"); 7287 if (!conf) { 7288 pr_debug("(conf==NULL)\n"); 7289 return; 7290 } 7291 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7292 conf->raid_disks, 7293 conf->raid_disks - conf->mddev->degraded); 7294 7295 for (i = 0; i < conf->raid_disks; i++) { 7296 char b[BDEVNAME_SIZE]; 7297 tmp = conf->disks + i; 7298 if (tmp->rdev) 7299 pr_debug(" disk %d, o:%d, dev:%s\n", 7300 i, !test_bit(Faulty, &tmp->rdev->flags), 7301 bdevname(tmp->rdev->bdev, b)); 7302 } 7303 } 7304 7305 static int raid5_spare_active(struct mddev *mddev) 7306 { 7307 int i; 7308 struct r5conf *conf = mddev->private; 7309 struct disk_info *tmp; 7310 int count = 0; 7311 unsigned long flags; 7312 7313 for (i = 0; i < conf->raid_disks; i++) { 7314 tmp = conf->disks + i; 7315 if (tmp->replacement 7316 && tmp->replacement->recovery_offset == MaxSector 7317 && !test_bit(Faulty, &tmp->replacement->flags) 7318 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7319 /* Replacement has just become active. */ 7320 if (!tmp->rdev 7321 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7322 count++; 7323 if (tmp->rdev) { 7324 /* Replaced device not technically faulty, 7325 * but we need to be sure it gets removed 7326 * and never re-added. 
7327 */ 7328 set_bit(Faulty, &tmp->rdev->flags); 7329 sysfs_notify_dirent_safe( 7330 tmp->rdev->sysfs_state); 7331 } 7332 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7333 } else if (tmp->rdev 7334 && tmp->rdev->recovery_offset == MaxSector 7335 && !test_bit(Faulty, &tmp->rdev->flags) 7336 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7337 count++; 7338 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7339 } 7340 } 7341 spin_lock_irqsave(&conf->device_lock, flags); 7342 mddev->degraded = raid5_calc_degraded(conf); 7343 spin_unlock_irqrestore(&conf->device_lock, flags); 7344 print_raid5_conf(conf); 7345 return count; 7346 } 7347 7348 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7349 { 7350 struct r5conf *conf = mddev->private; 7351 int err = 0; 7352 int number = rdev->raid_disk; 7353 struct md_rdev **rdevp; 7354 struct disk_info *p = conf->disks + number; 7355 7356 print_raid5_conf(conf); 7357 if (test_bit(Journal, &rdev->flags) && conf->log) { 7358 struct r5l_log *log; 7359 /* 7360 * we can't wait for pending writes here, as this is called in 7361 * raid5d; waiting would deadlock. 7362 */ 7363 if (atomic_read(&mddev->writes_pending)) 7364 return -EBUSY; 7365 log = conf->log; 7366 conf->log = NULL; 7367 synchronize_rcu(); 7368 r5l_exit_log(log); 7369 return 0; 7370 } 7371 if (rdev == p->rdev) 7372 rdevp = &p->rdev; 7373 else if (rdev == p->replacement) 7374 rdevp = &p->replacement; 7375 else 7376 return 0; 7377 7378 if (number >= conf->raid_disks && 7379 conf->reshape_progress == MaxSector) 7380 clear_bit(In_sync, &rdev->flags); 7381 7382 if (test_bit(In_sync, &rdev->flags) || 7383 atomic_read(&rdev->nr_pending)) { 7384 err = -EBUSY; 7385 goto abort; 7386 } 7387 /* Only remove non-faulty devices if recovery 7388 * isn't possible. 7389 */ 7390 if (!test_bit(Faulty, &rdev->flags) && 7391 mddev->recovery_disabled != conf->recovery_disabled && 7392 !has_failed(conf) && 7393 (!p->replacement || p->replacement == rdev) && 7394 number < conf->raid_disks) { 7395 err = -EBUSY; 7396 goto abort; 7397 } 7398 *rdevp = NULL; 7399 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7400 synchronize_rcu(); 7401 if (atomic_read(&rdev->nr_pending)) { 7402 /* lost the race, try later */ 7403 err = -EBUSY; 7404 *rdevp = rdev; 7405 } 7406 } 7407 if (p->replacement) { 7408 /* We must have just cleared 'rdev' */ 7409 p->rdev = p->replacement; 7410 clear_bit(Replacement, &p->replacement->flags); 7411 smp_mb(); /* Make sure other CPUs may see both as identical 7412 * but will never see neither - if they are careful 7413 */ 7414 p->replacement = NULL; 7415 clear_bit(WantReplacement, &rdev->flags); 7416 } else 7417 /* We might have just removed the Replacement as faulty; 7418 * clear the bit just in case 7419 */ 7420 clear_bit(WantReplacement, &rdev->flags); 7421 abort: 7422 7423 print_raid5_conf(conf); 7424 return err; 7425 } 7426 7427 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7428 { 7429 struct r5conf *conf = mddev->private; 7430 int err = -EEXIST; 7431 int disk; 7432 struct disk_info *p; 7433 int first = 0; 7434 int last = conf->raid_disks - 1; 7435 7436 if (test_bit(Journal, &rdev->flags)) { 7437 char b[BDEVNAME_SIZE]; 7438 if (conf->log) 7439 return -EBUSY; 7440 7441 rdev->raid_disk = 0; 7442 /* 7443 * The array is in readonly mode if journal is missing, so no 7444 * write requests are running. 
	*rdevp = NULL;
	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
		synchronize_rcu();
		if (atomic_read(&rdev->nr_pending)) {
			/* lost the race, try later */
			err = -EBUSY;
			*rdevp = rdev;
		}
	}
	if (p->replacement) {
		/* We must have just cleared 'rdev' */
		p->rdev = p->replacement;
		clear_bit(Replacement, &p->replacement->flags);
		smp_mb(); /* Make sure other CPUs may see both as identical
			   * but will never see neither - if they are careful
			   */
		p->replacement = NULL;
		clear_bit(WantReplacement, &rdev->flags);
	} else
		/* We might have just removed the Replacement as faulty -
		 * clear the bit just in case
		 */
		clear_bit(WantReplacement, &rdev->flags);
abort:

	print_raid5_conf(conf);
	return err;
}

static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r5conf *conf = mddev->private;
	int err = -EEXIST;
	int disk;
	struct disk_info *p;
	int first = 0;
	int last = conf->raid_disks - 1;

	if (test_bit(Journal, &rdev->flags)) {
		char b[BDEVNAME_SIZE];
		if (conf->log)
			return -EBUSY;

		rdev->raid_disk = 0;
		/*
		 * The array is in read-only mode if the journal is missing,
		 * so no write requests are running; we should be safe.
		 */
		r5l_init_log(conf, rdev);
		pr_debug("md/raid:%s: using device %s as journal\n",
			 mdname(mddev), bdevname(rdev->bdev, b));
		return 0;
	}
	if (mddev->recovery_disabled == conf->recovery_disabled)
		return -EBUSY;

	if (rdev->saved_raid_disk < 0 && has_failed(conf))
		/* no point adding a device */
		return -EINVAL;

	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
	if (rdev->saved_raid_disk >= 0 &&
	    rdev->saved_raid_disk >= first &&
	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
		first = rdev->saved_raid_disk;

	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (p->rdev == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			if (rdev->saved_raid_disk != disk)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			goto out;
		}
	}
	for (disk = first; disk <= last; disk++) {
		p = conf->disks + disk;
		if (test_bit(WantReplacement, &p->rdev->flags) &&
		    p->replacement == NULL) {
			clear_bit(In_sync, &rdev->flags);
			set_bit(Replacement, &rdev->flags);
			rdev->raid_disk = disk;
			err = 0;
			conf->fullsync = 1;
			rcu_assign_pointer(p->replacement, rdev);
			break;
		}
	}
out:
	print_raid5_conf(conf);
	return err;
}

static int raid5_resize(struct mddev *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any I/O in the removed space completes, but it hardly seems
	 * worth it.
	 */
	sector_t newsize;
	struct r5conf *conf = mddev->private;

	if (conf->log)
		return -EINVAL;
	sectors &= ~((sector_t)conf->chunk_sectors - 1);
	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
	if (mddev->external_size &&
	    mddev->array_sectors > newsize)
		return -EINVAL;
	if (mddev->bitmap) {
		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
		if (ret)
			return ret;
	}
	md_set_array_sectors(mddev, newsize);
	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
	if (sectors > mddev->dev_sectors &&
	    mddev->recovery_cp > mddev->dev_sectors) {
		mddev->recovery_cp = mddev->dev_sectors;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->dev_sectors = sectors;
	mddev->resync_max_sectors = sectors;
	return 0;
}

static int check_stripe_cache(struct mddev *mddev)
{
	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
	 * it is best to have about 4 times that.
	 * If we require 4 times, then the default 256 4K stripe_heads will
	 * allow for chunk sizes up to 256K, which is probably OK.
	 * If the chunk size is greater, user-space should request more
	 * stripe_heads first.
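	 * For example, with 4K stripe_heads a 512K chunk needs
	 * (512K / 4K) * 4 = 512 stripe_heads, double the default 256.
	 */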
	struct r5conf *conf = mddev->private;
	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->min_nr_stripes ||
	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
	    > conf->min_nr_stripes) {
		pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
			mdname(mddev),
			((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
			 / STRIPE_SIZE) * 4);
		return 0;
	}
	return 1;
}

static int check_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (conf->log)
		return -EINVAL;
	if (mddev->delta_disks == 0 &&
	    mddev->new_layout == mddev->layout &&
	    mddev->new_chunk_sectors == mddev->chunk_sectors)
		return 0; /* nothing to do */
	if (has_failed(conf))
		return -EINVAL;
	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
	}

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
	    mddev->delta_disks > 0)
		if (resize_chunks(conf,
				  conf->previous_raid_disks
				  + max(0, mddev->delta_disks),
				  max(mddev->new_chunk_sectors,
				      mddev->chunk_sectors)
				 ) < 0)
			return -ENOMEM;
	return resize_stripes(conf, (conf->previous_raid_disks
				     + mddev->delta_disks));
}

static int raid5_start_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	unsigned long flags;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (has_failed(conf))
		return -EINVAL;

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
	}

	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size
		 */
		return -EINVAL;

	/* Refuse to reduce the size of the array.  Any reduction in
	 * array size must be through explicit setting of the array_size
	 * attribute.
	 */
	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
	    < mddev->array_sectors) {
		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
			mdname(mddev));
		return -EINVAL;
	}

	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	write_seqcount_begin(&conf->gen_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
	conf->prev_chunk_sectors = conf->chunk_sectors;
	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->prev_algo = conf->algorithm;
	conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
	smp_mb();
	if (mddev->reshape_backwards)
		conf->reshape_progress = raid5_size(mddev, 0, 0);
	else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	write_seqcount_end(&conf->gen_lock);
	spin_unlock_irq(&conf->device_lock);

	/* Now make sure that any requests that proceeded on the assumption
	 * the reshape wasn't running - like Discard or Read - have
	 * completed.
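	 * The suspend/resume cycle below waits for every in-flight
	 * request to finish, which guarantees exactly that.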
	 */
	mddev_suspend(mddev);
	mddev_resume(mddev);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = raid5_calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
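	/* If the reshape thread could not be started, roll the geometry
	 * changes made above back under the same locks.
	 */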
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		write_seqcount_begin(&conf->gen_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		mddev->new_chunk_sectors =
			conf->chunk_sectors = conf->prev_chunk_sectors;
		mddev->new_layout = conf->algorithm = conf->prev_algo;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->generation--;
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		write_seqcount_end(&conf->gen_lock);
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{
	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		conf->mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (datadisks) * chunksize
		 */
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
		}
	}
}

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			if (mddev->queue) {
				set_capacity(mddev->gendisk, mddev->array_sectors);
				revalidate_disk(mddev->gendisk);
			}
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = raid5_calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch (state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		r5c_flush_cache(conf, INT_MAX);
		conf->quiesce = 2;
		wait_event_cmd(conf->wait_for_quiescent,
			       atomic_read(&conf->active_stripes) == 0 &&
			       atomic_read(&conf->active_aligned_reads) == 0,
			       unlock_all_device_hash_locks_irq(conf),
			       lock_all_device_hash_locks_irq(conf));
		conf->quiesce = 1;
		unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		lock_all_device_hash_locks_irq(conf);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_quiescent);
		wake_up(&conf->wait_for_overlap);
		unlock_all_device_hash_locks_irq(conf);
		break;
	}
	r5l_quiesce(conf->log, state);
}

static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}
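
/* A two-device raid1 has the same data layout as a two-device raid5
 * (the parity of a single data block is the block itself), so the
 * takeover only needs a chunk size that exactly divides the array.
 */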
static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;
	void *ret;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	ret = setup_conf(mddev);
	if (!IS_ERR(ret))
		mddev_clear_unsupported_flags(mddev,
			UNSUPPORTED_MDDEV_FLAGS);
	return ret;
}
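
/* Each *_6 layout is the corresponding raid5 layout with a dedicated
 * Q block on the last device, so dropping that device converts the
 * array back to plain raid5.
 */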
static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}

static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation -
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}
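
/* Forward declaration: raid6_takeover() must check that the array is
 * currently run by the raid5 personality, which is defined below.
 */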
static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}

static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
	.congested	= raid5_congested,
};

static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
	.congested	= raid5_congested,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk = raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
	.congested	= raid5_congested,
};

static int __init raid5_init(void)
{
	int ret;

	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;
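
	/* The CPU hotplug callbacks allocate the per-CPU stripe-processing
	 * resources when a CPU comes up and release them when it dies.
	 */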
	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
				      "md/raid5:prepare",
				      raid456_cpu_up_prepare,
				      raid456_cpu_dead);
	if (ret) {
		destroy_workqueue(raid5_wq);
		return ret;
	}
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules; they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");