/*
 * raid5.c : Multiple Devices driver for Linux
 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *	   Copyright (C) 1999, 2000 Ingo Molnar
 *	   Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment seq_flush, thus closing the current
 *   batch.
 * When we notice that seq_flush > seq_write, we write out all pending updates
 * to the bitmap, and advance seq_write to where seq_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */
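/*
 * Illustrative sketch, not part of the driver: a minimal, self-contained
 * model of the batch sequencing described above.  The struct and function
 * names (bitmap_batch_model, model_*) are hypothetical and exist only to
 * make the ordering rules concrete; the real fields live in struct r5conf
 * (seq_flush, seq_write) and struct stripe_head (bm_seq).
 */
struct bitmap_batch_model {
	unsigned long seq_flush;	/* last batch closed to new additions */
	unsigned long seq_write;	/* last batch safely written out */
};

/* A newly dirtied stripe joins the batch that is still open: seq_flush + 1 */
static inline unsigned long model_record_update(struct bitmap_batch_model *m)
{
	return m->seq_flush + 1;
}

/* An unplug closes the current batch */
static inline void model_unplug(struct bitmap_batch_model *m)
{
	m->seq_flush++;
}

/*
 * A stripe whose batch has not been written yet must wait; once seq_write
 * has caught up with its bm_seq the write may proceed (this mirrors the
 * sh->bm_seq - conf->seq_write > 0 test used when releasing stripes).
 */
static inline int model_stripe_must_wait(struct bitmap_batch_model *m,
					 unsigned long bm_seq)
{
	return (long)(bm_seq - m->seq_write) > 0;
}

/* Writing the bitmap out advances seq_write to the last closed batch */
static inline void model_flush_bitmap(struct bitmap_batch_model *m)
{
	if (m->seq_flush > m->seq_write)
		m->seq_write = m->seq_flush;
}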
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
#include <trace/events/block.h>

#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

static bool devices_handle_discard_safely = false;
module_param(devices_handle_discard_safely, bool, 0644);
MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;

static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
	return &conf->stripe_hashtbl[hash];
}

static inline int stripe_hash_locks_hash(sector_t sect)
{
	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
}

static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_lock_irq(conf->hash_locks + hash);
	spin_lock(&conf->device_lock);
}

static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
{
	spin_unlock(&conf->device_lock);
	spin_unlock_irq(conf->hash_locks + hash);
}

static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	local_irq_disable();
	spin_lock(conf->hash_locks);
	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
	spin_lock(&conf->device_lock);
}

static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
	int i;
	spin_unlock(&conf->device_lock);
	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
		spin_unlock(conf->hash_locks + i - 1);
	local_irq_enable();
}
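/*
 * Lock ordering used by the helpers above: a stripe hash lock is always
 * taken before conf->device_lock, and when every hash lock is needed they
 * are acquired in index order (hash_locks[0] first) with device_lock taken
 * last; release happens in the exact reverse order.
 */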
/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always starts from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * we need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1.  This helper does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}
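/*
 * Worked example of the slot mapping above (the indices are hypothetical,
 * not taken from any particular layout): with disks = 6 and the md layout,
 * the syndrome space has syndrome_disks = 4 data slots plus P and Q.
 *
 *   pd_idx = 4, qd_idx = 5: raid6_d0() = 0, and walking with
 *   raid6_next_disk() maps disks 0,1,2,3 -> slots 0,1,2,3,
 *   disk 4 (P) -> slot 4 and disk 5 (Q) -> slot 5.
 *
 *   pd_idx = 0, qd_idx = 1: raid6_d0() = 2, so the walk visits disks
 *   2,3,4,5,0,1 and yields slots 0,1,2,3 for the data disks, then
 *   slot 4 for P and slot 5 for Q.
 *
 * Either way the data blocks land in srcs[0..syndrome_disks-1] in syndrome
 * order, with P and Q at srcs[syndrome_disks] and srcs[syndrome_disks+1],
 * which is the layout that set_syndrome_sources() relies on further down.
 */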
static void return_io(struct bio_list *return_bi)
{
	struct bio *bi;
	while ((bi = bio_list_pop(return_bi)) != NULL) {
		bi->bi_iter.bi_size = 0;
		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
					 bi, 0);
		bio_endio(bi);
	}
}

static void print_raid5_conf (struct r5conf *conf);

static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5worker_group *group;
	int thread_cnt;
	int i, cpu = sh->cpu;

	if (!cpu_online(cpu)) {
		cpu = cpumask_any(cpu_online_mask);
		sh->cpu = cpu;
	}

	if (list_empty(&sh->lru)) {
		struct r5worker_group *group;
		group = conf->worker_groups + cpu_to_group(cpu);
		list_add_tail(&sh->lru, &group->handle_list);
		group->stripes_cnt++;
		sh->group = group;
	}

	if (conf->worker_cnt_per_group == 0) {
		md_wakeup_thread(conf->mddev->thread);
		return;
	}

	group = conf->worker_groups + cpu_to_group(sh->cpu);

	group->workers[0].working = true;
	/* at least one worker should run to avoid race */
	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);

	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
	/* wakeup more workers */
	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
		if (group->workers[i].working == false) {
			group->workers[i].working = true;
			queue_work_on(sh->cpu, raid5_wq,
				      &group->workers[i].work);
			thread_cnt--;
		}
	}
}

static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
{
	int i;
	int injournal = 0;	/* number of data pages with R5_InJournal */

	BUG_ON(!list_empty(&sh->lru));
	BUG_ON(atomic_read(&conf->active_stripes)==0);

	if (r5c_is_writeback(conf->log))
		for (i = sh->disks; i--; )
			if (test_bit(R5_InJournal, &sh->dev[i].flags))
				injournal++;
	/*
	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
	 * data in journal, so they are not released to cached lists
	 */
	if (conf->quiesce && r5c_is_writeback(conf->log) &&
	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
			r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
	}

	if (test_bit(STRIPE_HANDLE, &sh->state)) {
		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			list_add_tail(&sh->lru, &conf->delayed_list);
		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
			 sh->bm_seq - conf->seq_write > 0)
			list_add_tail(&sh->lru, &conf->bitmap_list);
		else {
			clear_bit(STRIPE_DELAYED, &sh->state);
			clear_bit(STRIPE_BIT_DELAY, &sh->state);
			if (conf->worker_cnt_per_group == 0) {
				list_add_tail(&sh->lru, &conf->handle_list);
			} else {
				raid5_wakeup_stripe_thread(sh);
				return;
			}
		}
		md_wakeup_thread(conf->mddev->thread);
	} else {
		BUG_ON(stripe_operations_active(sh));
		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
			if (atomic_dec_return(&conf->preread_active_stripes)
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
		atomic_dec(&conf->active_stripes);
		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
			if (!r5c_is_writeback(conf->log))
				list_add_tail(&sh->lru, temp_inactive_list);
			else {
				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
				if (injournal == 0)
					list_add_tail(&sh->lru, temp_inactive_list);
				else if (injournal == conf->raid_disks - conf->max_degraded) {
					/* full stripe */
					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
						atomic_inc(&conf->r5c_cached_full_stripes);
					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
						atomic_dec(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
					r5c_check_cached_full_stripe(conf);
				} else {
					/* partial stripe */
					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
							      &sh->state))
						atomic_inc(&conf->r5c_cached_partial_stripes);
					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
				}
			}
		}
	}
}

static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
			     struct list_head *temp_inactive_list)
{
	if (atomic_dec_and_test(&sh->count))
		do_release_stripe(conf, sh, temp_inactive_list);
}

/*
 * @hash could be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is
 * an array with one inactive list per hash lock
 *
 * Be careful: Only one task can add/delete stripes from temp_inactive_list at
 * any given time. Adding stripes only takes the device lock, while deleting
 * stripes only takes the hash lock.
309 */ 310 static void release_inactive_stripe_list(struct r5conf *conf, 311 struct list_head *temp_inactive_list, 312 int hash) 313 { 314 int size; 315 bool do_wakeup = false; 316 unsigned long flags; 317 318 if (hash == NR_STRIPE_HASH_LOCKS) { 319 size = NR_STRIPE_HASH_LOCKS; 320 hash = NR_STRIPE_HASH_LOCKS - 1; 321 } else 322 size = 1; 323 while (size) { 324 struct list_head *list = &temp_inactive_list[size - 1]; 325 326 /* 327 * We don't hold any lock here yet, raid5_get_active_stripe() might 328 * remove stripes from the list 329 */ 330 if (!list_empty_careful(list)) { 331 spin_lock_irqsave(conf->hash_locks + hash, flags); 332 if (list_empty(conf->inactive_list + hash) && 333 !list_empty(list)) 334 atomic_dec(&conf->empty_inactive_list_nr); 335 list_splice_tail_init(list, conf->inactive_list + hash); 336 do_wakeup = true; 337 spin_unlock_irqrestore(conf->hash_locks + hash, flags); 338 } 339 size--; 340 hash--; 341 } 342 343 if (do_wakeup) { 344 wake_up(&conf->wait_for_stripe); 345 if (atomic_read(&conf->active_stripes) == 0) 346 wake_up(&conf->wait_for_quiescent); 347 if (conf->retry_read_aligned) 348 md_wakeup_thread(conf->mddev->thread); 349 } 350 } 351 352 /* should hold conf->device_lock already */ 353 static int release_stripe_list(struct r5conf *conf, 354 struct list_head *temp_inactive_list) 355 { 356 struct stripe_head *sh; 357 int count = 0; 358 struct llist_node *head; 359 360 head = llist_del_all(&conf->released_stripes); 361 head = llist_reverse_order(head); 362 while (head) { 363 int hash; 364 365 sh = llist_entry(head, struct stripe_head, release_list); 366 head = llist_next(head); 367 /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ 368 smp_mb(); 369 clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); 370 /* 371 * Don't worry the bit is set here, because if the bit is set 372 * again, the count is always > 1. This is true for 373 * STRIPE_ON_UNPLUG_LIST bit too. 374 */ 375 hash = sh->hash_lock_index; 376 __release_stripe(conf, sh, &temp_inactive_list[hash]); 377 count++; 378 } 379 380 return count; 381 } 382 383 void raid5_release_stripe(struct stripe_head *sh) 384 { 385 struct r5conf *conf = sh->raid_conf; 386 unsigned long flags; 387 struct list_head list; 388 int hash; 389 bool wakeup; 390 391 /* Avoid release_list until the last reference. 
392 */ 393 if (atomic_add_unless(&sh->count, -1, 1)) 394 return; 395 396 if (unlikely(!conf->mddev->thread) || 397 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 398 goto slow_path; 399 wakeup = llist_add(&sh->release_list, &conf->released_stripes); 400 if (wakeup) 401 md_wakeup_thread(conf->mddev->thread); 402 return; 403 slow_path: 404 local_irq_save(flags); 405 /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ 406 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 407 INIT_LIST_HEAD(&list); 408 hash = sh->hash_lock_index; 409 do_release_stripe(conf, sh, &list); 410 spin_unlock(&conf->device_lock); 411 release_inactive_stripe_list(conf, &list, hash); 412 } 413 local_irq_restore(flags); 414 } 415 416 static inline void remove_hash(struct stripe_head *sh) 417 { 418 pr_debug("remove_hash(), stripe %llu\n", 419 (unsigned long long)sh->sector); 420 421 hlist_del_init(&sh->hash); 422 } 423 424 static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) 425 { 426 struct hlist_head *hp = stripe_hash(conf, sh->sector); 427 428 pr_debug("insert_hash(), stripe %llu\n", 429 (unsigned long long)sh->sector); 430 431 hlist_add_head(&sh->hash, hp); 432 } 433 434 /* find an idle stripe, make sure it is unhashed, and return it. */ 435 static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) 436 { 437 struct stripe_head *sh = NULL; 438 struct list_head *first; 439 440 if (list_empty(conf->inactive_list + hash)) 441 goto out; 442 first = (conf->inactive_list + hash)->next; 443 sh = list_entry(first, struct stripe_head, lru); 444 list_del_init(first); 445 remove_hash(sh); 446 atomic_inc(&conf->active_stripes); 447 BUG_ON(hash != sh->hash_lock_index); 448 if (list_empty(conf->inactive_list + hash)) 449 atomic_inc(&conf->empty_inactive_list_nr); 450 out: 451 return sh; 452 } 453 454 static void shrink_buffers(struct stripe_head *sh) 455 { 456 struct page *p; 457 int i; 458 int num = sh->raid_conf->pool_size; 459 460 for (i = 0; i < num ; i++) { 461 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 462 p = sh->dev[i].page; 463 if (!p) 464 continue; 465 sh->dev[i].page = NULL; 466 put_page(p); 467 } 468 } 469 470 static int grow_buffers(struct stripe_head *sh, gfp_t gfp) 471 { 472 int i; 473 int num = sh->raid_conf->pool_size; 474 475 for (i = 0; i < num; i++) { 476 struct page *page; 477 478 if (!(page = alloc_page(gfp))) { 479 return 1; 480 } 481 sh->dev[i].page = page; 482 sh->dev[i].orig_page = page; 483 } 484 return 0; 485 } 486 487 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 488 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 489 struct stripe_head *sh); 490 491 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 492 { 493 struct r5conf *conf = sh->raid_conf; 494 int i, seq; 495 496 BUG_ON(atomic_read(&sh->count) != 0); 497 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 498 BUG_ON(stripe_operations_active(sh)); 499 BUG_ON(sh->batch_head); 500 501 pr_debug("init_stripe called, stripe %llu\n", 502 (unsigned long long)sector); 503 retry: 504 seq = read_seqcount_begin(&conf->gen_lock); 505 sh->generation = conf->generation - previous; 506 sh->disks = previous ? 
conf->previous_raid_disks : conf->raid_disks; 507 sh->sector = sector; 508 stripe_set_idx(sector, conf, previous, sh); 509 sh->state = 0; 510 511 for (i = sh->disks; i--; ) { 512 struct r5dev *dev = &sh->dev[i]; 513 514 if (dev->toread || dev->read || dev->towrite || dev->written || 515 test_bit(R5_LOCKED, &dev->flags)) { 516 pr_err("sector=%llx i=%d %p %p %p %p %d\n", 517 (unsigned long long)sh->sector, i, dev->toread, 518 dev->read, dev->towrite, dev->written, 519 test_bit(R5_LOCKED, &dev->flags)); 520 WARN_ON(1); 521 } 522 dev->flags = 0; 523 raid5_build_block(sh, i, previous); 524 } 525 if (read_seqcount_retry(&conf->gen_lock, seq)) 526 goto retry; 527 sh->overwrite_disks = 0; 528 insert_hash(conf, sh); 529 sh->cpu = smp_processor_id(); 530 set_bit(STRIPE_BATCH_READY, &sh->state); 531 } 532 533 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, 534 short generation) 535 { 536 struct stripe_head *sh; 537 538 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 539 hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) 540 if (sh->sector == sector && sh->generation == generation) 541 return sh; 542 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 543 return NULL; 544 } 545 546 /* 547 * Need to check if array has failed when deciding whether to: 548 * - start an array 549 * - remove non-faulty devices 550 * - add a spare 551 * - allow a reshape 552 * This determination is simple when no reshape is happening. 553 * However if there is a reshape, we need to carefully check 554 * both the before and after sections. 555 * This is because some failed devices may only affect one 556 * of the two sections, and some non-in_sync devices may 557 * be insync in the section most affected by failed devices. 558 */ 559 static int calc_degraded(struct r5conf *conf) 560 { 561 int degraded, degraded2; 562 int i; 563 564 rcu_read_lock(); 565 degraded = 0; 566 for (i = 0; i < conf->previous_raid_disks; i++) { 567 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 568 if (rdev && test_bit(Faulty, &rdev->flags)) 569 rdev = rcu_dereference(conf->disks[i].replacement); 570 if (!rdev || test_bit(Faulty, &rdev->flags)) 571 degraded++; 572 else if (test_bit(In_sync, &rdev->flags)) 573 ; 574 else 575 /* not in-sync or faulty. 576 * If the reshape increases the number of devices, 577 * this is being recovered by the reshape, so 578 * this 'previous' section is not in_sync. 579 * If the number of devices is being reduced however, 580 * the device can only be part of the array if 581 * we are reverting a reshape, so this section will 582 * be in-sync. 583 */ 584 if (conf->raid_disks >= conf->previous_raid_disks) 585 degraded++; 586 } 587 rcu_read_unlock(); 588 if (conf->raid_disks == conf->previous_raid_disks) 589 return degraded; 590 rcu_read_lock(); 591 degraded2 = 0; 592 for (i = 0; i < conf->raid_disks; i++) { 593 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 594 if (rdev && test_bit(Faulty, &rdev->flags)) 595 rdev = rcu_dereference(conf->disks[i].replacement); 596 if (!rdev || test_bit(Faulty, &rdev->flags)) 597 degraded2++; 598 else if (test_bit(In_sync, &rdev->flags)) 599 ; 600 else 601 /* not in-sync or faulty. 602 * If reshape increases the number of devices, this 603 * section has already been recovered, else it 604 * almost certainly hasn't. 
605 */ 606 if (conf->raid_disks <= conf->previous_raid_disks) 607 degraded2++; 608 } 609 rcu_read_unlock(); 610 if (degraded2 > degraded) 611 return degraded2; 612 return degraded; 613 } 614 615 static int has_failed(struct r5conf *conf) 616 { 617 int degraded; 618 619 if (conf->mddev->reshape_position == MaxSector) 620 return conf->mddev->degraded > conf->max_degraded; 621 622 degraded = calc_degraded(conf); 623 if (degraded > conf->max_degraded) 624 return 1; 625 return 0; 626 } 627 628 struct stripe_head * 629 raid5_get_active_stripe(struct r5conf *conf, sector_t sector, 630 int previous, int noblock, int noquiesce) 631 { 632 struct stripe_head *sh; 633 int hash = stripe_hash_locks_hash(sector); 634 int inc_empty_inactive_list_flag; 635 636 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 637 638 spin_lock_irq(conf->hash_locks + hash); 639 640 do { 641 wait_event_lock_irq(conf->wait_for_quiescent, 642 conf->quiesce == 0 || noquiesce, 643 *(conf->hash_locks + hash)); 644 sh = __find_stripe(conf, sector, conf->generation - previous); 645 if (!sh) { 646 if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { 647 sh = get_free_stripe(conf, hash); 648 if (!sh && !test_bit(R5_DID_ALLOC, 649 &conf->cache_state)) 650 set_bit(R5_ALLOC_MORE, 651 &conf->cache_state); 652 } 653 if (noblock && sh == NULL) 654 break; 655 656 r5c_check_stripe_cache_usage(conf); 657 if (!sh) { 658 set_bit(R5_INACTIVE_BLOCKED, 659 &conf->cache_state); 660 r5l_wake_reclaim(conf->log, 0); 661 wait_event_lock_irq( 662 conf->wait_for_stripe, 663 !list_empty(conf->inactive_list + hash) && 664 (atomic_read(&conf->active_stripes) 665 < (conf->max_nr_stripes * 3 / 4) 666 || !test_bit(R5_INACTIVE_BLOCKED, 667 &conf->cache_state)), 668 *(conf->hash_locks + hash)); 669 clear_bit(R5_INACTIVE_BLOCKED, 670 &conf->cache_state); 671 } else { 672 init_stripe(sh, sector, previous); 673 atomic_inc(&sh->count); 674 } 675 } else if (!atomic_inc_not_zero(&sh->count)) { 676 spin_lock(&conf->device_lock); 677 if (!atomic_read(&sh->count)) { 678 if (!test_bit(STRIPE_HANDLE, &sh->state)) 679 atomic_inc(&conf->active_stripes); 680 BUG_ON(list_empty(&sh->lru) && 681 !test_bit(STRIPE_EXPANDING, &sh->state)); 682 inc_empty_inactive_list_flag = 0; 683 if (!list_empty(conf->inactive_list + hash)) 684 inc_empty_inactive_list_flag = 1; 685 list_del_init(&sh->lru); 686 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 687 atomic_inc(&conf->empty_inactive_list_nr); 688 if (sh->group) { 689 sh->group->stripes_cnt--; 690 sh->group = NULL; 691 } 692 } 693 atomic_inc(&sh->count); 694 spin_unlock(&conf->device_lock); 695 } 696 } while (sh == NULL); 697 698 spin_unlock_irq(conf->hash_locks + hash); 699 return sh; 700 } 701 702 static bool is_full_stripe_write(struct stripe_head *sh) 703 { 704 BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); 705 return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded); 706 } 707 708 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 709 { 710 local_irq_disable(); 711 if (sh1 > sh2) { 712 spin_lock(&sh2->stripe_lock); 713 spin_lock_nested(&sh1->stripe_lock, 1); 714 } else { 715 spin_lock(&sh1->stripe_lock); 716 spin_lock_nested(&sh2->stripe_lock, 1); 717 } 718 } 719 720 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) 721 { 722 spin_unlock(&sh1->stripe_lock); 723 spin_unlock(&sh2->stripe_lock); 724 local_irq_enable(); 725 } 726 727 /* Only freshly new full stripe normal write stripe can 
be added to a batch list */ 728 static bool stripe_can_batch(struct stripe_head *sh) 729 { 730 struct r5conf *conf = sh->raid_conf; 731 732 if (conf->log) 733 return false; 734 return test_bit(STRIPE_BATCH_READY, &sh->state) && 735 !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && 736 is_full_stripe_write(sh); 737 } 738 739 /* we only do back search */ 740 static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) 741 { 742 struct stripe_head *head; 743 sector_t head_sector, tmp_sec; 744 int hash; 745 int dd_idx; 746 int inc_empty_inactive_list_flag; 747 748 /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ 749 tmp_sec = sh->sector; 750 if (!sector_div(tmp_sec, conf->chunk_sectors)) 751 return; 752 head_sector = sh->sector - STRIPE_SECTORS; 753 754 hash = stripe_hash_locks_hash(head_sector); 755 spin_lock_irq(conf->hash_locks + hash); 756 head = __find_stripe(conf, head_sector, conf->generation); 757 if (head && !atomic_inc_not_zero(&head->count)) { 758 spin_lock(&conf->device_lock); 759 if (!atomic_read(&head->count)) { 760 if (!test_bit(STRIPE_HANDLE, &head->state)) 761 atomic_inc(&conf->active_stripes); 762 BUG_ON(list_empty(&head->lru) && 763 !test_bit(STRIPE_EXPANDING, &head->state)); 764 inc_empty_inactive_list_flag = 0; 765 if (!list_empty(conf->inactive_list + hash)) 766 inc_empty_inactive_list_flag = 1; 767 list_del_init(&head->lru); 768 if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) 769 atomic_inc(&conf->empty_inactive_list_nr); 770 if (head->group) { 771 head->group->stripes_cnt--; 772 head->group = NULL; 773 } 774 } 775 atomic_inc(&head->count); 776 spin_unlock(&conf->device_lock); 777 } 778 spin_unlock_irq(conf->hash_locks + hash); 779 780 if (!head) 781 return; 782 if (!stripe_can_batch(head)) 783 goto out; 784 785 lock_two_stripes(head, sh); 786 /* clear_batch_ready clear the flag */ 787 if (!stripe_can_batch(head) || !stripe_can_batch(sh)) 788 goto unlock_out; 789 790 if (sh->batch_head) 791 goto unlock_out; 792 793 dd_idx = 0; 794 while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) 795 dd_idx++; 796 if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf || 797 bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite)) 798 goto unlock_out; 799 800 if (head->batch_head) { 801 spin_lock(&head->batch_head->batch_lock); 802 /* This batch list is already running */ 803 if (!stripe_can_batch(head)) { 804 spin_unlock(&head->batch_head->batch_lock); 805 goto unlock_out; 806 } 807 808 /* 809 * at this point, head's BATCH_READY could be cleared, but we 810 * can still add the stripe to batch list 811 */ 812 list_add(&sh->batch_list, &head->batch_list); 813 spin_unlock(&head->batch_head->batch_lock); 814 815 sh->batch_head = head->batch_head; 816 } else { 817 head->batch_head = head; 818 sh->batch_head = head->batch_head; 819 spin_lock(&head->batch_lock); 820 list_add_tail(&sh->batch_list, &head->batch_list); 821 spin_unlock(&head->batch_lock); 822 } 823 824 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 825 if (atomic_dec_return(&conf->preread_active_stripes) 826 < IO_THRESHOLD) 827 md_wakeup_thread(conf->mddev->thread); 828 829 if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) { 830 int seq = sh->bm_seq; 831 if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) && 832 sh->batch_head->bm_seq > seq) 833 seq = sh->batch_head->bm_seq; 834 set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state); 835 sh->batch_head->bm_seq = seq; 836 } 837 838 atomic_inc(&sh->count); 839 unlock_out: 840 
unlock_two_stripes(head, sh); 841 out: 842 raid5_release_stripe(head); 843 } 844 845 /* Determine if 'data_offset' or 'new_data_offset' should be used 846 * in this stripe_head. 847 */ 848 static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) 849 { 850 sector_t progress = conf->reshape_progress; 851 /* Need a memory barrier to make sure we see the value 852 * of conf->generation, or ->data_offset that was set before 853 * reshape_progress was updated. 854 */ 855 smp_rmb(); 856 if (progress == MaxSector) 857 return 0; 858 if (sh->generation == conf->generation - 1) 859 return 0; 860 /* We are in a reshape, and this is a new-generation stripe, 861 * so use new_data_offset. 862 */ 863 return 1; 864 } 865 866 static void 867 raid5_end_read_request(struct bio *bi); 868 static void 869 raid5_end_write_request(struct bio *bi); 870 871 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 872 { 873 struct r5conf *conf = sh->raid_conf; 874 int i, disks = sh->disks; 875 struct stripe_head *head_sh = sh; 876 877 might_sleep(); 878 879 if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 880 /* writing out phase */ 881 if (s->waiting_extra_page) 882 return; 883 if (r5l_write_stripe(conf->log, sh) == 0) 884 return; 885 } else { /* caching phase */ 886 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { 887 r5c_cache_data(conf->log, sh, s); 888 return; 889 } 890 } 891 892 for (i = disks; i--; ) { 893 int op, op_flags = 0; 894 int replace_only = 0; 895 struct bio *bi, *rbi; 896 struct md_rdev *rdev, *rrdev = NULL; 897 898 sh = head_sh; 899 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 900 op = REQ_OP_WRITE; 901 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) 902 op_flags = REQ_FUA; 903 if (test_bit(R5_Discard, &sh->dev[i].flags)) 904 op = REQ_OP_DISCARD; 905 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 906 op = REQ_OP_READ; 907 else if (test_and_clear_bit(R5_WantReplace, 908 &sh->dev[i].flags)) { 909 op = REQ_OP_WRITE; 910 replace_only = 1; 911 } else 912 continue; 913 if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) 914 op_flags |= REQ_SYNC; 915 916 again: 917 bi = &sh->dev[i].req; 918 rbi = &sh->dev[i].rreq; /* For writing to replacement */ 919 920 rcu_read_lock(); 921 rrdev = rcu_dereference(conf->disks[i].replacement); 922 smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ 923 rdev = rcu_dereference(conf->disks[i].rdev); 924 if (!rdev) { 925 rdev = rrdev; 926 rrdev = NULL; 927 } 928 if (op_is_write(op)) { 929 if (replace_only) 930 rdev = NULL; 931 if (rdev == rrdev) 932 /* We raced and saw duplicates */ 933 rrdev = NULL; 934 } else { 935 if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev) 936 rdev = rrdev; 937 rrdev = NULL; 938 } 939 940 if (rdev && test_bit(Faulty, &rdev->flags)) 941 rdev = NULL; 942 if (rdev) 943 atomic_inc(&rdev->nr_pending); 944 if (rrdev && test_bit(Faulty, &rrdev->flags)) 945 rrdev = NULL; 946 if (rrdev) 947 atomic_inc(&rrdev->nr_pending); 948 rcu_read_unlock(); 949 950 /* We have already checked bad blocks for reads. Now 951 * need to check for writes. We never accept write errors 952 * on the replacement, so we don't to check rrdev. 
953 */ 954 while (op_is_write(op) && rdev && 955 test_bit(WriteErrorSeen, &rdev->flags)) { 956 sector_t first_bad; 957 int bad_sectors; 958 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 959 &first_bad, &bad_sectors); 960 if (!bad) 961 break; 962 963 if (bad < 0) { 964 set_bit(BlockedBadBlocks, &rdev->flags); 965 if (!conf->mddev->external && 966 conf->mddev->sb_flags) { 967 /* It is very unlikely, but we might 968 * still need to write out the 969 * bad block log - better give it 970 * a chance*/ 971 md_check_recovery(conf->mddev); 972 } 973 /* 974 * Because md_wait_for_blocked_rdev 975 * will dec nr_pending, we must 976 * increment it first. 977 */ 978 atomic_inc(&rdev->nr_pending); 979 md_wait_for_blocked_rdev(rdev, conf->mddev); 980 } else { 981 /* Acknowledged bad block - skip the write */ 982 rdev_dec_pending(rdev, conf->mddev); 983 rdev = NULL; 984 } 985 } 986 987 if (rdev) { 988 if (s->syncing || s->expanding || s->expanded 989 || s->replacing) 990 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 991 992 set_bit(STRIPE_IO_STARTED, &sh->state); 993 994 bi->bi_bdev = rdev->bdev; 995 bio_set_op_attrs(bi, op, op_flags); 996 bi->bi_end_io = op_is_write(op) 997 ? raid5_end_write_request 998 : raid5_end_read_request; 999 bi->bi_private = sh; 1000 1001 pr_debug("%s: for %llu schedule op %d on disc %d\n", 1002 __func__, (unsigned long long)sh->sector, 1003 bi->bi_opf, i); 1004 atomic_inc(&sh->count); 1005 if (sh != head_sh) 1006 atomic_inc(&head_sh->count); 1007 if (use_new_offset(conf, sh)) 1008 bi->bi_iter.bi_sector = (sh->sector 1009 + rdev->new_data_offset); 1010 else 1011 bi->bi_iter.bi_sector = (sh->sector 1012 + rdev->data_offset); 1013 if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags)) 1014 bi->bi_opf |= REQ_NOMERGE; 1015 1016 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1017 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1018 sh->dev[i].vec.bv_page = sh->dev[i].page; 1019 bi->bi_vcnt = 1; 1020 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1021 bi->bi_io_vec[0].bv_offset = 0; 1022 bi->bi_iter.bi_size = STRIPE_SIZE; 1023 /* 1024 * If this is discard request, set bi_vcnt 0. 
We don't 1025 * want to confuse SCSI because SCSI will replace payload 1026 */ 1027 if (op == REQ_OP_DISCARD) 1028 bi->bi_vcnt = 0; 1029 if (rrdev) 1030 set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); 1031 1032 if (conf->mddev->gendisk) 1033 trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), 1034 bi, disk_devt(conf->mddev->gendisk), 1035 sh->dev[i].sector); 1036 generic_make_request(bi); 1037 } 1038 if (rrdev) { 1039 if (s->syncing || s->expanding || s->expanded 1040 || s->replacing) 1041 md_sync_acct(rrdev->bdev, STRIPE_SECTORS); 1042 1043 set_bit(STRIPE_IO_STARTED, &sh->state); 1044 1045 rbi->bi_bdev = rrdev->bdev; 1046 bio_set_op_attrs(rbi, op, op_flags); 1047 BUG_ON(!op_is_write(op)); 1048 rbi->bi_end_io = raid5_end_write_request; 1049 rbi->bi_private = sh; 1050 1051 pr_debug("%s: for %llu schedule op %d on " 1052 "replacement disc %d\n", 1053 __func__, (unsigned long long)sh->sector, 1054 rbi->bi_opf, i); 1055 atomic_inc(&sh->count); 1056 if (sh != head_sh) 1057 atomic_inc(&head_sh->count); 1058 if (use_new_offset(conf, sh)) 1059 rbi->bi_iter.bi_sector = (sh->sector 1060 + rrdev->new_data_offset); 1061 else 1062 rbi->bi_iter.bi_sector = (sh->sector 1063 + rrdev->data_offset); 1064 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1065 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1066 sh->dev[i].rvec.bv_page = sh->dev[i].page; 1067 rbi->bi_vcnt = 1; 1068 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1069 rbi->bi_io_vec[0].bv_offset = 0; 1070 rbi->bi_iter.bi_size = STRIPE_SIZE; 1071 /* 1072 * If this is discard request, set bi_vcnt 0. We don't 1073 * want to confuse SCSI because SCSI will replace payload 1074 */ 1075 if (op == REQ_OP_DISCARD) 1076 rbi->bi_vcnt = 0; 1077 if (conf->mddev->gendisk) 1078 trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), 1079 rbi, disk_devt(conf->mddev->gendisk), 1080 sh->dev[i].sector); 1081 generic_make_request(rbi); 1082 } 1083 if (!rdev && !rrdev) { 1084 if (op_is_write(op)) 1085 set_bit(STRIPE_DEGRADED, &sh->state); 1086 pr_debug("skip op %d on disc %d for sector %llu\n", 1087 bi->bi_opf, i, (unsigned long long)sh->sector); 1088 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1089 set_bit(STRIPE_HANDLE, &sh->state); 1090 } 1091 1092 if (!head_sh->batch_head) 1093 continue; 1094 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1095 batch_list); 1096 if (sh != head_sh) 1097 goto again; 1098 } 1099 } 1100 1101 static struct dma_async_tx_descriptor * 1102 async_copy_data(int frombio, struct bio *bio, struct page **page, 1103 sector_t sector, struct dma_async_tx_descriptor *tx, 1104 struct stripe_head *sh, int no_skipcopy) 1105 { 1106 struct bio_vec bvl; 1107 struct bvec_iter iter; 1108 struct page *bio_page; 1109 int page_offset; 1110 struct async_submit_ctl submit; 1111 enum async_tx_flags flags = 0; 1112 1113 if (bio->bi_iter.bi_sector >= sector) 1114 page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; 1115 else 1116 page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; 1117 1118 if (frombio) 1119 flags |= ASYNC_TX_FENCE; 1120 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 1121 1122 bio_for_each_segment(bvl, bio, iter) { 1123 int len = bvl.bv_len; 1124 int clen; 1125 int b_offset = 0; 1126 1127 if (page_offset < 0) { 1128 b_offset = -page_offset; 1129 page_offset += b_offset; 1130 len -= b_offset; 1131 } 1132 1133 if (len > 0 && page_offset + len > STRIPE_SIZE) 1134 clen = STRIPE_SIZE - page_offset; 1135 else 1136 clen = len; 1137 1138 if (clen > 0) { 1139 b_offset += bvl.bv_offset; 1140 bio_page = bvl.bv_page; 1141 if 
(frombio) { 1142 if (sh->raid_conf->skip_copy && 1143 b_offset == 0 && page_offset == 0 && 1144 clen == STRIPE_SIZE && 1145 !no_skipcopy) 1146 *page = bio_page; 1147 else 1148 tx = async_memcpy(*page, bio_page, page_offset, 1149 b_offset, clen, &submit); 1150 } else 1151 tx = async_memcpy(bio_page, *page, b_offset, 1152 page_offset, clen, &submit); 1153 } 1154 /* chain the operations */ 1155 submit.depend_tx = tx; 1156 1157 if (clen < len) /* hit end of page */ 1158 break; 1159 page_offset += len; 1160 } 1161 1162 return tx; 1163 } 1164 1165 static void ops_complete_biofill(void *stripe_head_ref) 1166 { 1167 struct stripe_head *sh = stripe_head_ref; 1168 struct bio_list return_bi = BIO_EMPTY_LIST; 1169 int i; 1170 1171 pr_debug("%s: stripe %llu\n", __func__, 1172 (unsigned long long)sh->sector); 1173 1174 /* clear completed biofills */ 1175 for (i = sh->disks; i--; ) { 1176 struct r5dev *dev = &sh->dev[i]; 1177 1178 /* acknowledge completion of a biofill operation */ 1179 /* and check if we need to reply to a read request, 1180 * new R5_Wantfill requests are held off until 1181 * !STRIPE_BIOFILL_RUN 1182 */ 1183 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 1184 struct bio *rbi, *rbi2; 1185 1186 BUG_ON(!dev->read); 1187 rbi = dev->read; 1188 dev->read = NULL; 1189 while (rbi && rbi->bi_iter.bi_sector < 1190 dev->sector + STRIPE_SECTORS) { 1191 rbi2 = r5_next_bio(rbi, dev->sector); 1192 if (!raid5_dec_bi_active_stripes(rbi)) 1193 bio_list_add(&return_bi, rbi); 1194 rbi = rbi2; 1195 } 1196 } 1197 } 1198 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 1199 1200 return_io(&return_bi); 1201 1202 set_bit(STRIPE_HANDLE, &sh->state); 1203 raid5_release_stripe(sh); 1204 } 1205 1206 static void ops_run_biofill(struct stripe_head *sh) 1207 { 1208 struct dma_async_tx_descriptor *tx = NULL; 1209 struct async_submit_ctl submit; 1210 int i; 1211 1212 BUG_ON(sh->batch_head); 1213 pr_debug("%s: stripe %llu\n", __func__, 1214 (unsigned long long)sh->sector); 1215 1216 for (i = sh->disks; i--; ) { 1217 struct r5dev *dev = &sh->dev[i]; 1218 if (test_bit(R5_Wantfill, &dev->flags)) { 1219 struct bio *rbi; 1220 spin_lock_irq(&sh->stripe_lock); 1221 dev->read = rbi = dev->toread; 1222 dev->toread = NULL; 1223 spin_unlock_irq(&sh->stripe_lock); 1224 while (rbi && rbi->bi_iter.bi_sector < 1225 dev->sector + STRIPE_SECTORS) { 1226 tx = async_copy_data(0, rbi, &dev->page, 1227 dev->sector, tx, sh, 0); 1228 rbi = r5_next_bio(rbi, dev->sector); 1229 } 1230 } 1231 } 1232 1233 atomic_inc(&sh->count); 1234 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 1235 async_trigger_callback(&submit); 1236 } 1237 1238 static void mark_target_uptodate(struct stripe_head *sh, int target) 1239 { 1240 struct r5dev *tgt; 1241 1242 if (target < 0) 1243 return; 1244 1245 tgt = &sh->dev[target]; 1246 set_bit(R5_UPTODATE, &tgt->flags); 1247 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1248 clear_bit(R5_Wantcompute, &tgt->flags); 1249 } 1250 1251 static void ops_complete_compute(void *stripe_head_ref) 1252 { 1253 struct stripe_head *sh = stripe_head_ref; 1254 1255 pr_debug("%s: stripe %llu\n", __func__, 1256 (unsigned long long)sh->sector); 1257 1258 /* mark the computed target(s) as uptodate */ 1259 mark_target_uptodate(sh, sh->ops.target); 1260 mark_target_uptodate(sh, sh->ops.target2); 1261 1262 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 1263 if (sh->check_state == check_state_compute_run) 1264 sh->check_state = check_state_compute_result; 1265 set_bit(STRIPE_HANDLE, &sh->state); 1266 
raid5_release_stripe(sh); 1267 } 1268 1269 /* return a pointer to the address conversion region of the scribble buffer */ 1270 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 1271 struct raid5_percpu *percpu, int i) 1272 { 1273 void *addr; 1274 1275 addr = flex_array_get(percpu->scribble, i); 1276 return addr + sizeof(struct page *) * (sh->disks + 2); 1277 } 1278 1279 /* return a pointer to the address conversion region of the scribble buffer */ 1280 static struct page **to_addr_page(struct raid5_percpu *percpu, int i) 1281 { 1282 void *addr; 1283 1284 addr = flex_array_get(percpu->scribble, i); 1285 return addr; 1286 } 1287 1288 static struct dma_async_tx_descriptor * 1289 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 1290 { 1291 int disks = sh->disks; 1292 struct page **xor_srcs = to_addr_page(percpu, 0); 1293 int target = sh->ops.target; 1294 struct r5dev *tgt = &sh->dev[target]; 1295 struct page *xor_dest = tgt->page; 1296 int count = 0; 1297 struct dma_async_tx_descriptor *tx; 1298 struct async_submit_ctl submit; 1299 int i; 1300 1301 BUG_ON(sh->batch_head); 1302 1303 pr_debug("%s: stripe %llu block: %d\n", 1304 __func__, (unsigned long long)sh->sector, target); 1305 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1306 1307 for (i = disks; i--; ) 1308 if (i != target) 1309 xor_srcs[count++] = sh->dev[i].page; 1310 1311 atomic_inc(&sh->count); 1312 1313 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 1314 ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); 1315 if (unlikely(count == 1)) 1316 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1317 else 1318 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1319 1320 return tx; 1321 } 1322 1323 /* set_syndrome_sources - populate source buffers for gen_syndrome 1324 * @srcs - (struct page *) array of size sh->disks 1325 * @sh - stripe_head to parse 1326 * 1327 * Populates srcs in proper layout order for the stripe and returns the 1328 * 'count' of sources to be used in a call to async_gen_syndrome. The P 1329 * destination buffer is recorded in srcs[count] and the Q destination 1330 * is recorded in srcs[count+1]]. 1331 */ 1332 static int set_syndrome_sources(struct page **srcs, 1333 struct stripe_head *sh, 1334 int srctype) 1335 { 1336 int disks = sh->disks; 1337 int syndrome_disks = sh->ddf_layout ? 
disks : (disks - 2); 1338 int d0_idx = raid6_d0(sh); 1339 int count; 1340 int i; 1341 1342 for (i = 0; i < disks; i++) 1343 srcs[i] = NULL; 1344 1345 count = 0; 1346 i = d0_idx; 1347 do { 1348 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1349 struct r5dev *dev = &sh->dev[i]; 1350 1351 if (i == sh->qd_idx || i == sh->pd_idx || 1352 (srctype == SYNDROME_SRC_ALL) || 1353 (srctype == SYNDROME_SRC_WANT_DRAIN && 1354 (test_bit(R5_Wantdrain, &dev->flags) || 1355 test_bit(R5_InJournal, &dev->flags))) || 1356 (srctype == SYNDROME_SRC_WRITTEN && 1357 dev->written)) { 1358 if (test_bit(R5_InJournal, &dev->flags)) 1359 srcs[slot] = sh->dev[i].orig_page; 1360 else 1361 srcs[slot] = sh->dev[i].page; 1362 } 1363 i = raid6_next_disk(i, disks); 1364 } while (i != d0_idx); 1365 1366 return syndrome_disks; 1367 } 1368 1369 static struct dma_async_tx_descriptor * 1370 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 1371 { 1372 int disks = sh->disks; 1373 struct page **blocks = to_addr_page(percpu, 0); 1374 int target; 1375 int qd_idx = sh->qd_idx; 1376 struct dma_async_tx_descriptor *tx; 1377 struct async_submit_ctl submit; 1378 struct r5dev *tgt; 1379 struct page *dest; 1380 int i; 1381 int count; 1382 1383 BUG_ON(sh->batch_head); 1384 if (sh->ops.target < 0) 1385 target = sh->ops.target2; 1386 else if (sh->ops.target2 < 0) 1387 target = sh->ops.target; 1388 else 1389 /* we should only have one valid target */ 1390 BUG(); 1391 BUG_ON(target < 0); 1392 pr_debug("%s: stripe %llu block: %d\n", 1393 __func__, (unsigned long long)sh->sector, target); 1394 1395 tgt = &sh->dev[target]; 1396 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1397 dest = tgt->page; 1398 1399 atomic_inc(&sh->count); 1400 1401 if (target == qd_idx) { 1402 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1403 blocks[count] = NULL; /* regenerating p is not necessary */ 1404 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 1405 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1406 ops_complete_compute, sh, 1407 to_addr_conv(sh, percpu, 0)); 1408 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1409 } else { 1410 /* Compute any data- or p-drive using XOR */ 1411 count = 0; 1412 for (i = disks; i-- ; ) { 1413 if (i == target || i == qd_idx) 1414 continue; 1415 blocks[count++] = sh->dev[i].page; 1416 } 1417 1418 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1419 NULL, ops_complete_compute, sh, 1420 to_addr_conv(sh, percpu, 0)); 1421 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 1422 } 1423 1424 return tx; 1425 } 1426 1427 static struct dma_async_tx_descriptor * 1428 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 1429 { 1430 int i, count, disks = sh->disks; 1431 int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; 1432 int d0_idx = raid6_d0(sh); 1433 int faila = -1, failb = -1; 1434 int target = sh->ops.target; 1435 int target2 = sh->ops.target2; 1436 struct r5dev *tgt = &sh->dev[target]; 1437 struct r5dev *tgt2 = &sh->dev[target2]; 1438 struct dma_async_tx_descriptor *tx; 1439 struct page **blocks = to_addr_page(percpu, 0); 1440 struct async_submit_ctl submit; 1441 1442 BUG_ON(sh->batch_head); 1443 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 1444 __func__, (unsigned long long)sh->sector, target, target2); 1445 BUG_ON(target < 0 || target2 < 0); 1446 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 1447 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 1448 1449 /* we need to open-code set_syndrome_sources to handle the 1450 * slot number conversion for 'faila' and 'failb' 1451 */ 1452 for (i = 0; i < disks ; i++) 1453 blocks[i] = NULL; 1454 count = 0; 1455 i = d0_idx; 1456 do { 1457 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 1458 1459 blocks[slot] = sh->dev[i].page; 1460 1461 if (i == target) 1462 faila = slot; 1463 if (i == target2) 1464 failb = slot; 1465 i = raid6_next_disk(i, disks); 1466 } while (i != d0_idx); 1467 1468 BUG_ON(faila == failb); 1469 if (failb < faila) 1470 swap(faila, failb); 1471 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 1472 __func__, (unsigned long long)sh->sector, faila, failb); 1473 1474 atomic_inc(&sh->count); 1475 1476 if (failb == syndrome_disks+1) { 1477 /* Q disk is one of the missing disks */ 1478 if (faila == syndrome_disks) { 1479 /* Missing P+Q, just recompute */ 1480 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1481 ops_complete_compute, sh, 1482 to_addr_conv(sh, percpu, 0)); 1483 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 1484 STRIPE_SIZE, &submit); 1485 } else { 1486 struct page *dest; 1487 int data_target; 1488 int qd_idx = sh->qd_idx; 1489 1490 /* Missing D+Q: recompute D from P, then recompute Q */ 1491 if (target == qd_idx) 1492 data_target = target2; 1493 else 1494 data_target = target; 1495 1496 count = 0; 1497 for (i = disks; i-- ; ) { 1498 if (i == data_target || i == qd_idx) 1499 continue; 1500 blocks[count++] = sh->dev[i].page; 1501 } 1502 dest = sh->dev[data_target].page; 1503 init_async_submit(&submit, 1504 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 1505 NULL, NULL, NULL, 1506 to_addr_conv(sh, percpu, 0)); 1507 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 1508 &submit); 1509 1510 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); 1511 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 1512 ops_complete_compute, sh, 1513 to_addr_conv(sh, percpu, 0)); 1514 return async_gen_syndrome(blocks, 0, count+2, 1515 STRIPE_SIZE, &submit); 1516 } 1517 } else { 1518 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 1519 ops_complete_compute, sh, 1520 to_addr_conv(sh, percpu, 0)); 1521 if (failb == syndrome_disks) { 1522 /* We're missing D+P. */ 1523 return async_raid6_datap_recov(syndrome_disks+2, 1524 STRIPE_SIZE, faila, 1525 blocks, &submit); 1526 } else { 1527 /* We're missing D+D. */ 1528 return async_raid6_2data_recov(syndrome_disks+2, 1529 STRIPE_SIZE, faila, failb, 1530 blocks, &submit); 1531 } 1532 } 1533 } 1534 1535 static void ops_complete_prexor(void *stripe_head_ref) 1536 { 1537 struct stripe_head *sh = stripe_head_ref; 1538 1539 pr_debug("%s: stripe %llu\n", __func__, 1540 (unsigned long long)sh->sector); 1541 1542 if (r5c_is_writeback(sh->raid_conf->log)) 1543 /* 1544 * raid5-cache write back uses orig_page during prexor. 
1545 * After prexor, it is time to free orig_page 1546 */ 1547 r5c_release_extra_page(sh); 1548 } 1549 1550 static struct dma_async_tx_descriptor * 1551 ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, 1552 struct dma_async_tx_descriptor *tx) 1553 { 1554 int disks = sh->disks; 1555 struct page **xor_srcs = to_addr_page(percpu, 0); 1556 int count = 0, pd_idx = sh->pd_idx, i; 1557 struct async_submit_ctl submit; 1558 1559 /* existing parity data subtracted */ 1560 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1561 1562 BUG_ON(sh->batch_head); 1563 pr_debug("%s: stripe %llu\n", __func__, 1564 (unsigned long long)sh->sector); 1565 1566 for (i = disks; i--; ) { 1567 struct r5dev *dev = &sh->dev[i]; 1568 /* Only process blocks that are known to be uptodate */ 1569 if (test_bit(R5_InJournal, &dev->flags)) 1570 xor_srcs[count++] = dev->orig_page; 1571 else if (test_bit(R5_Wantdrain, &dev->flags)) 1572 xor_srcs[count++] = dev->page; 1573 } 1574 1575 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 1576 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1577 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1578 1579 return tx; 1580 } 1581 1582 static struct dma_async_tx_descriptor * 1583 ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, 1584 struct dma_async_tx_descriptor *tx) 1585 { 1586 struct page **blocks = to_addr_page(percpu, 0); 1587 int count; 1588 struct async_submit_ctl submit; 1589 1590 pr_debug("%s: stripe %llu\n", __func__, 1591 (unsigned long long)sh->sector); 1592 1593 count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN); 1594 1595 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, 1596 ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); 1597 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1598 1599 return tx; 1600 } 1601 1602 static struct dma_async_tx_descriptor * 1603 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 1604 { 1605 struct r5conf *conf = sh->raid_conf; 1606 int disks = sh->disks; 1607 int i; 1608 struct stripe_head *head_sh = sh; 1609 1610 pr_debug("%s: stripe %llu\n", __func__, 1611 (unsigned long long)sh->sector); 1612 1613 for (i = disks; i--; ) { 1614 struct r5dev *dev; 1615 struct bio *chosen; 1616 1617 sh = head_sh; 1618 if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) { 1619 struct bio *wbi; 1620 1621 again: 1622 dev = &sh->dev[i]; 1623 /* 1624 * clear R5_InJournal, so when rewriting a page in 1625 * journal, it is not skipped by r5l_log_stripe() 1626 */ 1627 clear_bit(R5_InJournal, &dev->flags); 1628 spin_lock_irq(&sh->stripe_lock); 1629 chosen = dev->towrite; 1630 dev->towrite = NULL; 1631 sh->overwrite_disks = 0; 1632 BUG_ON(dev->written); 1633 wbi = dev->written = chosen; 1634 spin_unlock_irq(&sh->stripe_lock); 1635 WARN_ON(dev->page != dev->orig_page); 1636 1637 while (wbi && wbi->bi_iter.bi_sector < 1638 dev->sector + STRIPE_SECTORS) { 1639 if (wbi->bi_opf & REQ_FUA) 1640 set_bit(R5_WantFUA, &dev->flags); 1641 if (wbi->bi_opf & REQ_SYNC) 1642 set_bit(R5_SyncIO, &dev->flags); 1643 if (bio_op(wbi) == REQ_OP_DISCARD) 1644 set_bit(R5_Discard, &dev->flags); 1645 else { 1646 tx = async_copy_data(1, wbi, &dev->page, 1647 dev->sector, tx, sh, 1648 r5c_is_writeback(conf->log)); 1649 if (dev->page != dev->orig_page && 1650 !r5c_is_writeback(conf->log)) { 1651 set_bit(R5_SkipCopy, &dev->flags); 1652 clear_bit(R5_UPTODATE, &dev->flags); 1653 clear_bit(R5_OVERWRITE, &dev->flags); 
1654 } 1655 } 1656 wbi = r5_next_bio(wbi, dev->sector); 1657 } 1658 1659 if (head_sh->batch_head) { 1660 sh = list_first_entry(&sh->batch_list, 1661 struct stripe_head, 1662 batch_list); 1663 if (sh == head_sh) 1664 continue; 1665 goto again; 1666 } 1667 } 1668 } 1669 1670 return tx; 1671 } 1672 1673 static void ops_complete_reconstruct(void *stripe_head_ref) 1674 { 1675 struct stripe_head *sh = stripe_head_ref; 1676 int disks = sh->disks; 1677 int pd_idx = sh->pd_idx; 1678 int qd_idx = sh->qd_idx; 1679 int i; 1680 bool fua = false, sync = false, discard = false; 1681 1682 pr_debug("%s: stripe %llu\n", __func__, 1683 (unsigned long long)sh->sector); 1684 1685 for (i = disks; i--; ) { 1686 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1687 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1688 discard |= test_bit(R5_Discard, &sh->dev[i].flags); 1689 } 1690 1691 for (i = disks; i--; ) { 1692 struct r5dev *dev = &sh->dev[i]; 1693 1694 if (dev->written || i == pd_idx || i == qd_idx) { 1695 if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1696 set_bit(R5_UPTODATE, &dev->flags); 1697 if (fua) 1698 set_bit(R5_WantFUA, &dev->flags); 1699 if (sync) 1700 set_bit(R5_SyncIO, &dev->flags); 1701 } 1702 } 1703 1704 if (sh->reconstruct_state == reconstruct_state_drain_run) 1705 sh->reconstruct_state = reconstruct_state_drain_result; 1706 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1707 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1708 else { 1709 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1710 sh->reconstruct_state = reconstruct_state_result; 1711 } 1712 1713 set_bit(STRIPE_HANDLE, &sh->state); 1714 raid5_release_stripe(sh); 1715 } 1716 1717 static void 1718 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1719 struct dma_async_tx_descriptor *tx) 1720 { 1721 int disks = sh->disks; 1722 struct page **xor_srcs; 1723 struct async_submit_ctl submit; 1724 int count, pd_idx = sh->pd_idx, i; 1725 struct page *xor_dest; 1726 int prexor = 0; 1727 unsigned long flags; 1728 int j = 0; 1729 struct stripe_head *head_sh = sh; 1730 int last_stripe; 1731 1732 pr_debug("%s: stripe %llu\n", __func__, 1733 (unsigned long long)sh->sector); 1734 1735 for (i = 0; i < sh->disks; i++) { 1736 if (pd_idx == i) 1737 continue; 1738 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1739 break; 1740 } 1741 if (i >= sh->disks) { 1742 atomic_inc(&sh->count); 1743 set_bit(R5_Discard, &sh->dev[pd_idx].flags); 1744 ops_complete_reconstruct(sh); 1745 return; 1746 } 1747 again: 1748 count = 0; 1749 xor_srcs = to_addr_page(percpu, j); 1750 /* check if prexor is active which means only process blocks 1751 * that are part of a read-modify-write (written) 1752 */ 1753 if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1754 prexor = 1; 1755 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1756 for (i = disks; i--; ) { 1757 struct r5dev *dev = &sh->dev[i]; 1758 if (head_sh->dev[i].written || 1759 test_bit(R5_InJournal, &head_sh->dev[i].flags)) 1760 xor_srcs[count++] = dev->page; 1761 } 1762 } else { 1763 xor_dest = sh->dev[pd_idx].page; 1764 for (i = disks; i--; ) { 1765 struct r5dev *dev = &sh->dev[i]; 1766 if (i != pd_idx) 1767 xor_srcs[count++] = dev->page; 1768 } 1769 } 1770 1771 /* 1/ if we prexor'd then the dest is reused as a source 1772 * 2/ if we did not prexor then we are redoing the parity 1773 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1774 * for the synchronous xor case 1775 */ 1776 last_stripe = !head_sh->batch_head 
|| 1777 list_first_entry(&sh->batch_list, 1778 struct stripe_head, batch_list) == head_sh; 1779 if (last_stripe) { 1780 flags = ASYNC_TX_ACK | 1781 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1782 1783 atomic_inc(&head_sh->count); 1784 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh, 1785 to_addr_conv(sh, percpu, j)); 1786 } else { 1787 flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST; 1788 init_async_submit(&submit, flags, tx, NULL, NULL, 1789 to_addr_conv(sh, percpu, j)); 1790 } 1791 1792 if (unlikely(count == 1)) 1793 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1794 else 1795 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1796 if (!last_stripe) { 1797 j++; 1798 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1799 batch_list); 1800 goto again; 1801 } 1802 } 1803 1804 static void 1805 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1806 struct dma_async_tx_descriptor *tx) 1807 { 1808 struct async_submit_ctl submit; 1809 struct page **blocks; 1810 int count, i, j = 0; 1811 struct stripe_head *head_sh = sh; 1812 int last_stripe; 1813 int synflags; 1814 unsigned long txflags; 1815 1816 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1817 1818 for (i = 0; i < sh->disks; i++) { 1819 if (sh->pd_idx == i || sh->qd_idx == i) 1820 continue; 1821 if (!test_bit(R5_Discard, &sh->dev[i].flags)) 1822 break; 1823 } 1824 if (i >= sh->disks) { 1825 atomic_inc(&sh->count); 1826 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 1827 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 1828 ops_complete_reconstruct(sh); 1829 return; 1830 } 1831 1832 again: 1833 blocks = to_addr_page(percpu, j); 1834 1835 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1836 synflags = SYNDROME_SRC_WRITTEN; 1837 txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST; 1838 } else { 1839 synflags = SYNDROME_SRC_ALL; 1840 txflags = ASYNC_TX_ACK; 1841 } 1842 1843 count = set_syndrome_sources(blocks, sh, synflags); 1844 last_stripe = !head_sh->batch_head || 1845 list_first_entry(&sh->batch_list, 1846 struct stripe_head, batch_list) == head_sh; 1847 1848 if (last_stripe) { 1849 atomic_inc(&head_sh->count); 1850 init_async_submit(&submit, txflags, tx, ops_complete_reconstruct, 1851 head_sh, to_addr_conv(sh, percpu, j)); 1852 } else 1853 init_async_submit(&submit, 0, tx, NULL, NULL, 1854 to_addr_conv(sh, percpu, j)); 1855 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1856 if (!last_stripe) { 1857 j++; 1858 sh = list_first_entry(&sh->batch_list, struct stripe_head, 1859 batch_list); 1860 goto again; 1861 } 1862 } 1863 1864 static void ops_complete_check(void *stripe_head_ref) 1865 { 1866 struct stripe_head *sh = stripe_head_ref; 1867 1868 pr_debug("%s: stripe %llu\n", __func__, 1869 (unsigned long long)sh->sector); 1870 1871 sh->check_state = check_state_check_result; 1872 set_bit(STRIPE_HANDLE, &sh->state); 1873 raid5_release_stripe(sh); 1874 } 1875 1876 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1877 { 1878 int disks = sh->disks; 1879 int pd_idx = sh->pd_idx; 1880 int qd_idx = sh->qd_idx; 1881 struct page *xor_dest; 1882 struct page **xor_srcs = to_addr_page(percpu, 0); 1883 struct dma_async_tx_descriptor *tx; 1884 struct async_submit_ctl submit; 1885 int count; 1886 int i; 1887 1888 pr_debug("%s: stripe %llu\n", __func__, 1889 (unsigned long long)sh->sector); 1890 1891 BUG_ON(sh->batch_head); 1892 count = 0; 1893 
xor_dest = sh->dev[pd_idx].page; 1894 xor_srcs[count++] = xor_dest; 1895 for (i = disks; i--; ) { 1896 if (i == pd_idx || i == qd_idx) 1897 continue; 1898 xor_srcs[count++] = sh->dev[i].page; 1899 } 1900 1901 init_async_submit(&submit, 0, NULL, NULL, NULL, 1902 to_addr_conv(sh, percpu, 0)); 1903 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1904 &sh->ops.zero_sum_result, &submit); 1905 1906 atomic_inc(&sh->count); 1907 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1908 tx = async_trigger_callback(&submit); 1909 } 1910 1911 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1912 { 1913 struct page **srcs = to_addr_page(percpu, 0); 1914 struct async_submit_ctl submit; 1915 int count; 1916 1917 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1918 (unsigned long long)sh->sector, checkp); 1919 1920 BUG_ON(sh->batch_head); 1921 count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL); 1922 if (!checkp) 1923 srcs[count] = NULL; 1924 1925 atomic_inc(&sh->count); 1926 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1927 sh, to_addr_conv(sh, percpu, 0)); 1928 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1929 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1930 } 1931 1932 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1933 { 1934 int overlap_clear = 0, i, disks = sh->disks; 1935 struct dma_async_tx_descriptor *tx = NULL; 1936 struct r5conf *conf = sh->raid_conf; 1937 int level = conf->level; 1938 struct raid5_percpu *percpu; 1939 unsigned long cpu; 1940 1941 cpu = get_cpu(); 1942 percpu = per_cpu_ptr(conf->percpu, cpu); 1943 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1944 ops_run_biofill(sh); 1945 overlap_clear++; 1946 } 1947 1948 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1949 if (level < 6) 1950 tx = ops_run_compute5(sh, percpu); 1951 else { 1952 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1953 tx = ops_run_compute6_1(sh, percpu); 1954 else 1955 tx = ops_run_compute6_2(sh, percpu); 1956 } 1957 /* terminate the chain if reconstruct is not set to be run */ 1958 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1959 async_tx_ack(tx); 1960 } 1961 1962 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) { 1963 if (level < 6) 1964 tx = ops_run_prexor5(sh, percpu, tx); 1965 else 1966 tx = ops_run_prexor6(sh, percpu, tx); 1967 } 1968 1969 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1970 tx = ops_run_biodrain(sh, tx); 1971 overlap_clear++; 1972 } 1973 1974 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1975 if (level < 6) 1976 ops_run_reconstruct5(sh, percpu, tx); 1977 else 1978 ops_run_reconstruct6(sh, percpu, tx); 1979 } 1980 1981 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1982 if (sh->check_state == check_state_run) 1983 ops_run_check_p(sh, percpu); 1984 else if (sh->check_state == check_state_run_q) 1985 ops_run_check_pq(sh, percpu, 0); 1986 else if (sh->check_state == check_state_run_pq) 1987 ops_run_check_pq(sh, percpu, 1); 1988 else 1989 BUG(); 1990 } 1991 1992 if (overlap_clear && !sh->batch_head) 1993 for (i = disks; i--; ) { 1994 struct r5dev *dev = &sh->dev[i]; 1995 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1996 wake_up(&sh->raid_conf->wait_for_overlap); 1997 } 1998 put_cpu(); 1999 } 2000 2001 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, 2002 int disks) 2003 { 2004 struct stripe_head *sh; 2005 int i; 2006 2007 sh = kmem_cache_zalloc(sc, gfp); 2008 if (sh) { 2009 
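		/*
		 * Editor's note: each r5dev embeds two request bios - 'req'
		 * (backed by the single-entry bio_vec 'vec') for the member
		 * device and 'rreq'/'rvec' for its replacement - initialised
		 * once here and then reset and reused for every I/O this
		 * slot issues.
		 */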
spin_lock_init(&sh->stripe_lock); 2010 spin_lock_init(&sh->batch_lock); 2011 INIT_LIST_HEAD(&sh->batch_list); 2012 INIT_LIST_HEAD(&sh->lru); 2013 INIT_LIST_HEAD(&sh->r5c); 2014 INIT_LIST_HEAD(&sh->log_list); 2015 atomic_set(&sh->count, 1); 2016 sh->log_start = MaxSector; 2017 for (i = 0; i < disks; i++) { 2018 struct r5dev *dev = &sh->dev[i]; 2019 2020 bio_init(&dev->req, &dev->vec, 1); 2021 bio_init(&dev->rreq, &dev->rvec, 1); 2022 } 2023 } 2024 return sh; 2025 } 2026 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) 2027 { 2028 struct stripe_head *sh; 2029 2030 sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); 2031 if (!sh) 2032 return 0; 2033 2034 sh->raid_conf = conf; 2035 2036 if (grow_buffers(sh, gfp)) { 2037 shrink_buffers(sh); 2038 kmem_cache_free(conf->slab_cache, sh); 2039 return 0; 2040 } 2041 sh->hash_lock_index = 2042 conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; 2043 /* we just created an active stripe so... */ 2044 atomic_inc(&conf->active_stripes); 2045 2046 raid5_release_stripe(sh); 2047 conf->max_nr_stripes++; 2048 return 1; 2049 } 2050 2051 static int grow_stripes(struct r5conf *conf, int num) 2052 { 2053 struct kmem_cache *sc; 2054 int devs = max(conf->raid_disks, conf->previous_raid_disks); 2055 2056 if (conf->mddev->gendisk) 2057 sprintf(conf->cache_name[0], 2058 "raid%d-%s", conf->level, mdname(conf->mddev)); 2059 else 2060 sprintf(conf->cache_name[0], 2061 "raid%d-%p", conf->level, conf->mddev); 2062 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); 2063 2064 conf->active_name = 0; 2065 sc = kmem_cache_create(conf->cache_name[conf->active_name], 2066 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 2067 0, 0, NULL); 2068 if (!sc) 2069 return 1; 2070 conf->slab_cache = sc; 2071 conf->pool_size = devs; 2072 while (num--) 2073 if (!grow_one_stripe(conf, GFP_KERNEL)) 2074 return 1; 2075 2076 return 0; 2077 } 2078 2079 /** 2080 * scribble_len - return the required size of the scribble region 2081 * @num - total number of disks in the array 2082 * 2083 * The size must be enough to contain: 2084 * 1/ a struct page pointer for each device in the array +2 2085 * 2/ room to convert each entry in (1) to its corresponding dma 2086 * (dma_map_page()) or page (page_address()) address. 2087 * 2088 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 2089 * calculate over all devices (not just the data blocks), using zeros in place 2090 * of the P and Q blocks. 2091 */ 2092 static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) 2093 { 2094 struct flex_array *ret; 2095 size_t len; 2096 2097 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 2098 ret = flex_array_alloc(len, cnt, flags); 2099 if (!ret) 2100 return NULL; 2101 /* always prealloc all elements, so no locking is required */ 2102 if (flex_array_prealloc(ret, 0, cnt, flags)) { 2103 flex_array_free(ret); 2104 return NULL; 2105 } 2106 return ret; 2107 } 2108 2109 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) 2110 { 2111 unsigned long cpu; 2112 int err = 0; 2113 2114 /* 2115 * Never shrink. And mddev_suspend() could deadlock if this is called 2116 * from raid5d. 
In that case, scribble_disks and scribble_sectors 2117 * should equal new_disks and new_sectors 2118 */ 2119 if (conf->scribble_disks >= new_disks && 2120 conf->scribble_sectors >= new_sectors) 2121 return 0; 2122 mddev_suspend(conf->mddev); 2123 get_online_cpus(); 2124 for_each_present_cpu(cpu) { 2125 struct raid5_percpu *percpu; 2126 struct flex_array *scribble; 2127 2128 percpu = per_cpu_ptr(conf->percpu, cpu); 2129 scribble = scribble_alloc(new_disks, 2130 new_sectors / STRIPE_SECTORS, 2131 GFP_NOIO); 2132 2133 if (scribble) { 2134 flex_array_free(percpu->scribble); 2135 percpu->scribble = scribble; 2136 } else { 2137 err = -ENOMEM; 2138 break; 2139 } 2140 } 2141 put_online_cpus(); 2142 mddev_resume(conf->mddev); 2143 if (!err) { 2144 conf->scribble_disks = new_disks; 2145 conf->scribble_sectors = new_sectors; 2146 } 2147 return err; 2148 } 2149 2150 static int resize_stripes(struct r5conf *conf, int newsize) 2151 { 2152 /* Make all the stripes able to hold 'newsize' devices. 2153 * New slots in each stripe get 'page' set to a new page. 2154 * 2155 * This happens in stages: 2156 * 1/ create a new kmem_cache and allocate the required number of 2157 * stripe_heads. 2158 * 2/ gather all the old stripe_heads and transfer the pages across 2159 * to the new stripe_heads. This will have the side effect of 2160 * freezing the array as once all stripe_heads have been collected, 2161 * no IO will be possible. Old stripe heads are freed once their 2162 * pages have been transferred over, and the old kmem_cache is 2163 * freed when all stripes are done. 2164 * 3/ reallocate conf->disks to be suitably bigger. If this fails, 2165 * we simply return a failure status - no need to clean anything up. 2166 * 4/ allocate new pages for the new slots in the new stripe_heads. 2167 * If this fails, we don't bother trying to shrink the 2168 * stripe_heads down again, we just leave them as they are. 2169 * As each stripe_head is processed the new one is released into 2170 * active service. 2171 * 2172 * Once step2 is started, we cannot afford to wait for a write, 2173 * so we use GFP_NOIO allocations. 2174 */ 2175 struct stripe_head *osh, *nsh; 2176 LIST_HEAD(newstripes); 2177 struct disk_info *ndisks; 2178 int err; 2179 struct kmem_cache *sc; 2180 int i; 2181 int hash, cnt; 2182 2183 if (newsize <= conf->pool_size) 2184 return 0; /* never bother to shrink */ 2185 2186 err = md_allow_write(conf->mddev); 2187 if (err) 2188 return err; 2189 2190 /* Step 1 */ 2191 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 2192 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 2193 0, 0, NULL); 2194 if (!sc) 2195 return -ENOMEM; 2196 2197 /* Need to ensure auto-resizing doesn't interfere */ 2198 mutex_lock(&conf->cache_size_mutex); 2199 2200 for (i = conf->max_nr_stripes; i; i--) { 2201 nsh = alloc_stripe(sc, GFP_KERNEL, newsize); 2202 if (!nsh) 2203 break; 2204 2205 nsh->raid_conf = conf; 2206 list_add(&nsh->lru, &newstripes); 2207 } 2208 if (i) { 2209 /* didn't get enough, give up */ 2210 while (!list_empty(&newstripes)) { 2211 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2212 list_del(&nsh->lru); 2213 kmem_cache_free(sc, nsh); 2214 } 2215 kmem_cache_destroy(sc); 2216 mutex_unlock(&conf->cache_size_mutex); 2217 return -ENOMEM; 2218 } 2219 /* Step 2 - Must use GFP_NOIO now.
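	 * (Editor's note: GFP_NOIO matters because once inactive stripes are
	 * being collected, a reclaim-triggered write could itself need a
	 * stripe from this array and deadlock.)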
2220 * OK, we have enough stripes, start collecting inactive 2221 * stripes and copying them over 2222 */ 2223 hash = 0; 2224 cnt = 0; 2225 list_for_each_entry(nsh, &newstripes, lru) { 2226 lock_device_hash_lock(conf, hash); 2227 wait_event_cmd(conf->wait_for_stripe, 2228 !list_empty(conf->inactive_list + hash), 2229 unlock_device_hash_lock(conf, hash), 2230 lock_device_hash_lock(conf, hash)); 2231 osh = get_free_stripe(conf, hash); 2232 unlock_device_hash_lock(conf, hash); 2233 2234 for(i=0; i<conf->pool_size; i++) { 2235 nsh->dev[i].page = osh->dev[i].page; 2236 nsh->dev[i].orig_page = osh->dev[i].page; 2237 } 2238 nsh->hash_lock_index = hash; 2239 kmem_cache_free(conf->slab_cache, osh); 2240 cnt++; 2241 if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + 2242 !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { 2243 hash++; 2244 cnt = 0; 2245 } 2246 } 2247 kmem_cache_destroy(conf->slab_cache); 2248 2249 /* Step 3. 2250 * At this point, we are holding all the stripes so the array 2251 * is completely stalled, so now is a good time to resize 2252 * conf->disks and the scribble region 2253 */ 2254 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 2255 if (ndisks) { 2256 for (i = 0; i < conf->pool_size; i++) 2257 ndisks[i] = conf->disks[i]; 2258 2259 for (i = conf->pool_size; i < newsize; i++) { 2260 ndisks[i].extra_page = alloc_page(GFP_NOIO); 2261 if (!ndisks[i].extra_page) 2262 err = -ENOMEM; 2263 } 2264 2265 if (err) { 2266 for (i = conf->pool_size; i < newsize; i++) 2267 if (ndisks[i].extra_page) 2268 put_page(ndisks[i].extra_page); 2269 kfree(ndisks); 2270 } else { 2271 kfree(conf->disks); 2272 conf->disks = ndisks; 2273 } 2274 } else 2275 err = -ENOMEM; 2276 2277 mutex_unlock(&conf->cache_size_mutex); 2278 /* Step 4, return new stripes to service */ 2279 while(!list_empty(&newstripes)) { 2280 nsh = list_entry(newstripes.next, struct stripe_head, lru); 2281 list_del_init(&nsh->lru); 2282 2283 for (i=conf->raid_disks; i < newsize; i++) 2284 if (nsh->dev[i].page == NULL) { 2285 struct page *p = alloc_page(GFP_NOIO); 2286 nsh->dev[i].page = p; 2287 nsh->dev[i].orig_page = p; 2288 if (!p) 2289 err = -ENOMEM; 2290 } 2291 raid5_release_stripe(nsh); 2292 } 2293 /* critical section pass, GFP_NOIO no longer needed */ 2294 2295 conf->slab_cache = sc; 2296 conf->active_name = 1-conf->active_name; 2297 if (!err) 2298 conf->pool_size = newsize; 2299 return err; 2300 } 2301 2302 static int drop_one_stripe(struct r5conf *conf) 2303 { 2304 struct stripe_head *sh; 2305 int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; 2306 2307 spin_lock_irq(conf->hash_locks + hash); 2308 sh = get_free_stripe(conf, hash); 2309 spin_unlock_irq(conf->hash_locks + hash); 2310 if (!sh) 2311 return 0; 2312 BUG_ON(atomic_read(&sh->count)); 2313 shrink_buffers(sh); 2314 kmem_cache_free(conf->slab_cache, sh); 2315 atomic_dec(&conf->active_stripes); 2316 conf->max_nr_stripes--; 2317 return 1; 2318 } 2319 2320 static void shrink_stripes(struct r5conf *conf) 2321 { 2322 while (conf->max_nr_stripes && 2323 drop_one_stripe(conf)) 2324 ; 2325 2326 kmem_cache_destroy(conf->slab_cache); 2327 conf->slab_cache = NULL; 2328 } 2329 2330 static void raid5_end_read_request(struct bio * bi) 2331 { 2332 struct stripe_head *sh = bi->bi_private; 2333 struct r5conf *conf = sh->raid_conf; 2334 int disks = sh->disks, i; 2335 char b[BDEVNAME_SIZE]; 2336 struct md_rdev *rdev = NULL; 2337 sector_t s; 2338 2339 for (i=0 ; i<disks; i++) 2340 if (bi == &sh->dev[i].req) 2341 break; 2342 2343 
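	/*
	 * Editor's note: the loop above matched the completed bio against the
	 * request bios embedded in each r5dev, so 'i' now names the member
	 * device this read was issued to; i == disks means the bio was not
	 * ours at all and is treated as a bug below.
	 */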
pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", 2344 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2345 bi->bi_error); 2346 if (i == disks) { 2347 bio_reset(bi); 2348 BUG(); 2349 return; 2350 } 2351 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2352 /* If replacement finished while this request was outstanding, 2353 * 'replacement' might be NULL already. 2354 * In that case it moved down to 'rdev'. 2355 * rdev is not removed until all requests are finished. 2356 */ 2357 rdev = conf->disks[i].replacement; 2358 if (!rdev) 2359 rdev = conf->disks[i].rdev; 2360 2361 if (use_new_offset(conf, sh)) 2362 s = sh->sector + rdev->new_data_offset; 2363 else 2364 s = sh->sector + rdev->data_offset; 2365 if (!bi->bi_error) { 2366 set_bit(R5_UPTODATE, &sh->dev[i].flags); 2367 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2368 /* Note that this cannot happen on a 2369 * replacement device. We just fail those on 2370 * any error 2371 */ 2372 pr_info_ratelimited( 2373 "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", 2374 mdname(conf->mddev), STRIPE_SECTORS, 2375 (unsigned long long)s, 2376 bdevname(rdev->bdev, b)); 2377 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 2378 clear_bit(R5_ReadError, &sh->dev[i].flags); 2379 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2380 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2381 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2382 2383 if (atomic_read(&rdev->read_errors)) 2384 atomic_set(&rdev->read_errors, 0); 2385 } else { 2386 const char *bdn = bdevname(rdev->bdev, b); 2387 int retry = 0; 2388 int set_bad = 0; 2389 2390 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 2391 atomic_inc(&rdev->read_errors); 2392 if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) 2393 pr_warn_ratelimited( 2394 "md/raid:%s: read error on replacement device (sector %llu on %s).\n", 2395 mdname(conf->mddev), 2396 (unsigned long long)s, 2397 bdn); 2398 else if (conf->mddev->degraded >= conf->max_degraded) { 2399 set_bad = 1; 2400 pr_warn_ratelimited( 2401 "md/raid:%s: read error not correctable (sector %llu on %s).\n", 2402 mdname(conf->mddev), 2403 (unsigned long long)s, 2404 bdn); 2405 } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { 2406 /* Oh, no!!! */ 2407 set_bad = 1; 2408 pr_warn_ratelimited( 2409 "md/raid:%s: read error NOT corrected!! 
(sector %llu on %s).\n", 2410 mdname(conf->mddev), 2411 (unsigned long long)s, 2412 bdn); 2413 } else if (atomic_read(&rdev->read_errors) 2414 > conf->max_nr_stripes) 2415 pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", 2416 mdname(conf->mddev), bdn); 2417 else 2418 retry = 1; 2419 if (set_bad && test_bit(In_sync, &rdev->flags) 2420 && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2421 retry = 1; 2422 if (retry) 2423 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 2424 set_bit(R5_ReadError, &sh->dev[i].flags); 2425 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2426 } else 2427 set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2428 else { 2429 clear_bit(R5_ReadError, &sh->dev[i].flags); 2430 clear_bit(R5_ReWrite, &sh->dev[i].flags); 2431 if (!(set_bad 2432 && test_bit(In_sync, &rdev->flags) 2433 && rdev_set_badblocks( 2434 rdev, sh->sector, STRIPE_SECTORS, 0))) 2435 md_error(conf->mddev, rdev); 2436 } 2437 } 2438 rdev_dec_pending(rdev, conf->mddev); 2439 bio_reset(bi); 2440 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2441 set_bit(STRIPE_HANDLE, &sh->state); 2442 raid5_release_stripe(sh); 2443 } 2444 2445 static void raid5_end_write_request(struct bio *bi) 2446 { 2447 struct stripe_head *sh = bi->bi_private; 2448 struct r5conf *conf = sh->raid_conf; 2449 int disks = sh->disks, i; 2450 struct md_rdev *uninitialized_var(rdev); 2451 sector_t first_bad; 2452 int bad_sectors; 2453 int replacement = 0; 2454 2455 for (i = 0 ; i < disks; i++) { 2456 if (bi == &sh->dev[i].req) { 2457 rdev = conf->disks[i].rdev; 2458 break; 2459 } 2460 if (bi == &sh->dev[i].rreq) { 2461 rdev = conf->disks[i].replacement; 2462 if (rdev) 2463 replacement = 1; 2464 else 2465 /* rdev was removed and 'replacement' 2466 * replaced it. rdev is not removed 2467 * until all requests are finished. 2468 */ 2469 rdev = conf->disks[i].rdev; 2470 break; 2471 } 2472 } 2473 pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", 2474 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 2475 bi->bi_error); 2476 if (i == disks) { 2477 bio_reset(bi); 2478 BUG(); 2479 return; 2480 } 2481 2482 if (replacement) { 2483 if (bi->bi_error) 2484 md_error(conf->mddev, rdev); 2485 else if (is_badblock(rdev, sh->sector, 2486 STRIPE_SECTORS, 2487 &first_bad, &bad_sectors)) 2488 set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); 2489 } else { 2490 if (bi->bi_error) { 2491 set_bit(STRIPE_DEGRADED, &sh->state); 2492 set_bit(WriteErrorSeen, &rdev->flags); 2493 set_bit(R5_WriteError, &sh->dev[i].flags); 2494 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2495 set_bit(MD_RECOVERY_NEEDED, 2496 &rdev->mddev->recovery); 2497 } else if (is_badblock(rdev, sh->sector, 2498 STRIPE_SECTORS, 2499 &first_bad, &bad_sectors)) { 2500 set_bit(R5_MadeGood, &sh->dev[i].flags); 2501 if (test_bit(R5_ReadError, &sh->dev[i].flags)) 2502 /* That was a successful write so make 2503 * sure it looks like we already did 2504 * a re-write. 
2505 */ 2506 set_bit(R5_ReWrite, &sh->dev[i].flags); 2507 } 2508 } 2509 rdev_dec_pending(rdev, conf->mddev); 2510 2511 if (sh->batch_head && bi->bi_error && !replacement) 2512 set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); 2513 2514 bio_reset(bi); 2515 if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) 2516 clear_bit(R5_LOCKED, &sh->dev[i].flags); 2517 set_bit(STRIPE_HANDLE, &sh->state); 2518 raid5_release_stripe(sh); 2519 2520 if (sh->batch_head && sh != sh->batch_head) 2521 raid5_release_stripe(sh->batch_head); 2522 } 2523 2524 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2525 { 2526 struct r5dev *dev = &sh->dev[i]; 2527 2528 dev->flags = 0; 2529 dev->sector = raid5_compute_blocknr(sh, i, previous); 2530 } 2531 2532 static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) 2533 { 2534 char b[BDEVNAME_SIZE]; 2535 struct r5conf *conf = mddev->private; 2536 unsigned long flags; 2537 pr_debug("raid456: error called\n"); 2538 2539 spin_lock_irqsave(&conf->device_lock, flags); 2540 clear_bit(In_sync, &rdev->flags); 2541 mddev->degraded = calc_degraded(conf); 2542 spin_unlock_irqrestore(&conf->device_lock, flags); 2543 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2544 2545 set_bit(Blocked, &rdev->flags); 2546 set_bit(Faulty, &rdev->flags); 2547 set_mask_bits(&mddev->sb_flags, 0, 2548 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 2549 pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" 2550 "md/raid:%s: Operation continuing on %d devices.\n", 2551 mdname(mddev), 2552 bdevname(rdev->bdev, b), 2553 mdname(mddev), 2554 conf->raid_disks - mddev->degraded); 2555 } 2556 2557 /* 2558 * Input: a 'big' sector number, 2559 * Output: index of the data and parity disk, and the sector # in them. 2560 */ 2561 sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, 2562 int previous, int *dd_idx, 2563 struct stripe_head *sh) 2564 { 2565 sector_t stripe, stripe2; 2566 sector_t chunk_number; 2567 unsigned int chunk_offset; 2568 int pd_idx, qd_idx; 2569 int ddf_layout = 0; 2570 sector_t new_sector; 2571 int algorithm = previous ? conf->prev_algo 2572 : conf->algorithm; 2573 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2574 : conf->chunk_sectors; 2575 int raid_disks = previous ? conf->previous_raid_disks 2576 : conf->raid_disks; 2577 int data_disks = raid_disks - conf->max_degraded; 2578 2579 /* First compute the information on this sector */ 2580 2581 /* 2582 * Compute the chunk number and the sector offset inside the chunk 2583 */ 2584 chunk_offset = sector_div(r_sector, sectors_per_chunk); 2585 chunk_number = r_sector; 2586 2587 /* 2588 * Compute the stripe number 2589 */ 2590 stripe = chunk_number; 2591 *dd_idx = sector_div(stripe, data_disks); 2592 stripe2 = stripe; 2593 /* 2594 * Select the parity disk based on the user selected algorithm. 
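	 *
	 * Editor's example (illustrative, not from the original source): for a
	 * 5-device RAID-5 using ALGORITHM_LEFT_SYMMETRIC, data_disks = 4.  If
	 * stripe2 = 3, sector_div(stripe2, 5) yields 3, so pd_idx = 4 - 3 = 1,
	 * and a request that arrived with *dd_idx = 2 is remapped to
	 * (pd_idx + 1 + 2) % 5 = 4.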
2595 */ 2596 pd_idx = qd_idx = -1; 2597 switch(conf->level) { 2598 case 4: 2599 pd_idx = data_disks; 2600 break; 2601 case 5: 2602 switch (algorithm) { 2603 case ALGORITHM_LEFT_ASYMMETRIC: 2604 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2605 if (*dd_idx >= pd_idx) 2606 (*dd_idx)++; 2607 break; 2608 case ALGORITHM_RIGHT_ASYMMETRIC: 2609 pd_idx = sector_div(stripe2, raid_disks); 2610 if (*dd_idx >= pd_idx) 2611 (*dd_idx)++; 2612 break; 2613 case ALGORITHM_LEFT_SYMMETRIC: 2614 pd_idx = data_disks - sector_div(stripe2, raid_disks); 2615 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2616 break; 2617 case ALGORITHM_RIGHT_SYMMETRIC: 2618 pd_idx = sector_div(stripe2, raid_disks); 2619 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2620 break; 2621 case ALGORITHM_PARITY_0: 2622 pd_idx = 0; 2623 (*dd_idx)++; 2624 break; 2625 case ALGORITHM_PARITY_N: 2626 pd_idx = data_disks; 2627 break; 2628 default: 2629 BUG(); 2630 } 2631 break; 2632 case 6: 2633 2634 switch (algorithm) { 2635 case ALGORITHM_LEFT_ASYMMETRIC: 2636 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2637 qd_idx = pd_idx + 1; 2638 if (pd_idx == raid_disks-1) { 2639 (*dd_idx)++; /* Q D D D P */ 2640 qd_idx = 0; 2641 } else if (*dd_idx >= pd_idx) 2642 (*dd_idx) += 2; /* D D P Q D */ 2643 break; 2644 case ALGORITHM_RIGHT_ASYMMETRIC: 2645 pd_idx = sector_div(stripe2, raid_disks); 2646 qd_idx = pd_idx + 1; 2647 if (pd_idx == raid_disks-1) { 2648 (*dd_idx)++; /* Q D D D P */ 2649 qd_idx = 0; 2650 } else if (*dd_idx >= pd_idx) 2651 (*dd_idx) += 2; /* D D P Q D */ 2652 break; 2653 case ALGORITHM_LEFT_SYMMETRIC: 2654 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2655 qd_idx = (pd_idx + 1) % raid_disks; 2656 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2657 break; 2658 case ALGORITHM_RIGHT_SYMMETRIC: 2659 pd_idx = sector_div(stripe2, raid_disks); 2660 qd_idx = (pd_idx + 1) % raid_disks; 2661 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 2662 break; 2663 2664 case ALGORITHM_PARITY_0: 2665 pd_idx = 0; 2666 qd_idx = 1; 2667 (*dd_idx) += 2; 2668 break; 2669 case ALGORITHM_PARITY_N: 2670 pd_idx = data_disks; 2671 qd_idx = data_disks + 1; 2672 break; 2673 2674 case ALGORITHM_ROTATING_ZERO_RESTART: 2675 /* Exactly the same as RIGHT_ASYMMETRIC, but order 2676 * of blocks for computing Q is different.
2677 */ 2678 pd_idx = sector_div(stripe2, raid_disks); 2679 qd_idx = pd_idx + 1; 2680 if (pd_idx == raid_disks-1) { 2681 (*dd_idx)++; /* Q D D D P */ 2682 qd_idx = 0; 2683 } else if (*dd_idx >= pd_idx) 2684 (*dd_idx) += 2; /* D D P Q D */ 2685 ddf_layout = 1; 2686 break; 2687 2688 case ALGORITHM_ROTATING_N_RESTART: 2689 /* Same as left_asymmetric, but first stripe is 2690 * D D D P Q rather than 2691 * Q D D D P 2692 */ 2693 stripe2 += 1; 2694 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2695 qd_idx = pd_idx + 1; 2696 if (pd_idx == raid_disks-1) { 2697 (*dd_idx)++; /* Q D D D P */ 2698 qd_idx = 0; 2699 } else if (*dd_idx >= pd_idx) 2700 (*dd_idx) += 2; /* D D P Q D */ 2701 ddf_layout = 1; 2702 break; 2703 2704 case ALGORITHM_ROTATING_N_CONTINUE: 2705 /* Same as left_symmetric but Q is before P */ 2706 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 2707 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 2708 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 2709 ddf_layout = 1; 2710 break; 2711 2712 case ALGORITHM_LEFT_ASYMMETRIC_6: 2713 /* RAID5 left_asymmetric, with Q on last device */ 2714 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2715 if (*dd_idx >= pd_idx) 2716 (*dd_idx)++; 2717 qd_idx = raid_disks - 1; 2718 break; 2719 2720 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2721 pd_idx = sector_div(stripe2, raid_disks-1); 2722 if (*dd_idx >= pd_idx) 2723 (*dd_idx)++; 2724 qd_idx = raid_disks - 1; 2725 break; 2726 2727 case ALGORITHM_LEFT_SYMMETRIC_6: 2728 pd_idx = data_disks - sector_div(stripe2, raid_disks-1); 2729 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2730 qd_idx = raid_disks - 1; 2731 break; 2732 2733 case ALGORITHM_RIGHT_SYMMETRIC_6: 2734 pd_idx = sector_div(stripe2, raid_disks-1); 2735 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 2736 qd_idx = raid_disks - 1; 2737 break; 2738 2739 case ALGORITHM_PARITY_0_6: 2740 pd_idx = 0; 2741 (*dd_idx)++; 2742 qd_idx = raid_disks - 1; 2743 break; 2744 2745 default: 2746 BUG(); 2747 } 2748 break; 2749 } 2750 2751 if (sh) { 2752 sh->pd_idx = pd_idx; 2753 sh->qd_idx = qd_idx; 2754 sh->ddf_layout = ddf_layout; 2755 } 2756 /* 2757 * Finally, compute the new sector number 2758 */ 2759 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 2760 return new_sector; 2761 } 2762 2763 sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) 2764 { 2765 struct r5conf *conf = sh->raid_conf; 2766 int raid_disks = sh->disks; 2767 int data_disks = raid_disks - conf->max_degraded; 2768 sector_t new_sector = sh->sector, check; 2769 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 2770 : conf->chunk_sectors; 2771 int algorithm = previous ?
conf->prev_algo 2772 : conf->algorithm; 2773 sector_t stripe; 2774 int chunk_offset; 2775 sector_t chunk_number; 2776 int dummy1, dd_idx = i; 2777 sector_t r_sector; 2778 struct stripe_head sh2; 2779 2780 chunk_offset = sector_div(new_sector, sectors_per_chunk); 2781 stripe = new_sector; 2782 2783 if (i == sh->pd_idx) 2784 return 0; 2785 switch(conf->level) { 2786 case 4: break; 2787 case 5: 2788 switch (algorithm) { 2789 case ALGORITHM_LEFT_ASYMMETRIC: 2790 case ALGORITHM_RIGHT_ASYMMETRIC: 2791 if (i > sh->pd_idx) 2792 i--; 2793 break; 2794 case ALGORITHM_LEFT_SYMMETRIC: 2795 case ALGORITHM_RIGHT_SYMMETRIC: 2796 if (i < sh->pd_idx) 2797 i += raid_disks; 2798 i -= (sh->pd_idx + 1); 2799 break; 2800 case ALGORITHM_PARITY_0: 2801 i -= 1; 2802 break; 2803 case ALGORITHM_PARITY_N: 2804 break; 2805 default: 2806 BUG(); 2807 } 2808 break; 2809 case 6: 2810 if (i == sh->qd_idx) 2811 return 0; /* It is the Q disk */ 2812 switch (algorithm) { 2813 case ALGORITHM_LEFT_ASYMMETRIC: 2814 case ALGORITHM_RIGHT_ASYMMETRIC: 2815 case ALGORITHM_ROTATING_ZERO_RESTART: 2816 case ALGORITHM_ROTATING_N_RESTART: 2817 if (sh->pd_idx == raid_disks-1) 2818 i--; /* Q D D D P */ 2819 else if (i > sh->pd_idx) 2820 i -= 2; /* D D P Q D */ 2821 break; 2822 case ALGORITHM_LEFT_SYMMETRIC: 2823 case ALGORITHM_RIGHT_SYMMETRIC: 2824 if (sh->pd_idx == raid_disks-1) 2825 i--; /* Q D D D P */ 2826 else { 2827 /* D D P Q D */ 2828 if (i < sh->pd_idx) 2829 i += raid_disks; 2830 i -= (sh->pd_idx + 2); 2831 } 2832 break; 2833 case ALGORITHM_PARITY_0: 2834 i -= 2; 2835 break; 2836 case ALGORITHM_PARITY_N: 2837 break; 2838 case ALGORITHM_ROTATING_N_CONTINUE: 2839 /* Like left_symmetric, but P is before Q */ 2840 if (sh->pd_idx == 0) 2841 i--; /* P D D D Q */ 2842 else { 2843 /* D D Q P D */ 2844 if (i < sh->pd_idx) 2845 i += raid_disks; 2846 i -= (sh->pd_idx + 1); 2847 } 2848 break; 2849 case ALGORITHM_LEFT_ASYMMETRIC_6: 2850 case ALGORITHM_RIGHT_ASYMMETRIC_6: 2851 if (i > sh->pd_idx) 2852 i--; 2853 break; 2854 case ALGORITHM_LEFT_SYMMETRIC_6: 2855 case ALGORITHM_RIGHT_SYMMETRIC_6: 2856 if (i < sh->pd_idx) 2857 i += data_disks + 1; 2858 i -= (sh->pd_idx + 1); 2859 break; 2860 case ALGORITHM_PARITY_0_6: 2861 i -= 1; 2862 break; 2863 default: 2864 BUG(); 2865 } 2866 break; 2867 } 2868 2869 chunk_number = stripe * data_disks + i; 2870 r_sector = chunk_number * sectors_per_chunk + chunk_offset; 2871 2872 check = raid5_compute_sector(conf, r_sector, 2873 previous, &dummy1, &sh2); 2874 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 2875 || sh2.qd_idx != sh->qd_idx) { 2876 pr_warn("md/raid:%s: compute_blocknr: map not correct\n", 2877 mdname(conf->mddev)); 2878 return 0; 2879 } 2880 return r_sector; 2881 } 2882 2883 static void 2884 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 2885 int rcw, int expand) 2886 { 2887 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks; 2888 struct r5conf *conf = sh->raid_conf; 2889 int level = conf->level; 2890 2891 if (rcw) { 2892 /* 2893 * In some cases, handle_stripe_dirtying initially decided to 2894 * run rmw and allocates extra page for prexor. However, rcw is 2895 * cheaper later on. We need to free the extra page now, 2896 * because we won't be able to do that in ops_complete_prexor(). 
2897 */ 2898 r5c_release_extra_page(sh); 2899 2900 for (i = disks; i--; ) { 2901 struct r5dev *dev = &sh->dev[i]; 2902 2903 if (dev->towrite) { 2904 set_bit(R5_LOCKED, &dev->flags); 2905 set_bit(R5_Wantdrain, &dev->flags); 2906 if (!expand) 2907 clear_bit(R5_UPTODATE, &dev->flags); 2908 s->locked++; 2909 } else if (test_bit(R5_InJournal, &dev->flags)) { 2910 set_bit(R5_LOCKED, &dev->flags); 2911 s->locked++; 2912 } 2913 } 2914 /* if we are not expanding this is a proper write request, and 2915 * there will be bios with new data to be drained into the 2916 * stripe cache 2917 */ 2918 if (!expand) { 2919 if (!s->locked) 2920 /* False alarm, nothing to do */ 2921 return; 2922 sh->reconstruct_state = reconstruct_state_drain_run; 2923 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2924 } else 2925 sh->reconstruct_state = reconstruct_state_run; 2926 2927 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2928 2929 if (s->locked + conf->max_degraded == disks) 2930 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2931 atomic_inc(&conf->pending_full_writes); 2932 } else { 2933 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2934 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2935 BUG_ON(level == 6 && 2936 (!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) || 2937 test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags)))); 2938 2939 for (i = disks; i--; ) { 2940 struct r5dev *dev = &sh->dev[i]; 2941 if (i == pd_idx || i == qd_idx) 2942 continue; 2943 2944 if (dev->towrite && 2945 (test_bit(R5_UPTODATE, &dev->flags) || 2946 test_bit(R5_Wantcompute, &dev->flags))) { 2947 set_bit(R5_Wantdrain, &dev->flags); 2948 set_bit(R5_LOCKED, &dev->flags); 2949 clear_bit(R5_UPTODATE, &dev->flags); 2950 s->locked++; 2951 } else if (test_bit(R5_InJournal, &dev->flags)) { 2952 set_bit(R5_LOCKED, &dev->flags); 2953 s->locked++; 2954 } 2955 } 2956 if (!s->locked) 2957 /* False alarm - nothing to do */ 2958 return; 2959 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2960 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2961 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2962 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2963 } 2964 2965 /* keep the parity disk(s) locked while asynchronous operations 2966 * are in flight 2967 */ 2968 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2969 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2970 s->locked++; 2971 2972 if (level == 6) { 2973 int qd_idx = sh->qd_idx; 2974 struct r5dev *dev = &sh->dev[qd_idx]; 2975 2976 set_bit(R5_LOCKED, &dev->flags); 2977 clear_bit(R5_UPTODATE, &dev->flags); 2978 s->locked++; 2979 } 2980 2981 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2982 __func__, (unsigned long long)sh->sector, 2983 s->locked, s->ops_request); 2984 } 2985 2986 /* 2987 * Each stripe/dev can have one or more bion attached. 2988 * toread/towrite point to the first in a chain. 2989 * The bi_next chain must be in order. 2990 */ 2991 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, 2992 int forwrite, int previous) 2993 { 2994 struct bio **bip; 2995 struct r5conf *conf = sh->raid_conf; 2996 int firstwrite=0; 2997 2998 pr_debug("adding bi b#%llu to stripe s#%llu\n", 2999 (unsigned long long)bi->bi_iter.bi_sector, 3000 (unsigned long long)sh->sector); 3001 3002 /* 3003 * If several bio share a stripe. The bio bi_phys_segments acts as a 3004 * reference count to avoid race. 
The reference count should already be 3005 * increased before this function is called (for example, in 3006 * raid5_make_request()), so other bio sharing this stripe will not free the 3007 * stripe. If a stripe is owned by one stripe, the stripe lock will 3008 * protect it. 3009 */ 3010 spin_lock_irq(&sh->stripe_lock); 3011 /* Don't allow new IO added to stripes in batch list */ 3012 if (sh->batch_head) 3013 goto overlap; 3014 if (forwrite) { 3015 bip = &sh->dev[dd_idx].towrite; 3016 if (*bip == NULL) 3017 firstwrite = 1; 3018 } else 3019 bip = &sh->dev[dd_idx].toread; 3020 while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { 3021 if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) 3022 goto overlap; 3023 bip = & (*bip)->bi_next; 3024 } 3025 if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) 3026 goto overlap; 3027 3028 if (!forwrite || previous) 3029 clear_bit(STRIPE_BATCH_READY, &sh->state); 3030 3031 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 3032 if (*bip) 3033 bi->bi_next = *bip; 3034 *bip = bi; 3035 raid5_inc_bi_active_stripes(bi); 3036 3037 if (forwrite) { 3038 /* check if page is covered */ 3039 sector_t sector = sh->dev[dd_idx].sector; 3040 for (bi=sh->dev[dd_idx].towrite; 3041 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 3042 bi && bi->bi_iter.bi_sector <= sector; 3043 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 3044 if (bio_end_sector(bi) >= sector) 3045 sector = bio_end_sector(bi); 3046 } 3047 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 3048 if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) 3049 sh->overwrite_disks++; 3050 } 3051 3052 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 3053 (unsigned long long)(*bip)->bi_iter.bi_sector, 3054 (unsigned long long)sh->sector, dd_idx); 3055 3056 if (conf->mddev->bitmap && firstwrite) { 3057 /* Cannot hold spinlock over bitmap_startwrite, 3058 * but must ensure this isn't added to a batch until 3059 * we have added to the bitmap and set bm_seq. 3060 * So set STRIPE_BITMAP_PENDING to prevent 3061 * batching. 3062 * If multiple add_stripe_bio() calls race here they 3063 * must all set STRIPE_BITMAP_PENDING. So only the first one 3064 * to complete "bitmap_startwrite" gets to set 3065 * STRIPE_BIT_DELAY. This is important as once a stripe 3066 * is added to a batch, STRIPE_BIT_DELAY cannot be changed 3067 * any more. 3068 */ 3069 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3070 spin_unlock_irq(&sh->stripe_lock); 3071 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3072 STRIPE_SECTORS, 0); 3073 spin_lock_irq(&sh->stripe_lock); 3074 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3075 if (!sh->batch_head) { 3076 sh->bm_seq = conf->seq_flush+1; 3077 set_bit(STRIPE_BIT_DELAY, &sh->state); 3078 } 3079 } 3080 spin_unlock_irq(&sh->stripe_lock); 3081 3082 if (stripe_can_batch(sh)) 3083 stripe_add_to_batch_list(conf, sh); 3084 return 1; 3085 3086 overlap: 3087 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 3088 spin_unlock_irq(&sh->stripe_lock); 3089 return 0; 3090 } 3091 3092 static void end_reshape(struct r5conf *conf); 3093 3094 static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, 3095 struct stripe_head *sh) 3096 { 3097 int sectors_per_chunk = 3098 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 3099 int dd_idx; 3100 int chunk_offset = sector_div(stripe, sectors_per_chunk); 3101 int disks = previous ?
conf->previous_raid_disks : conf->raid_disks; 3102 3103 raid5_compute_sector(conf, 3104 stripe * (disks - conf->max_degraded) 3105 *sectors_per_chunk + chunk_offset, 3106 previous, 3107 &dd_idx, sh); 3108 } 3109 3110 static void 3111 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, 3112 struct stripe_head_state *s, int disks, 3113 struct bio_list *return_bi) 3114 { 3115 int i; 3116 BUG_ON(sh->batch_head); 3117 for (i = disks; i--; ) { 3118 struct bio *bi; 3119 int bitmap_end = 0; 3120 3121 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 3122 struct md_rdev *rdev; 3123 rcu_read_lock(); 3124 rdev = rcu_dereference(conf->disks[i].rdev); 3125 if (rdev && test_bit(In_sync, &rdev->flags) && 3126 !test_bit(Faulty, &rdev->flags)) 3127 atomic_inc(&rdev->nr_pending); 3128 else 3129 rdev = NULL; 3130 rcu_read_unlock(); 3131 if (rdev) { 3132 if (!rdev_set_badblocks( 3133 rdev, 3134 sh->sector, 3135 STRIPE_SECTORS, 0)) 3136 md_error(conf->mddev, rdev); 3137 rdev_dec_pending(rdev, conf->mddev); 3138 } 3139 } 3140 spin_lock_irq(&sh->stripe_lock); 3141 /* fail all writes first */ 3142 bi = sh->dev[i].towrite; 3143 sh->dev[i].towrite = NULL; 3144 sh->overwrite_disks = 0; 3145 spin_unlock_irq(&sh->stripe_lock); 3146 if (bi) 3147 bitmap_end = 1; 3148 3149 r5l_stripe_write_finished(sh); 3150 3151 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3152 wake_up(&conf->wait_for_overlap); 3153 3154 while (bi && bi->bi_iter.bi_sector < 3155 sh->dev[i].sector + STRIPE_SECTORS) { 3156 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 3157 3158 bi->bi_error = -EIO; 3159 if (!raid5_dec_bi_active_stripes(bi)) { 3160 md_write_end(conf->mddev); 3161 bio_list_add(return_bi, bi); 3162 } 3163 bi = nextbi; 3164 } 3165 if (bitmap_end) 3166 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3167 STRIPE_SECTORS, 0, 0); 3168 bitmap_end = 0; 3169 /* and fail all 'written' */ 3170 bi = sh->dev[i].written; 3171 sh->dev[i].written = NULL; 3172 if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 3173 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 3174 sh->dev[i].page = sh->dev[i].orig_page; 3175 } 3176 3177 if (bi) bitmap_end = 1; 3178 while (bi && bi->bi_iter.bi_sector < 3179 sh->dev[i].sector + STRIPE_SECTORS) { 3180 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 3181 3182 bi->bi_error = -EIO; 3183 if (!raid5_dec_bi_active_stripes(bi)) { 3184 md_write_end(conf->mddev); 3185 bio_list_add(return_bi, bi); 3186 } 3187 bi = bi2; 3188 } 3189 3190 /* fail any reads if this device is non-operational and 3191 * the data has not reached the cache yet. 
3192 */ 3193 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 3194 s->failed > conf->max_degraded && 3195 (!test_bit(R5_Insync, &sh->dev[i].flags) || 3196 test_bit(R5_ReadError, &sh->dev[i].flags))) { 3197 spin_lock_irq(&sh->stripe_lock); 3198 bi = sh->dev[i].toread; 3199 sh->dev[i].toread = NULL; 3200 spin_unlock_irq(&sh->stripe_lock); 3201 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3202 wake_up(&conf->wait_for_overlap); 3203 if (bi) 3204 s->to_read--; 3205 while (bi && bi->bi_iter.bi_sector < 3206 sh->dev[i].sector + STRIPE_SECTORS) { 3207 struct bio *nextbi = 3208 r5_next_bio(bi, sh->dev[i].sector); 3209 3210 bi->bi_error = -EIO; 3211 if (!raid5_dec_bi_active_stripes(bi)) 3212 bio_list_add(return_bi, bi); 3213 bi = nextbi; 3214 } 3215 } 3216 if (bitmap_end) 3217 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3218 STRIPE_SECTORS, 0, 0); 3219 /* If we were in the middle of a write the parity block might 3220 * still be locked - so just clear all R5_LOCKED flags 3221 */ 3222 clear_bit(R5_LOCKED, &sh->dev[i].flags); 3223 } 3224 s->to_write = 0; 3225 s->written = 0; 3226 3227 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3228 if (atomic_dec_and_test(&conf->pending_full_writes)) 3229 md_wakeup_thread(conf->mddev->thread); 3230 } 3231 3232 static void 3233 handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, 3234 struct stripe_head_state *s) 3235 { 3236 int abort = 0; 3237 int i; 3238 3239 BUG_ON(sh->batch_head); 3240 clear_bit(STRIPE_SYNCING, &sh->state); 3241 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3242 wake_up(&conf->wait_for_overlap); 3243 s->syncing = 0; 3244 s->replacing = 0; 3245 /* There is nothing more to do for sync/check/repair. 3246 * Don't even need to abort as that is handled elsewhere 3247 * if needed, and not always wanted e.g. if there is a known 3248 * bad block here. 3249 * For recover/replace we need to record a bad block on all 3250 * non-sync devices, or abort the recovery 3251 */ 3252 if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { 3253 /* During recovery devices cannot be removed, so 3254 * locking and refcounting of rdevs is not needed 3255 */ 3256 rcu_read_lock(); 3257 for (i = 0; i < conf->raid_disks; i++) { 3258 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 3259 if (rdev 3260 && !test_bit(Faulty, &rdev->flags) 3261 && !test_bit(In_sync, &rdev->flags) 3262 && !rdev_set_badblocks(rdev, sh->sector, 3263 STRIPE_SECTORS, 0)) 3264 abort = 1; 3265 rdev = rcu_dereference(conf->disks[i].replacement); 3266 if (rdev 3267 && !test_bit(Faulty, &rdev->flags) 3268 && !test_bit(In_sync, &rdev->flags) 3269 && !rdev_set_badblocks(rdev, sh->sector, 3270 STRIPE_SECTORS, 0)) 3271 abort = 1; 3272 } 3273 rcu_read_unlock(); 3274 if (abort) 3275 conf->recovery_disabled = 3276 conf->mddev->recovery_disabled; 3277 } 3278 md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); 3279 } 3280 3281 static int want_replace(struct stripe_head *sh, int disk_idx) 3282 { 3283 struct md_rdev *rdev; 3284 int rv = 0; 3285 3286 rcu_read_lock(); 3287 rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); 3288 if (rdev 3289 && !test_bit(Faulty, &rdev->flags) 3290 && !test_bit(In_sync, &rdev->flags) 3291 && (rdev->recovery_offset <= sh->sector 3292 || rdev->mddev->recovery_cp <= sh->sector)) 3293 rv = 1; 3294 rcu_read_unlock(); 3295 return rv; 3296 } 3297 3298 /* fetch_block - checks the given member device to see if its data needs 3299 * to be read or computed to satisfy a request. 
3300 * 3301 * Returns 1 when no more member devices need to be checked, otherwise returns 3302 * 0 to tell the loop in handle_stripe_fill to continue 3303 */ 3304 3305 static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, 3306 int disk_idx, int disks) 3307 { 3308 struct r5dev *dev = &sh->dev[disk_idx]; 3309 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], 3310 &sh->dev[s->failed_num[1]] }; 3311 int i; 3312 3313 3314 if (test_bit(R5_LOCKED, &dev->flags) || 3315 test_bit(R5_UPTODATE, &dev->flags)) 3316 /* No point reading this as we already have it or have 3317 * decided to get it. 3318 */ 3319 return 0; 3320 3321 if (dev->toread || 3322 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) 3323 /* We need this block to directly satisfy a request */ 3324 return 1; 3325 3326 if (s->syncing || s->expanding || 3327 (s->replacing && want_replace(sh, disk_idx))) 3328 /* When syncing or expanding we read everything. 3329 * When replacing, we need the replaced block. 3330 */ 3331 return 1; 3332 3333 if ((s->failed >= 1 && fdev[0]->toread) || 3334 (s->failed >= 2 && fdev[1]->toread)) 3335 /* If we want to read from a failed device, then 3336 * we need to actually read every other device. 3337 */ 3338 return 1; 3339 3340 /* Sometimes neither read-modify-write nor reconstruct-write 3341 * cycles can work. In those cases we read every block we 3342 * can. Then the parity-update is certain to have enough to 3343 * work with. 3344 * This can only be a problem when we need to write something, 3345 * and some device has failed. If either of those tests 3346 * fails, we need look no further. 3347 */ 3348 if (!s->failed || !s->to_write) 3349 return 0; 3350 3351 if (test_bit(R5_Insync, &dev->flags) && 3352 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3353 /* Pre-reads are not permitted until after a short delay 3354 * to gather multiple requests. However if this 3355 * device is not Insync, the block can only be computed 3356 * and there is no need to delay that. 3357 */ 3358 return 0; 3359 3360 for (i = 0; i < s->failed && i < 2; i++) { 3361 if (fdev[i]->towrite && 3362 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3363 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3364 /* If we have a partial write to a failed 3365 * device, then we will need to reconstruct 3366 * the content of that device, so all other 3367 * devices must be read. 3368 */ 3369 return 1; 3370 } 3371 3372 /* If we are forced to do a reconstruct-write, either because 3373 * the current RAID6 implementation only supports that, or 3374 * because parity cannot be trusted and we are currently 3375 * recovering it, there is extra need to be careful. 3376 * If one of the devices that we would need to read, because 3377 * it is not being overwritten (and maybe not written at all) 3378 * is missing/faulty, then we need to read everything we can. 3379 */ 3380 if (sh->raid_conf->level != 6 && 3381 sh->sector < sh->raid_conf->mddev->recovery_cp) 3382 /* reconstruct-write isn't being forced */ 3383 return 0; 3384 for (i = 0; i < s->failed && i < 2; i++) { 3385 if (s->failed_num[i] != sh->pd_idx && 3386 s->failed_num[i] != sh->qd_idx && 3387 !test_bit(R5_UPTODATE, &fdev[i]->flags) && 3388 !test_bit(R5_OVERWRITE, &fdev[i]->flags)) 3389 return 1; 3390 } 3391 3392 return 0; 3393 } 3394 3395 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, 3396 int disk_idx, int disks) 3397 { 3398 struct r5dev *dev = &sh->dev[disk_idx]; 3399 3400 /* is the data in this block needed, and can we get it?
*/ 3401 if (need_this_block(sh, s, disk_idx, disks)) { 3402 /* we would like to get this block, possibly by computing it, 3403 * otherwise read it if the backing disk is insync 3404 */ 3405 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 3406 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 3407 BUG_ON(sh->batch_head); 3408 if ((s->uptodate == disks - 1) && 3409 (s->failed && (disk_idx == s->failed_num[0] || 3410 disk_idx == s->failed_num[1]))) { 3411 /* have disk failed, and we're requested to fetch it; 3412 * do compute it 3413 */ 3414 pr_debug("Computing stripe %llu block %d\n", 3415 (unsigned long long)sh->sector, disk_idx); 3416 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3417 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3418 set_bit(R5_Wantcompute, &dev->flags); 3419 sh->ops.target = disk_idx; 3420 sh->ops.target2 = -1; /* no 2nd target */ 3421 s->req_compute = 1; 3422 /* Careful: from this point on 'uptodate' is in the eye 3423 * of raid_run_ops which services 'compute' operations 3424 * before writes. R5_Wantcompute flags a block that will 3425 * be R5_UPTODATE by the time it is needed for a 3426 * subsequent operation. 3427 */ 3428 s->uptodate++; 3429 return 1; 3430 } else if (s->uptodate == disks-2 && s->failed >= 2) { 3431 /* Computing 2-failure is *very* expensive; only 3432 * do it if failed >= 2 3433 */ 3434 int other; 3435 for (other = disks; other--; ) { 3436 if (other == disk_idx) 3437 continue; 3438 if (!test_bit(R5_UPTODATE, 3439 &sh->dev[other].flags)) 3440 break; 3441 } 3442 BUG_ON(other < 0); 3443 pr_debug("Computing stripe %llu blocks %d,%d\n", 3444 (unsigned long long)sh->sector, 3445 disk_idx, other); 3446 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3447 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3448 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 3449 set_bit(R5_Wantcompute, &sh->dev[other].flags); 3450 sh->ops.target = disk_idx; 3451 sh->ops.target2 = other; 3452 s->uptodate += 2; 3453 s->req_compute = 1; 3454 return 1; 3455 } else if (test_bit(R5_Insync, &dev->flags)) { 3456 set_bit(R5_LOCKED, &dev->flags); 3457 set_bit(R5_Wantread, &dev->flags); 3458 s->locked++; 3459 pr_debug("Reading block %d (sync=%d)\n", 3460 disk_idx, s->syncing); 3461 } 3462 } 3463 3464 return 0; 3465 } 3466 3467 /** 3468 * handle_stripe_fill - read or compute data to satisfy pending requests. 3469 */ 3470 static void handle_stripe_fill(struct stripe_head *sh, 3471 struct stripe_head_state *s, 3472 int disks) 3473 { 3474 int i; 3475 3476 /* look for blocks to read/compute, skip this if a compute 3477 * is already in flight, or if the stripe contents are in the 3478 * midst of changing due to a write 3479 */ 3480 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 3481 !sh->reconstruct_state) 3482 for (i = disks; i--; ) 3483 if (fetch_block(sh, s, i, disks)) 3484 break; 3485 set_bit(STRIPE_HANDLE, &sh->state); 3486 } 3487 3488 static void break_stripe_batch_list(struct stripe_head *head_sh, 3489 unsigned long handle_flags); 3490 /* handle_stripe_clean_event 3491 * any written block on an uptodate or failed drive can be returned. 3492 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 3493 * never LOCKED, so we don't need to test 'failed' directly. 
3494 */ 3495 static void handle_stripe_clean_event(struct r5conf *conf, 3496 struct stripe_head *sh, int disks, struct bio_list *return_bi) 3497 { 3498 int i; 3499 struct r5dev *dev; 3500 int discard_pending = 0; 3501 struct stripe_head *head_sh = sh; 3502 bool do_endio = false; 3503 3504 for (i = disks; i--; ) 3505 if (sh->dev[i].written) { 3506 dev = &sh->dev[i]; 3507 if (!test_bit(R5_LOCKED, &dev->flags) && 3508 (test_bit(R5_UPTODATE, &dev->flags) || 3509 test_bit(R5_Discard, &dev->flags) || 3510 test_bit(R5_SkipCopy, &dev->flags))) { 3511 /* We can return any write requests */ 3512 struct bio *wbi, *wbi2; 3513 pr_debug("Return write for disc %d\n", i); 3514 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3515 clear_bit(R5_UPTODATE, &dev->flags); 3516 if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3517 WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3518 } 3519 do_endio = true; 3520 3521 returnbi: 3522 dev->page = dev->orig_page; 3523 wbi = dev->written; 3524 dev->written = NULL; 3525 while (wbi && wbi->bi_iter.bi_sector < 3526 dev->sector + STRIPE_SECTORS) { 3527 wbi2 = r5_next_bio(wbi, dev->sector); 3528 if (!raid5_dec_bi_active_stripes(wbi)) { 3529 md_write_end(conf->mddev); 3530 bio_list_add(return_bi, wbi); 3531 } 3532 wbi = wbi2; 3533 } 3534 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3535 STRIPE_SECTORS, 3536 !test_bit(STRIPE_DEGRADED, &sh->state), 3537 0); 3538 if (head_sh->batch_head) { 3539 sh = list_first_entry(&sh->batch_list, 3540 struct stripe_head, 3541 batch_list); 3542 if (sh != head_sh) { 3543 dev = &sh->dev[i]; 3544 goto returnbi; 3545 } 3546 } 3547 sh = head_sh; 3548 dev = &sh->dev[i]; 3549 } else if (test_bit(R5_Discard, &dev->flags)) 3550 discard_pending = 1; 3551 } 3552 3553 r5l_stripe_write_finished(sh); 3554 3555 if (!discard_pending && 3556 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { 3557 int hash; 3558 clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); 3559 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3560 if (sh->qd_idx >= 0) { 3561 clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); 3562 clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); 3563 } 3564 /* now that discard is done we can proceed with any sync */ 3565 clear_bit(STRIPE_DISCARD, &sh->state); 3566 /* 3567 * SCSI discard will change some bio fields and the stripe has 3568 * no updated data, so remove it from hash list and the stripe 3569 * will be reinitialized 3570 */ 3571 unhash: 3572 hash = sh->hash_lock_index; 3573 spin_lock_irq(conf->hash_locks + hash); 3574 remove_hash(sh); 3575 spin_unlock_irq(conf->hash_locks + hash); 3576 if (head_sh->batch_head) { 3577 sh = list_first_entry(&sh->batch_list, 3578 struct stripe_head, batch_list); 3579 if (sh != head_sh) 3580 goto unhash; 3581 } 3582 sh = head_sh; 3583 3584 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 3585 set_bit(STRIPE_HANDLE, &sh->state); 3586 3587 } 3588 3589 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 3590 if (atomic_dec_and_test(&conf->pending_full_writes)) 3591 md_wakeup_thread(conf->mddev->thread); 3592 3593 if (head_sh->batch_head && do_endio) 3594 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3595 } 3596 3597 static int handle_stripe_dirtying(struct r5conf *conf, 3598 struct stripe_head *sh, 3599 struct stripe_head_state *s, 3600 int disks) 3601 { 3602 int rmw = 0, rcw = 0, i; 3603 sector_t recovery_cp = conf->mddev->recovery_cp; 3604 3605 /* Check whether resync is now happening or should start. 
3606 * If yes, then the array is dirty (after unclean shutdown or 3607 * initial creation), so parity in some stripes might be inconsistent. 3608 * In this case, we need to always do reconstruct-write, to ensure 3609 * that in case of drive failure or read-error correction, we 3610 * generate correct data from the parity. 3611 */ 3612 if (conf->rmw_level == PARITY_DISABLE_RMW || 3613 (recovery_cp < MaxSector && sh->sector >= recovery_cp && 3614 s->failed == 0)) { 3615 /* Calculate the real rcw later - for now make it 3616 * look like rcw is cheaper 3617 */ 3618 rcw = 1; rmw = 2; 3619 pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", 3620 conf->rmw_level, (unsigned long long)recovery_cp, 3621 (unsigned long long)sh->sector); 3622 } else for (i = disks; i--; ) { 3623 /* would I have to read this buffer for read_modify_write */ 3624 struct r5dev *dev = &sh->dev[i]; 3625 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || 3626 test_bit(R5_InJournal, &dev->flags)) && 3627 !test_bit(R5_LOCKED, &dev->flags) && 3628 !((test_bit(R5_UPTODATE, &dev->flags) && 3629 (!test_bit(R5_InJournal, &dev->flags) || 3630 dev->page != dev->orig_page)) || 3631 test_bit(R5_Wantcompute, &dev->flags))) { 3632 if (test_bit(R5_Insync, &dev->flags)) 3633 rmw++; 3634 else 3635 rmw += 2*disks; /* cannot read it */ 3636 } 3637 /* Would I have to read this buffer for reconstruct_write */ 3638 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3639 i != sh->pd_idx && i != sh->qd_idx && 3640 !test_bit(R5_LOCKED, &dev->flags) && 3641 !(test_bit(R5_UPTODATE, &dev->flags) || 3642 test_bit(R5_InJournal, &dev->flags) || 3643 test_bit(R5_Wantcompute, &dev->flags))) { 3644 if (test_bit(R5_Insync, &dev->flags)) 3645 rcw++; 3646 else 3647 rcw += 2*disks; 3648 } 3649 } 3650 3651 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 3652 (unsigned long long)sh->sector, rmw, rcw); 3653 set_bit(STRIPE_HANDLE, &sh->state); 3654 if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { 3655 /* prefer read-modify-write, but need to get some data */ 3656 if (conf->mddev->queue) 3657 blk_add_trace_msg(conf->mddev->queue, 3658 "raid5 rmw %llu %d", 3659 (unsigned long long)sh->sector, rmw); 3660 for (i = disks; i--; ) { 3661 struct r5dev *dev = &sh->dev[i]; 3662 if (test_bit(R5_InJournal, &dev->flags) && 3663 dev->page == dev->orig_page && 3664 !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { 3665 /* alloc page for prexor */ 3666 struct page *p = alloc_page(GFP_NOIO); 3667 3668 if (p) { 3669 dev->orig_page = p; 3670 continue; 3671 } 3672 3673 /* 3674 * alloc_page() failed, try use 3675 * disk_info->extra_page 3676 */ 3677 if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, 3678 &conf->cache_state)) { 3679 r5c_use_extra_page(sh); 3680 break; 3681 } 3682 3683 /* extra_page in use, add to delayed_list */ 3684 set_bit(STRIPE_DELAYED, &sh->state); 3685 s->waiting_extra_page = 1; 3686 return -EAGAIN; 3687 } 3688 } 3689 3690 for (i = disks; i--; ) { 3691 struct r5dev *dev = &sh->dev[i]; 3692 if ((dev->towrite || 3693 i == sh->pd_idx || i == sh->qd_idx || 3694 test_bit(R5_InJournal, &dev->flags)) && 3695 !test_bit(R5_LOCKED, &dev->flags) && 3696 !((test_bit(R5_UPTODATE, &dev->flags) && 3697 (!test_bit(R5_InJournal, &dev->flags) || 3698 dev->page != dev->orig_page)) || 3699 test_bit(R5_Wantcompute, &dev->flags)) && 3700 test_bit(R5_Insync, &dev->flags)) { 3701 if (test_bit(STRIPE_PREREAD_ACTIVE, 3702 &sh->state)) { 3703 pr_debug("Read_old block %d for r-m-w\n", 3704 i); 3705 set_bit(R5_LOCKED, &dev->flags); 3706 
set_bit(R5_Wantread, &dev->flags); 3707 s->locked++; 3708 } else { 3709 set_bit(STRIPE_DELAYED, &sh->state); 3710 set_bit(STRIPE_HANDLE, &sh->state); 3711 } 3712 } 3713 } 3714 } 3715 if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) { 3716 /* want reconstruct write, but need to get some data */ 3717 int qread =0; 3718 rcw = 0; 3719 for (i = disks; i--; ) { 3720 struct r5dev *dev = &sh->dev[i]; 3721 if (!test_bit(R5_OVERWRITE, &dev->flags) && 3722 i != sh->pd_idx && i != sh->qd_idx && 3723 !test_bit(R5_LOCKED, &dev->flags) && 3724 !(test_bit(R5_UPTODATE, &dev->flags) || 3725 test_bit(R5_InJournal, &dev->flags) || 3726 test_bit(R5_Wantcompute, &dev->flags))) { 3727 rcw++; 3728 if (test_bit(R5_Insync, &dev->flags) && 3729 test_bit(STRIPE_PREREAD_ACTIVE, 3730 &sh->state)) { 3731 pr_debug("Read_old block " 3732 "%d for Reconstruct\n", i); 3733 set_bit(R5_LOCKED, &dev->flags); 3734 set_bit(R5_Wantread, &dev->flags); 3735 s->locked++; 3736 qread++; 3737 } else { 3738 set_bit(STRIPE_DELAYED, &sh->state); 3739 set_bit(STRIPE_HANDLE, &sh->state); 3740 } 3741 } 3742 } 3743 if (rcw && conf->mddev->queue) 3744 blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", 3745 (unsigned long long)sh->sector, 3746 rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); 3747 } 3748 3749 if (rcw > disks && rmw > disks && 3750 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3751 set_bit(STRIPE_DELAYED, &sh->state); 3752 3753 /* now if nothing is locked, and if we have enough data, 3754 * we can start a write request 3755 */ 3756 /* since handle_stripe can be called at any time we need to handle the 3757 * case where a compute block operation has been submitted and then a 3758 * subsequent call wants to start a write request. raid_run_ops only 3759 * handles the case where compute block and reconstruct are requested 3760 * simultaneously. If this is not the case then new writes need to be 3761 * held off until the compute completes. 
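	 * (Editor's summary of the test below: reconstruction is scheduled only
	 * when either we queued the compute ourselves in this pass or no
	 * compute is running, nothing in the stripe is still locked, one of
	 * the rmw/rcw counts has reached zero - no further reads required -
	 * and the stripe is not waiting on a bitmap batch via STRIPE_BIT_DELAY.)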
3762 */ 3763 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 3764 (s->locked == 0 && (rcw == 0 || rmw == 0) && 3765 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 3766 schedule_reconstruction(sh, s, rcw == 0, 0); 3767 return 0; 3768 } 3769 3770 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, 3771 struct stripe_head_state *s, int disks) 3772 { 3773 struct r5dev *dev = NULL; 3774 3775 BUG_ON(sh->batch_head); 3776 set_bit(STRIPE_HANDLE, &sh->state); 3777 3778 switch (sh->check_state) { 3779 case check_state_idle: 3780 /* start a new check operation if there are no failures */ 3781 if (s->failed == 0) { 3782 BUG_ON(s->uptodate != disks); 3783 sh->check_state = check_state_run; 3784 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3785 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 3786 s->uptodate--; 3787 break; 3788 } 3789 dev = &sh->dev[s->failed_num[0]]; 3790 /* fall through */ 3791 case check_state_compute_result: 3792 sh->check_state = check_state_idle; 3793 if (!dev) 3794 dev = &sh->dev[sh->pd_idx]; 3795 3796 /* check that a write has not made the stripe insync */ 3797 if (test_bit(STRIPE_INSYNC, &sh->state)) 3798 break; 3799 3800 /* either failed parity check, or recovery is happening */ 3801 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3802 BUG_ON(s->uptodate != disks); 3803 3804 set_bit(R5_LOCKED, &dev->flags); 3805 s->locked++; 3806 set_bit(R5_Wantwrite, &dev->flags); 3807 3808 clear_bit(STRIPE_DEGRADED, &sh->state); 3809 set_bit(STRIPE_INSYNC, &sh->state); 3810 break; 3811 case check_state_run: 3812 break; /* we will be called again upon completion */ 3813 case check_state_check_result: 3814 sh->check_state = check_state_idle; 3815 3816 /* if a failure occurred during the check operation, leave 3817 * STRIPE_INSYNC not set and let the stripe be handled again 3818 */ 3819 if (s->failed) 3820 break; 3821 3822 /* handle a successful check operation, if parity is correct 3823 * we are done. Otherwise update the mismatch count and repair 3824 * parity if !MD_RECOVERY_CHECK 3825 */ 3826 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 3827 /* parity is correct (on disc, 3828 * not in buffer any more) 3829 */ 3830 set_bit(STRIPE_INSYNC, &sh->state); 3831 else { 3832 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3833 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3834 /* don't try to repair!! */ 3835 set_bit(STRIPE_INSYNC, &sh->state); 3836 else { 3837 sh->check_state = check_state_compute_run; 3838 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3839 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3840 set_bit(R5_Wantcompute, 3841 &sh->dev[sh->pd_idx].flags); 3842 sh->ops.target = sh->pd_idx; 3843 sh->ops.target2 = -1; 3844 s->uptodate++; 3845 } 3846 } 3847 break; 3848 case check_state_compute_run: 3849 break; 3850 default: 3851 pr_err("%s: unknown check_state: %d sector: %llu\n", 3852 __func__, sh->check_state, 3853 (unsigned long long) sh->sector); 3854 BUG(); 3855 } 3856 } 3857 3858 static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, 3859 struct stripe_head_state *s, 3860 int disks) 3861 { 3862 int pd_idx = sh->pd_idx; 3863 int qd_idx = sh->qd_idx; 3864 struct r5dev *dev; 3865 3866 BUG_ON(sh->batch_head); 3867 set_bit(STRIPE_HANDLE, &sh->state); 3868 3869 BUG_ON(s->failed > 2); 3870 3871 /* Want to check and possibly repair P and Q. 
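 * When both are usable they are verified together in a single syndrome check.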
3872 * However there could be one 'failed' device, in which 3873 * case we can only check one of them, possibly using the 3874 * other to generate missing data 3875 */ 3876 3877 switch (sh->check_state) { 3878 case check_state_idle: 3879 /* start a new check operation if there are < 2 failures */ 3880 if (s->failed == s->q_failed) { 3881 /* The only possible failed device holds Q, so it 3882 * makes sense to check P (If anything else were failed, 3883 * we would have used P to recreate it). 3884 */ 3885 sh->check_state = check_state_run; 3886 } 3887 if (!s->q_failed && s->failed < 2) { 3888 /* Q is not failed, and we didn't use it to generate 3889 * anything, so it makes sense to check it 3890 */ 3891 if (sh->check_state == check_state_run) 3892 sh->check_state = check_state_run_pq; 3893 else 3894 sh->check_state = check_state_run_q; 3895 } 3896 3897 /* discard potentially stale zero_sum_result */ 3898 sh->ops.zero_sum_result = 0; 3899 3900 if (sh->check_state == check_state_run) { 3901 /* async_xor_zero_sum destroys the contents of P */ 3902 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 3903 s->uptodate--; 3904 } 3905 if (sh->check_state >= check_state_run && 3906 sh->check_state <= check_state_run_pq) { 3907 /* async_syndrome_zero_sum preserves P and Q, so 3908 * no need to mark them !uptodate here 3909 */ 3910 set_bit(STRIPE_OP_CHECK, &s->ops_request); 3911 break; 3912 } 3913 3914 /* we have 2-disk failure */ 3915 BUG_ON(s->failed != 2); 3916 /* fall through */ 3917 case check_state_compute_result: 3918 sh->check_state = check_state_idle; 3919 3920 /* check that a write has not made the stripe insync */ 3921 if (test_bit(STRIPE_INSYNC, &sh->state)) 3922 break; 3923 3924 /* now write out any block on a failed drive, 3925 * or P or Q if they were recomputed 3926 */ 3927 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 3928 if (s->failed == 2) { 3929 dev = &sh->dev[s->failed_num[1]]; 3930 s->locked++; 3931 set_bit(R5_LOCKED, &dev->flags); 3932 set_bit(R5_Wantwrite, &dev->flags); 3933 } 3934 if (s->failed >= 1) { 3935 dev = &sh->dev[s->failed_num[0]]; 3936 s->locked++; 3937 set_bit(R5_LOCKED, &dev->flags); 3938 set_bit(R5_Wantwrite, &dev->flags); 3939 } 3940 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3941 dev = &sh->dev[pd_idx]; 3942 s->locked++; 3943 set_bit(R5_LOCKED, &dev->flags); 3944 set_bit(R5_Wantwrite, &dev->flags); 3945 } 3946 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 3947 dev = &sh->dev[qd_idx]; 3948 s->locked++; 3949 set_bit(R5_LOCKED, &dev->flags); 3950 set_bit(R5_Wantwrite, &dev->flags); 3951 } 3952 clear_bit(STRIPE_DEGRADED, &sh->state); 3953 3954 set_bit(STRIPE_INSYNC, &sh->state); 3955 break; 3956 case check_state_run: 3957 case check_state_run_q: 3958 case check_state_run_pq: 3959 break; /* we will be called again upon completion */ 3960 case check_state_check_result: 3961 sh->check_state = check_state_idle; 3962 3963 /* handle a successful check operation, if parity is correct 3964 * we are done. 
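 * ("correct" here means both the P and the Q bits of zero_sum_result are clear.)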
Otherwise update the mismatch count and repair 3965 * parity if !MD_RECOVERY_CHECK 3966 */ 3967 if (sh->ops.zero_sum_result == 0) { 3968 /* both parities are correct */ 3969 if (!s->failed) 3970 set_bit(STRIPE_INSYNC, &sh->state); 3971 else { 3972 /* in contrast to the raid5 case we can validate 3973 * parity, but still have a failure to write 3974 * back 3975 */ 3976 sh->check_state = check_state_compute_result; 3977 /* Returning at this point means that we may go 3978 * off and bring p and/or q uptodate again so 3979 * we make sure to check zero_sum_result again 3980 * to verify if p or q need writeback 3981 */ 3982 } 3983 } else { 3984 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); 3985 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3986 /* don't try to repair!! */ 3987 set_bit(STRIPE_INSYNC, &sh->state); 3988 else { 3989 int *target = &sh->ops.target; 3990 3991 sh->ops.target = -1; 3992 sh->ops.target2 = -1; 3993 sh->check_state = check_state_compute_run; 3994 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 3995 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 3996 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 3997 set_bit(R5_Wantcompute, 3998 &sh->dev[pd_idx].flags); 3999 *target = pd_idx; 4000 target = &sh->ops.target2; 4001 s->uptodate++; 4002 } 4003 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 4004 set_bit(R5_Wantcompute, 4005 &sh->dev[qd_idx].flags); 4006 *target = qd_idx; 4007 s->uptodate++; 4008 } 4009 } 4010 } 4011 break; 4012 case check_state_compute_run: 4013 break; 4014 default: 4015 pr_warn("%s: unknown check_state: %d sector: %llu\n", 4016 __func__, sh->check_state, 4017 (unsigned long long) sh->sector); 4018 BUG(); 4019 } 4020 } 4021 4022 static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) 4023 { 4024 int i; 4025 4026 /* We have read all the blocks in this stripe and now we need to 4027 * copy some of them into a target stripe for expand. 4028 */ 4029 struct dma_async_tx_descriptor *tx = NULL; 4030 BUG_ON(sh->batch_head); 4031 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4032 for (i = 0; i < sh->disks; i++) 4033 if (i != sh->pd_idx && i != sh->qd_idx) { 4034 int dd_idx, j; 4035 struct stripe_head *sh2; 4036 struct async_submit_ctl submit; 4037 4038 sector_t bn = raid5_compute_blocknr(sh, i, 1); 4039 sector_t s = raid5_compute_sector(conf, bn, 0, 4040 &dd_idx, NULL); 4041 sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); 4042 if (sh2 == NULL) 4043 /* so far only the early blocks of this stripe 4044 * have been requested. 
When later blocks 4045 * get requested, we will try again 4046 */ 4047 continue; 4048 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 4049 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 4050 /* must have already done this block */ 4051 raid5_release_stripe(sh2); 4052 continue; 4053 } 4054 4055 /* place all the copies on one channel */ 4056 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 4057 tx = async_memcpy(sh2->dev[dd_idx].page, 4058 sh->dev[i].page, 0, 0, STRIPE_SIZE, 4059 &submit); 4060 4061 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 4062 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 4063 for (j = 0; j < conf->raid_disks; j++) 4064 if (j != sh2->pd_idx && 4065 j != sh2->qd_idx && 4066 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 4067 break; 4068 if (j == conf->raid_disks) { 4069 set_bit(STRIPE_EXPAND_READY, &sh2->state); 4070 set_bit(STRIPE_HANDLE, &sh2->state); 4071 } 4072 raid5_release_stripe(sh2); 4073 4074 } 4075 /* done submitting copies, wait for them to complete */ 4076 async_tx_quiesce(&tx); 4077 } 4078 4079 /* 4080 * handle_stripe - do things to a stripe. 4081 * 4082 * We lock the stripe by setting STRIPE_ACTIVE and then examine the 4083 * state of various bits to see what needs to be done. 4084 * Possible results: 4085 * return some read requests which now have data 4086 * return some write requests which are safely on storage 4087 * schedule a read on some buffers 4088 * schedule a write of some buffers 4089 * return confirmation of parity correctness 4090 * 4091 */ 4092 4093 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) 4094 { 4095 struct r5conf *conf = sh->raid_conf; 4096 int disks = sh->disks; 4097 struct r5dev *dev; 4098 int i; 4099 int do_recovery = 0; 4100 4101 memset(s, 0, sizeof(*s)); 4102 4103 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head; 4104 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; 4105 s->failed_num[0] = -1; 4106 s->failed_num[1] = -1; 4107 s->log_failed = r5l_log_disk_error(conf); 4108 4109 /* Now to look around and see what can be done */ 4110 rcu_read_lock(); 4111 for (i=disks; i--; ) { 4112 struct md_rdev *rdev; 4113 sector_t first_bad; 4114 int bad_sectors; 4115 int is_bad = 0; 4116 4117 dev = &sh->dev[i]; 4118 4119 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 4120 i, dev->flags, 4121 dev->toread, dev->towrite, dev->written); 4122 /* maybe we can reply to a read 4123 * 4124 * new wantfill requests are only permitted while 4125 * ops_complete_biofill is guaranteed to be inactive 4126 */ 4127 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 4128 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 4129 set_bit(R5_Wantfill, &dev->flags); 4130 4131 /* now count some things */ 4132 if (test_bit(R5_LOCKED, &dev->flags)) 4133 s->locked++; 4134 if (test_bit(R5_UPTODATE, &dev->flags)) 4135 s->uptodate++; 4136 if (test_bit(R5_Wantcompute, &dev->flags)) { 4137 s->compute++; 4138 BUG_ON(s->compute > 2); 4139 } 4140 4141 if (test_bit(R5_Wantfill, &dev->flags)) 4142 s->to_fill++; 4143 else if (dev->toread) 4144 s->to_read++; 4145 if (dev->towrite) { 4146 s->to_write++; 4147 if (!test_bit(R5_OVERWRITE, &dev->flags)) 4148 s->non_overwrite++; 4149 } 4150 if (dev->written) 4151 s->written++; 4152 /* Prefer to use the replacement for reads, but only 4153 * if it is recovered enough and has no bad blocks. 
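 * (i.e. the replacement's recovery_offset must already cover this stripe and
 * is_badblock() must report a clean range; otherwise fall back to the
 * original rdev below)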
4154 */ 4155 rdev = rcu_dereference(conf->disks[i].replacement); 4156 if (rdev && !test_bit(Faulty, &rdev->flags) && 4157 rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && 4158 !is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4159 &first_bad, &bad_sectors)) 4160 set_bit(R5_ReadRepl, &dev->flags); 4161 else { 4162 if (rdev && !test_bit(Faulty, &rdev->flags)) 4163 set_bit(R5_NeedReplace, &dev->flags); 4164 else 4165 clear_bit(R5_NeedReplace, &dev->flags); 4166 rdev = rcu_dereference(conf->disks[i].rdev); 4167 clear_bit(R5_ReadRepl, &dev->flags); 4168 } 4169 if (rdev && test_bit(Faulty, &rdev->flags)) 4170 rdev = NULL; 4171 if (rdev) { 4172 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, 4173 &first_bad, &bad_sectors); 4174 if (s->blocked_rdev == NULL 4175 && (test_bit(Blocked, &rdev->flags) 4176 || is_bad < 0)) { 4177 if (is_bad < 0) 4178 set_bit(BlockedBadBlocks, 4179 &rdev->flags); 4180 s->blocked_rdev = rdev; 4181 atomic_inc(&rdev->nr_pending); 4182 } 4183 } 4184 clear_bit(R5_Insync, &dev->flags); 4185 if (!rdev) 4186 /* Not in-sync */; 4187 else if (is_bad) { 4188 /* also not in-sync */ 4189 if (!test_bit(WriteErrorSeen, &rdev->flags) && 4190 test_bit(R5_UPTODATE, &dev->flags)) { 4191 /* treat as in-sync, but with a read error 4192 * which we can now try to correct 4193 */ 4194 set_bit(R5_Insync, &dev->flags); 4195 set_bit(R5_ReadError, &dev->flags); 4196 } 4197 } else if (test_bit(In_sync, &rdev->flags)) 4198 set_bit(R5_Insync, &dev->flags); 4199 else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) 4200 /* in sync if before recovery_offset */ 4201 set_bit(R5_Insync, &dev->flags); 4202 else if (test_bit(R5_UPTODATE, &dev->flags) && 4203 test_bit(R5_Expanded, &dev->flags)) 4204 /* If we've reshaped into here, we assume it is Insync. 4205 * We will shortly update recovery_offset to make 4206 * it official. 
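 * (the block is both R5_UPTODATE and R5_Expanded, so its data was written
 * as part of the reshape even though recovery_offset has not caught up yet)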
4207 */ 4208 set_bit(R5_Insync, &dev->flags); 4209 4210 if (test_bit(R5_WriteError, &dev->flags)) { 4211 /* This flag does not apply to '.replacement' 4212 * only to .rdev, so make sure to check that*/ 4213 struct md_rdev *rdev2 = rcu_dereference( 4214 conf->disks[i].rdev); 4215 if (rdev2 == rdev) 4216 clear_bit(R5_Insync, &dev->flags); 4217 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4218 s->handle_bad_blocks = 1; 4219 atomic_inc(&rdev2->nr_pending); 4220 } else 4221 clear_bit(R5_WriteError, &dev->flags); 4222 } 4223 if (test_bit(R5_MadeGood, &dev->flags)) { 4224 /* This flag does not apply to '.replacement' 4225 * only to .rdev, so make sure to check that*/ 4226 struct md_rdev *rdev2 = rcu_dereference( 4227 conf->disks[i].rdev); 4228 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4229 s->handle_bad_blocks = 1; 4230 atomic_inc(&rdev2->nr_pending); 4231 } else 4232 clear_bit(R5_MadeGood, &dev->flags); 4233 } 4234 if (test_bit(R5_MadeGoodRepl, &dev->flags)) { 4235 struct md_rdev *rdev2 = rcu_dereference( 4236 conf->disks[i].replacement); 4237 if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { 4238 s->handle_bad_blocks = 1; 4239 atomic_inc(&rdev2->nr_pending); 4240 } else 4241 clear_bit(R5_MadeGoodRepl, &dev->flags); 4242 } 4243 if (!test_bit(R5_Insync, &dev->flags)) { 4244 /* The ReadError flag will just be confusing now */ 4245 clear_bit(R5_ReadError, &dev->flags); 4246 clear_bit(R5_ReWrite, &dev->flags); 4247 } 4248 if (test_bit(R5_ReadError, &dev->flags)) 4249 clear_bit(R5_Insync, &dev->flags); 4250 if (!test_bit(R5_Insync, &dev->flags)) { 4251 if (s->failed < 2) 4252 s->failed_num[s->failed] = i; 4253 s->failed++; 4254 if (rdev && !test_bit(Faulty, &rdev->flags)) 4255 do_recovery = 1; 4256 } 4257 4258 if (test_bit(R5_InJournal, &dev->flags)) 4259 s->injournal++; 4260 if (test_bit(R5_InJournal, &dev->flags) && dev->written) 4261 s->just_cached++; 4262 } 4263 if (test_bit(STRIPE_SYNCING, &sh->state)) { 4264 /* If there is a failed device being replaced, 4265 * we must be recovering. 4266 * else if we are after recovery_cp, we must be syncing 4267 * else if MD_RECOVERY_REQUESTED is set, we also are syncing. 4268 * else we can only be replacing 4269 * sync and recovery both need to read all devices, and so 4270 * use the same flag. 4271 */ 4272 if (do_recovery || 4273 sh->sector >= conf->mddev->recovery_cp || 4274 test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) 4275 s->syncing = 1; 4276 else 4277 s->replacing = 1; 4278 } 4279 rcu_read_unlock(); 4280 } 4281 4282 static int clear_batch_ready(struct stripe_head *sh) 4283 { 4284 /* Return '1' if this is a member of batch, or 4285 * '0' if it is a lone stripe or a head which can now be 4286 * handled. 4287 */ 4288 struct stripe_head *tmp; 4289 if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) 4290 return (sh->batch_head && sh->batch_head != sh); 4291 spin_lock(&sh->stripe_lock); 4292 if (!sh->batch_head) { 4293 spin_unlock(&sh->stripe_lock); 4294 return 0; 4295 } 4296 4297 /* 4298 * this stripe could be added to a batch list before we check 4299 * BATCH_READY, skips it 4300 */ 4301 if (sh->batch_head != sh) { 4302 spin_unlock(&sh->stripe_lock); 4303 return 1; 4304 } 4305 spin_lock(&sh->batch_lock); 4306 list_for_each_entry(tmp, &sh->batch_list, batch_list) 4307 clear_bit(STRIPE_BATCH_READY, &tmp->state); 4308 spin_unlock(&sh->batch_lock); 4309 spin_unlock(&sh->stripe_lock); 4310 4311 /* 4312 * BATCH_READY is cleared, no new stripes can be added. 
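 * Every existing member also had STRIPE_BATCH_READY cleared under batch_lock
 * above, so the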
4313 * batch_list can be accessed without lock 4314 */ 4315 return 0; 4316 } 4317 4318 static void break_stripe_batch_list(struct stripe_head *head_sh, 4319 unsigned long handle_flags) 4320 { 4321 struct stripe_head *sh, *next; 4322 int i; 4323 int do_wakeup = 0; 4324 4325 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4326 4327 list_del_init(&sh->batch_list); 4328 4329 WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) | 4330 (1 << STRIPE_SYNCING) | 4331 (1 << STRIPE_REPLACED) | 4332 (1 << STRIPE_DELAYED) | 4333 (1 << STRIPE_BIT_DELAY) | 4334 (1 << STRIPE_FULL_WRITE) | 4335 (1 << STRIPE_BIOFILL_RUN) | 4336 (1 << STRIPE_COMPUTE_RUN) | 4337 (1 << STRIPE_OPS_REQ_PENDING) | 4338 (1 << STRIPE_DISCARD) | 4339 (1 << STRIPE_BATCH_READY) | 4340 (1 << STRIPE_BATCH_ERR) | 4341 (1 << STRIPE_BITMAP_PENDING)), 4342 "stripe state: %lx\n", sh->state); 4343 WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | 4344 (1 << STRIPE_REPLACED)), 4345 "head stripe state: %lx\n", head_sh->state); 4346 4347 set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | 4348 (1 << STRIPE_PREREAD_ACTIVE) | 4349 (1 << STRIPE_DEGRADED)), 4350 head_sh->state & (1 << STRIPE_INSYNC)); 4351 4352 sh->check_state = head_sh->check_state; 4353 sh->reconstruct_state = head_sh->reconstruct_state; 4354 for (i = 0; i < sh->disks; i++) { 4355 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4356 do_wakeup = 1; 4357 sh->dev[i].flags = head_sh->dev[i].flags & 4358 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4359 } 4360 spin_lock_irq(&sh->stripe_lock); 4361 sh->batch_head = NULL; 4362 spin_unlock_irq(&sh->stripe_lock); 4363 if (handle_flags == 0 || 4364 sh->state & handle_flags) 4365 set_bit(STRIPE_HANDLE, &sh->state); 4366 raid5_release_stripe(sh); 4367 } 4368 spin_lock_irq(&head_sh->stripe_lock); 4369 head_sh->batch_head = NULL; 4370 spin_unlock_irq(&head_sh->stripe_lock); 4371 for (i = 0; i < head_sh->disks; i++) 4372 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4373 do_wakeup = 1; 4374 if (head_sh->state & handle_flags) 4375 set_bit(STRIPE_HANDLE, &head_sh->state); 4376 4377 if (do_wakeup) 4378 wake_up(&head_sh->raid_conf->wait_for_overlap); 4379 } 4380 4381 static void handle_stripe(struct stripe_head *sh) 4382 { 4383 struct stripe_head_state s; 4384 struct r5conf *conf = sh->raid_conf; 4385 int i; 4386 int prexor; 4387 int disks = sh->disks; 4388 struct r5dev *pdev, *qdev; 4389 4390 clear_bit(STRIPE_HANDLE, &sh->state); 4391 if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { 4392 /* already being handled, ensure it gets handled 4393 * again when current action finishes */ 4394 set_bit(STRIPE_HANDLE, &sh->state); 4395 return; 4396 } 4397 4398 if (clear_batch_ready(sh) ) { 4399 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4400 return; 4401 } 4402 4403 if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) 4404 break_stripe_batch_list(sh, 0); 4405 4406 if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { 4407 spin_lock(&sh->stripe_lock); 4408 /* Cannot process 'sync' concurrently with 'discard' */ 4409 if (!test_bit(STRIPE_DISCARD, &sh->state) && 4410 test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { 4411 set_bit(STRIPE_SYNCING, &sh->state); 4412 clear_bit(STRIPE_INSYNC, &sh->state); 4413 clear_bit(STRIPE_REPLACED, &sh->state); 4414 } 4415 spin_unlock(&sh->stripe_lock); 4416 } 4417 clear_bit(STRIPE_DELAYED, &sh->state); 4418 4419 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 4420 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 4421 (unsigned long 
long)sh->sector, sh->state, 4422 atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, 4423 sh->check_state, sh->reconstruct_state); 4424 4425 analyse_stripe(sh, &s); 4426 4427 if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 4428 goto finish; 4429 4430 if (s.handle_bad_blocks) { 4431 set_bit(STRIPE_HANDLE, &sh->state); 4432 goto finish; 4433 } 4434 4435 if (unlikely(s.blocked_rdev)) { 4436 if (s.syncing || s.expanding || s.expanded || 4437 s.replacing || s.to_write || s.written) { 4438 set_bit(STRIPE_HANDLE, &sh->state); 4439 goto finish; 4440 } 4441 /* There is nothing for the blocked_rdev to block */ 4442 rdev_dec_pending(s.blocked_rdev, conf->mddev); 4443 s.blocked_rdev = NULL; 4444 } 4445 4446 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 4447 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 4448 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 4449 } 4450 4451 pr_debug("locked=%d uptodate=%d to_read=%d" 4452 " to_write=%d failed=%d failed_num=%d,%d\n", 4453 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 4454 s.failed_num[0], s.failed_num[1]); 4455 /* check if the array has lost more than max_degraded devices and, 4456 * if so, some requests might need to be failed. 4457 */ 4458 if (s.failed > conf->max_degraded || s.log_failed) { 4459 sh->check_state = 0; 4460 sh->reconstruct_state = 0; 4461 break_stripe_batch_list(sh, 0); 4462 if (s.to_read+s.to_write+s.written) 4463 handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); 4464 if (s.syncing + s.replacing) 4465 handle_failed_sync(conf, sh, &s); 4466 } 4467 4468 /* Now we check to see if any write operations have recently 4469 * completed 4470 */ 4471 prexor = 0; 4472 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 4473 prexor = 1; 4474 if (sh->reconstruct_state == reconstruct_state_drain_result || 4475 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 4476 sh->reconstruct_state = reconstruct_state_idle; 4477 4478 /* All the 'written' buffers and the parity block are ready to 4479 * be written back to disk 4480 */ 4481 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && 4482 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); 4483 BUG_ON(sh->qd_idx >= 0 && 4484 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && 4485 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); 4486 for (i = disks; i--; ) { 4487 struct r5dev *dev = &sh->dev[i]; 4488 if (test_bit(R5_LOCKED, &dev->flags) && 4489 (i == sh->pd_idx || i == sh->qd_idx || 4490 dev->written || test_bit(R5_InJournal, 4491 &dev->flags))) { 4492 pr_debug("Writing block %d\n", i); 4493 set_bit(R5_Wantwrite, &dev->flags); 4494 if (prexor) 4495 continue; 4496 if (s.failed > 1) 4497 continue; 4498 if (!test_bit(R5_Insync, &dev->flags) || 4499 ((i == sh->pd_idx || i == sh->qd_idx) && 4500 s.failed == 0)) 4501 set_bit(STRIPE_INSYNC, &sh->state); 4502 } 4503 } 4504 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4505 s.dec_preread_active = 1; 4506 } 4507 4508 /* 4509 * might be able to return some write requests if the parity blocks 4510 * are safe, or on a failed drive 4511 */ 4512 pdev = &sh->dev[sh->pd_idx]; 4513 s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) 4514 || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); 4515 qdev = &sh->dev[sh->qd_idx]; 4516 s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) 4517 || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) 4518 || conf->level < 6; 4519 4520 if (s.written && 4521 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 4522 && !test_bit(R5_LOCKED, 
&pdev->flags) 4523 && (test_bit(R5_UPTODATE, &pdev->flags) || 4524 test_bit(R5_Discard, &pdev->flags))))) && 4525 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 4526 && !test_bit(R5_LOCKED, &qdev->flags) 4527 && (test_bit(R5_UPTODATE, &qdev->flags) || 4528 test_bit(R5_Discard, &qdev->flags)))))) 4529 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 4530 4531 if (s.just_cached) 4532 r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); 4533 r5l_stripe_write_finished(sh); 4534 4535 /* Now we might consider reading some blocks, either to check/generate 4536 * parity, or to satisfy requests 4537 * or to load a block that is being partially written. 4538 */ 4539 if (s.to_read || s.non_overwrite 4540 || (conf->level == 6 && s.to_write && s.failed) 4541 || (s.syncing && (s.uptodate + s.compute < disks)) 4542 || s.replacing 4543 || s.expanding) 4544 handle_stripe_fill(sh, &s, disks); 4545 4546 /* 4547 * When the stripe finishes full journal write cycle (write to journal 4548 * and raid disk), this is the clean up procedure so it is ready for 4549 * next operation. 4550 */ 4551 r5c_finish_stripe_write_out(conf, sh, &s); 4552 4553 /* 4554 * Now to consider new write requests, cache write back and what else, 4555 * if anything should be read. We do not handle new writes when: 4556 * 1/ A 'write' operation (copy+xor) is already in flight. 4557 * 2/ A 'check' operation is in flight, as it may clobber the parity 4558 * block. 4559 * 3/ A r5c cache log write is in flight. 4560 */ 4561 4562 if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { 4563 if (!r5c_is_writeback(conf->log)) { 4564 if (s.to_write) 4565 handle_stripe_dirtying(conf, sh, &s, disks); 4566 } else { /* write back cache */ 4567 int ret = 0; 4568 4569 /* First, try handle writes in caching phase */ 4570 if (s.to_write) 4571 ret = r5c_try_caching_write(conf, sh, &s, 4572 disks); 4573 /* 4574 * If caching phase failed: ret == -EAGAIN 4575 * OR 4576 * stripe under reclaim: !caching && injournal 4577 * 4578 * fall back to handle_stripe_dirtying() 4579 */ 4580 if (ret == -EAGAIN || 4581 /* stripe under reclaim: !caching && injournal */ 4582 (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 4583 s.injournal > 0)) { 4584 ret = handle_stripe_dirtying(conf, sh, &s, 4585 disks); 4586 if (ret == -EAGAIN) 4587 goto finish; 4588 } 4589 } 4590 } 4591 4592 /* maybe we need to check and possibly fix the parity for this stripe 4593 * Any reads will already have been scheduled, so we just see if enough 4594 * data is available. The parity check is held off while parity 4595 * dependent operations are in flight. 
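 * (i.e. while STRIPE_COMPUTE_RUN is set or some buffers are still locked)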
4596 */ 4597 if (sh->check_state || 4598 (s.syncing && s.locked == 0 && 4599 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4600 !test_bit(STRIPE_INSYNC, &sh->state))) { 4601 if (conf->level == 6) 4602 handle_parity_checks6(conf, sh, &s, disks); 4603 else 4604 handle_parity_checks5(conf, sh, &s, disks); 4605 } 4606 4607 if ((s.replacing || s.syncing) && s.locked == 0 4608 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) 4609 && !test_bit(STRIPE_REPLACED, &sh->state)) { 4610 /* Write out to replacement devices where possible */ 4611 for (i = 0; i < conf->raid_disks; i++) 4612 if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { 4613 WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); 4614 set_bit(R5_WantReplace, &sh->dev[i].flags); 4615 set_bit(R5_LOCKED, &sh->dev[i].flags); 4616 s.locked++; 4617 } 4618 if (s.replacing) 4619 set_bit(STRIPE_INSYNC, &sh->state); 4620 set_bit(STRIPE_REPLACED, &sh->state); 4621 } 4622 if ((s.syncing || s.replacing) && s.locked == 0 && 4623 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 4624 test_bit(STRIPE_INSYNC, &sh->state)) { 4625 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4626 clear_bit(STRIPE_SYNCING, &sh->state); 4627 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 4628 wake_up(&conf->wait_for_overlap); 4629 } 4630 4631 /* If the failed drives are just a ReadError, then we might need 4632 * to progress the repair/check process 4633 */ 4634 if (s.failed <= conf->max_degraded && !conf->mddev->ro) 4635 for (i = 0; i < s.failed; i++) { 4636 struct r5dev *dev = &sh->dev[s.failed_num[i]]; 4637 if (test_bit(R5_ReadError, &dev->flags) 4638 && !test_bit(R5_LOCKED, &dev->flags) 4639 && test_bit(R5_UPTODATE, &dev->flags) 4640 ) { 4641 if (!test_bit(R5_ReWrite, &dev->flags)) { 4642 set_bit(R5_Wantwrite, &dev->flags); 4643 set_bit(R5_ReWrite, &dev->flags); 4644 set_bit(R5_LOCKED, &dev->flags); 4645 s.locked++; 4646 } else { 4647 /* let's read it back */ 4648 set_bit(R5_Wantread, &dev->flags); 4649 set_bit(R5_LOCKED, &dev->flags); 4650 s.locked++; 4651 } 4652 } 4653 } 4654 4655 /* Finish reconstruct operations initiated by the expansion process */ 4656 if (sh->reconstruct_state == reconstruct_state_result) { 4657 struct stripe_head *sh_src 4658 = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); 4659 if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { 4660 /* sh cannot be written until sh_src has been read. 
4661 * so arrange for sh to be delayed a little 4662 */ 4663 set_bit(STRIPE_DELAYED, &sh->state); 4664 set_bit(STRIPE_HANDLE, &sh->state); 4665 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 4666 &sh_src->state)) 4667 atomic_inc(&conf->preread_active_stripes); 4668 raid5_release_stripe(sh_src); 4669 goto finish; 4670 } 4671 if (sh_src) 4672 raid5_release_stripe(sh_src); 4673 4674 sh->reconstruct_state = reconstruct_state_idle; 4675 clear_bit(STRIPE_EXPANDING, &sh->state); 4676 for (i = conf->raid_disks; i--; ) { 4677 set_bit(R5_Wantwrite, &sh->dev[i].flags); 4678 set_bit(R5_LOCKED, &sh->dev[i].flags); 4679 s.locked++; 4680 } 4681 } 4682 4683 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 4684 !sh->reconstruct_state) { 4685 /* Need to write out all blocks after computing parity */ 4686 sh->disks = conf->raid_disks; 4687 stripe_set_idx(sh->sector, conf, 0, sh); 4688 schedule_reconstruction(sh, &s, 1, 1); 4689 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 4690 clear_bit(STRIPE_EXPAND_READY, &sh->state); 4691 atomic_dec(&conf->reshape_stripes); 4692 wake_up(&conf->wait_for_overlap); 4693 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 4694 } 4695 4696 if (s.expanding && s.locked == 0 && 4697 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 4698 handle_stripe_expansion(conf, sh); 4699 4700 finish: 4701 /* wait for this device to become unblocked */ 4702 if (unlikely(s.blocked_rdev)) { 4703 if (conf->mddev->external) 4704 md_wait_for_blocked_rdev(s.blocked_rdev, 4705 conf->mddev); 4706 else 4707 /* Internal metadata will immediately 4708 * be written by raid5d, so we don't 4709 * need to wait here. 4710 */ 4711 rdev_dec_pending(s.blocked_rdev, 4712 conf->mddev); 4713 } 4714 4715 if (s.handle_bad_blocks) 4716 for (i = disks; i--; ) { 4717 struct md_rdev *rdev; 4718 struct r5dev *dev = &sh->dev[i]; 4719 if (test_and_clear_bit(R5_WriteError, &dev->flags)) { 4720 /* We own a safe reference to the rdev */ 4721 rdev = conf->disks[i].rdev; 4722 if (!rdev_set_badblocks(rdev, sh->sector, 4723 STRIPE_SECTORS, 0)) 4724 md_error(conf->mddev, rdev); 4725 rdev_dec_pending(rdev, conf->mddev); 4726 } 4727 if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { 4728 rdev = conf->disks[i].rdev; 4729 rdev_clear_badblocks(rdev, sh->sector, 4730 STRIPE_SECTORS, 0); 4731 rdev_dec_pending(rdev, conf->mddev); 4732 } 4733 if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { 4734 rdev = conf->disks[i].replacement; 4735 if (!rdev) 4736 /* rdev have been moved down */ 4737 rdev = conf->disks[i].rdev; 4738 rdev_clear_badblocks(rdev, sh->sector, 4739 STRIPE_SECTORS, 0); 4740 rdev_dec_pending(rdev, conf->mddev); 4741 } 4742 } 4743 4744 if (s.ops_request) 4745 raid_run_ops(sh, s.ops_request); 4746 4747 ops_run_io(sh, &s); 4748 4749 if (s.dec_preread_active) { 4750 /* We delay this until after ops_run_io so that if make_request 4751 * is waiting on a flush, it won't continue until the writes 4752 * have actually been submitted. 
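 * (only then do we let preread_active_stripes drop below IO_THRESHOLD and
 * wake up the raid5d thread)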
4753 */ 4754 atomic_dec(&conf->preread_active_stripes); 4755 if (atomic_read(&conf->preread_active_stripes) < 4756 IO_THRESHOLD) 4757 md_wakeup_thread(conf->mddev->thread); 4758 } 4759 4760 if (!bio_list_empty(&s.return_bi)) { 4761 if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { 4762 spin_lock_irq(&conf->device_lock); 4763 bio_list_merge(&conf->return_bi, &s.return_bi); 4764 spin_unlock_irq(&conf->device_lock); 4765 md_wakeup_thread(conf->mddev->thread); 4766 } else 4767 return_io(&s.return_bi); 4768 } 4769 4770 clear_bit_unlock(STRIPE_ACTIVE, &sh->state); 4771 } 4772 4773 static void raid5_activate_delayed(struct r5conf *conf) 4774 { 4775 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 4776 while (!list_empty(&conf->delayed_list)) { 4777 struct list_head *l = conf->delayed_list.next; 4778 struct stripe_head *sh; 4779 sh = list_entry(l, struct stripe_head, lru); 4780 list_del_init(l); 4781 clear_bit(STRIPE_DELAYED, &sh->state); 4782 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4783 atomic_inc(&conf->preread_active_stripes); 4784 list_add_tail(&sh->lru, &conf->hold_list); 4785 raid5_wakeup_stripe_thread(sh); 4786 } 4787 } 4788 } 4789 4790 static void activate_bit_delay(struct r5conf *conf, 4791 struct list_head *temp_inactive_list) 4792 { 4793 /* device_lock is held */ 4794 struct list_head head; 4795 list_add(&head, &conf->bitmap_list); 4796 list_del_init(&conf->bitmap_list); 4797 while (!list_empty(&head)) { 4798 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 4799 int hash; 4800 list_del_init(&sh->lru); 4801 atomic_inc(&sh->count); 4802 hash = sh->hash_lock_index; 4803 __release_stripe(conf, sh, &temp_inactive_list[hash]); 4804 } 4805 } 4806 4807 static int raid5_congested(struct mddev *mddev, int bits) 4808 { 4809 struct r5conf *conf = mddev->private; 4810 4811 /* No difference between reads and writes. Just check 4812 * how busy the stripe_cache is 4813 */ 4814 4815 if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) 4816 return 1; 4817 4818 /* Also checks whether there is pressure on r5cache log space */ 4819 if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 4820 return 1; 4821 if (conf->quiesce) 4822 return 1; 4823 if (atomic_read(&conf->empty_inactive_list_nr)) 4824 return 1; 4825 4826 return 0; 4827 } 4828 4829 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) 4830 { 4831 struct r5conf *conf = mddev->private; 4832 sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); 4833 unsigned int chunk_sectors; 4834 unsigned int bio_sectors = bio_sectors(bio); 4835 4836 chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); 4837 return chunk_sectors >= 4838 ((sector & (chunk_sectors - 1)) + bio_sectors); 4839 } 4840 4841 /* 4842 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 4843 * later sampled by raid5d. 
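 * raid5d later pulls the entries back off via remove_bio_from_retry() and
 * retries the reads.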
4844 */ 4845 static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) 4846 { 4847 unsigned long flags; 4848 4849 spin_lock_irqsave(&conf->device_lock, flags); 4850 4851 bi->bi_next = conf->retry_read_aligned_list; 4852 conf->retry_read_aligned_list = bi; 4853 4854 spin_unlock_irqrestore(&conf->device_lock, flags); 4855 md_wakeup_thread(conf->mddev->thread); 4856 } 4857 4858 static struct bio *remove_bio_from_retry(struct r5conf *conf) 4859 { 4860 struct bio *bi; 4861 4862 bi = conf->retry_read_aligned; 4863 if (bi) { 4864 conf->retry_read_aligned = NULL; 4865 return bi; 4866 } 4867 bi = conf->retry_read_aligned_list; 4868 if(bi) { 4869 conf->retry_read_aligned_list = bi->bi_next; 4870 bi->bi_next = NULL; 4871 /* 4872 * this sets the active strip count to 1 and the processed 4873 * strip count to zero (upper 8 bits) 4874 */ 4875 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 4876 } 4877 4878 return bi; 4879 } 4880 4881 /* 4882 * The "raid5_align_endio" should check if the read succeeded and if it 4883 * did, call bio_endio on the original bio (having bio_put the new bio 4884 * first). 4885 * If the read failed.. 4886 */ 4887 static void raid5_align_endio(struct bio *bi) 4888 { 4889 struct bio* raid_bi = bi->bi_private; 4890 struct mddev *mddev; 4891 struct r5conf *conf; 4892 struct md_rdev *rdev; 4893 int error = bi->bi_error; 4894 4895 bio_put(bi); 4896 4897 rdev = (void*)raid_bi->bi_next; 4898 raid_bi->bi_next = NULL; 4899 mddev = rdev->mddev; 4900 conf = mddev->private; 4901 4902 rdev_dec_pending(rdev, conf->mddev); 4903 4904 if (!error) { 4905 trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), 4906 raid_bi, 0); 4907 bio_endio(raid_bi); 4908 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4909 wake_up(&conf->wait_for_quiescent); 4910 return; 4911 } 4912 4913 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 4914 4915 add_bio_to_retry(raid_bi, conf); 4916 } 4917 4918 static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) 4919 { 4920 struct r5conf *conf = mddev->private; 4921 int dd_idx; 4922 struct bio* align_bi; 4923 struct md_rdev *rdev; 4924 sector_t end_sector; 4925 4926 if (!in_chunk_boundary(mddev, raid_bio)) { 4927 pr_debug("%s: non aligned\n", __func__); 4928 return 0; 4929 } 4930 /* 4931 * use bio_clone_mddev to make a copy of the bio 4932 */ 4933 align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); 4934 if (!align_bi) 4935 return 0; 4936 /* 4937 * set bi_end_io to a new function, and set bi_private to the 4938 * original bio. 
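 * On completion raid5_align_endio() drops the clone and either completes the
 * original bio or queues it on the retry list.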
4939 */ 4940 align_bi->bi_end_io = raid5_align_endio; 4941 align_bi->bi_private = raid_bio; 4942 /* 4943 * compute position 4944 */ 4945 align_bi->bi_iter.bi_sector = 4946 raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, 4947 0, &dd_idx, NULL); 4948 4949 end_sector = bio_end_sector(align_bi); 4950 rcu_read_lock(); 4951 rdev = rcu_dereference(conf->disks[dd_idx].replacement); 4952 if (!rdev || test_bit(Faulty, &rdev->flags) || 4953 rdev->recovery_offset < end_sector) { 4954 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 4955 if (rdev && 4956 (test_bit(Faulty, &rdev->flags) || 4957 !(test_bit(In_sync, &rdev->flags) || 4958 rdev->recovery_offset >= end_sector))) 4959 rdev = NULL; 4960 } 4961 if (rdev) { 4962 sector_t first_bad; 4963 int bad_sectors; 4964 4965 atomic_inc(&rdev->nr_pending); 4966 rcu_read_unlock(); 4967 raid_bio->bi_next = (void*)rdev; 4968 align_bi->bi_bdev = rdev->bdev; 4969 bio_clear_flag(align_bi, BIO_SEG_VALID); 4970 4971 if (is_badblock(rdev, align_bi->bi_iter.bi_sector, 4972 bio_sectors(align_bi), 4973 &first_bad, &bad_sectors)) { 4974 bio_put(align_bi); 4975 rdev_dec_pending(rdev, mddev); 4976 return 0; 4977 } 4978 4979 /* No reshape active, so we can trust rdev->data_offset */ 4980 align_bi->bi_iter.bi_sector += rdev->data_offset; 4981 4982 spin_lock_irq(&conf->device_lock); 4983 wait_event_lock_irq(conf->wait_for_quiescent, 4984 conf->quiesce == 0, 4985 conf->device_lock); 4986 atomic_inc(&conf->active_aligned_reads); 4987 spin_unlock_irq(&conf->device_lock); 4988 4989 if (mddev->gendisk) 4990 trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), 4991 align_bi, disk_devt(mddev->gendisk), 4992 raid_bio->bi_iter.bi_sector); 4993 generic_make_request(align_bi); 4994 return 1; 4995 } else { 4996 rcu_read_unlock(); 4997 bio_put(align_bi); 4998 return 0; 4999 } 5000 } 5001 5002 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) 5003 { 5004 struct bio *split; 5005 5006 do { 5007 sector_t sector = raid_bio->bi_iter.bi_sector; 5008 unsigned chunk_sects = mddev->chunk_sectors; 5009 unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); 5010 5011 if (sectors < bio_sectors(raid_bio)) { 5012 split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); 5013 bio_chain(split, raid_bio); 5014 } else 5015 split = raid_bio; 5016 5017 if (!raid5_read_one_chunk(mddev, split)) { 5018 if (split != raid_bio) 5019 generic_make_request(raid_bio); 5020 return split; 5021 } 5022 } while (split != raid_bio); 5023 5024 return NULL; 5025 } 5026 5027 /* __get_priority_stripe - get the next stripe to process 5028 * 5029 * Full stripe writes are allowed to pass preread active stripes up until 5030 * the bypass_threshold is exceeded. In general the bypass_count 5031 * increments when the handle_list is handled before the hold_list; however, it 5032 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a 5033 * stripe with in flight i/o. The bypass_count will be reset when the 5034 * head of the hold_list has changed, i.e. the head was promoted to the 5035 * handle_list. 
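 * For example, with bypass_threshold == 4: once handle_list has been serviced
 * five times (by stripes that have not yet started i/o) while the same stripe
 * waits at the head of hold_list, bypass_count exceeds the threshold, so the
 * next call that finds handle_list empty services hold_list even though full
 * stripe writes are still pending, and bypass_count is then pulled back down
 * by the threshold.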
5036 */ 5037 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) 5038 { 5039 struct stripe_head *sh = NULL, *tmp; 5040 struct list_head *handle_list = NULL; 5041 struct r5worker_group *wg = NULL; 5042 5043 if (conf->worker_cnt_per_group == 0) { 5044 handle_list = &conf->handle_list; 5045 } else if (group != ANY_GROUP) { 5046 handle_list = &conf->worker_groups[group].handle_list; 5047 wg = &conf->worker_groups[group]; 5048 } else { 5049 int i; 5050 for (i = 0; i < conf->group_cnt; i++) { 5051 handle_list = &conf->worker_groups[i].handle_list; 5052 wg = &conf->worker_groups[i]; 5053 if (!list_empty(handle_list)) 5054 break; 5055 } 5056 } 5057 5058 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 5059 __func__, 5060 list_empty(handle_list) ? "empty" : "busy", 5061 list_empty(&conf->hold_list) ? "empty" : "busy", 5062 atomic_read(&conf->pending_full_writes), conf->bypass_count); 5063 5064 if (!list_empty(handle_list)) { 5065 sh = list_entry(handle_list->next, typeof(*sh), lru); 5066 5067 if (list_empty(&conf->hold_list)) 5068 conf->bypass_count = 0; 5069 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 5070 if (conf->hold_list.next == conf->last_hold) 5071 conf->bypass_count++; 5072 else { 5073 conf->last_hold = conf->hold_list.next; 5074 conf->bypass_count -= conf->bypass_threshold; 5075 if (conf->bypass_count < 0) 5076 conf->bypass_count = 0; 5077 } 5078 } 5079 } else if (!list_empty(&conf->hold_list) && 5080 ((conf->bypass_threshold && 5081 conf->bypass_count > conf->bypass_threshold) || 5082 atomic_read(&conf->pending_full_writes) == 0)) { 5083 5084 list_for_each_entry(tmp, &conf->hold_list, lru) { 5085 if (conf->worker_cnt_per_group == 0 || 5086 group == ANY_GROUP || 5087 !cpu_online(tmp->cpu) || 5088 cpu_to_group(tmp->cpu) == group) { 5089 sh = tmp; 5090 break; 5091 } 5092 } 5093 5094 if (sh) { 5095 conf->bypass_count -= conf->bypass_threshold; 5096 if (conf->bypass_count < 0) 5097 conf->bypass_count = 0; 5098 } 5099 wg = NULL; 5100 } 5101 5102 if (!sh) 5103 return NULL; 5104 5105 if (wg) { 5106 wg->stripes_cnt--; 5107 sh->group = NULL; 5108 } 5109 list_del_init(&sh->lru); 5110 BUG_ON(atomic_inc_return(&sh->count) != 1); 5111 return sh; 5112 } 5113 5114 struct raid5_plug_cb { 5115 struct blk_plug_cb cb; 5116 struct list_head list; 5117 struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; 5118 }; 5119 5120 static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) 5121 { 5122 struct raid5_plug_cb *cb = container_of( 5123 blk_cb, struct raid5_plug_cb, cb); 5124 struct stripe_head *sh; 5125 struct mddev *mddev = cb->cb.data; 5126 struct r5conf *conf = mddev->private; 5127 int cnt = 0; 5128 int hash; 5129 5130 if (cb->list.next && !list_empty(&cb->list)) { 5131 spin_lock_irq(&conf->device_lock); 5132 while (!list_empty(&cb->list)) { 5133 sh = list_first_entry(&cb->list, struct stripe_head, lru); 5134 list_del_init(&sh->lru); 5135 /* 5136 * avoid race release_stripe_plug() sees 5137 * STRIPE_ON_UNPLUG_LIST clear but the stripe 5138 * is still in our list 5139 */ 5140 smp_mb__before_atomic(); 5141 clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); 5142 /* 5143 * STRIPE_ON_RELEASE_LIST could be set here. 
In that 5144 * case, the count is always > 1 here 5145 */ 5146 hash = sh->hash_lock_index; 5147 __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); 5148 cnt++; 5149 } 5150 spin_unlock_irq(&conf->device_lock); 5151 } 5152 release_inactive_stripe_list(conf, cb->temp_inactive_list, 5153 NR_STRIPE_HASH_LOCKS); 5154 if (mddev->queue) 5155 trace_block_unplug(mddev->queue, cnt, !from_schedule); 5156 kfree(cb); 5157 } 5158 5159 static void release_stripe_plug(struct mddev *mddev, 5160 struct stripe_head *sh) 5161 { 5162 struct blk_plug_cb *blk_cb = blk_check_plugged( 5163 raid5_unplug, mddev, 5164 sizeof(struct raid5_plug_cb)); 5165 struct raid5_plug_cb *cb; 5166 5167 if (!blk_cb) { 5168 raid5_release_stripe(sh); 5169 return; 5170 } 5171 5172 cb = container_of(blk_cb, struct raid5_plug_cb, cb); 5173 5174 if (cb->list.next == NULL) { 5175 int i; 5176 INIT_LIST_HEAD(&cb->list); 5177 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5178 INIT_LIST_HEAD(cb->temp_inactive_list + i); 5179 } 5180 5181 if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) 5182 list_add_tail(&sh->lru, &cb->list); 5183 else 5184 raid5_release_stripe(sh); 5185 } 5186 5187 static void make_discard_request(struct mddev *mddev, struct bio *bi) 5188 { 5189 struct r5conf *conf = mddev->private; 5190 sector_t logical_sector, last_sector; 5191 struct stripe_head *sh; 5192 int remaining; 5193 int stripe_sectors; 5194 5195 if (mddev->reshape_position != MaxSector) 5196 /* Skip discard while reshape is happening */ 5197 return; 5198 5199 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5200 last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); 5201 5202 bi->bi_next = NULL; 5203 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5204 5205 stripe_sectors = conf->chunk_sectors * 5206 (conf->raid_disks - conf->max_degraded); 5207 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, 5208 stripe_sectors); 5209 sector_div(last_sector, stripe_sectors); 5210 5211 logical_sector *= conf->chunk_sectors; 5212 last_sector *= conf->chunk_sectors; 5213 5214 for (; logical_sector < last_sector; 5215 logical_sector += STRIPE_SECTORS) { 5216 DEFINE_WAIT(w); 5217 int d; 5218 again: 5219 sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); 5220 prepare_to_wait(&conf->wait_for_overlap, &w, 5221 TASK_UNINTERRUPTIBLE); 5222 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5223 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5224 raid5_release_stripe(sh); 5225 schedule(); 5226 goto again; 5227 } 5228 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5229 spin_lock_irq(&sh->stripe_lock); 5230 for (d = 0; d < conf->raid_disks; d++) { 5231 if (d == sh->pd_idx || d == sh->qd_idx) 5232 continue; 5233 if (sh->dev[d].towrite || sh->dev[d].toread) { 5234 set_bit(R5_Overlap, &sh->dev[d].flags); 5235 spin_unlock_irq(&sh->stripe_lock); 5236 raid5_release_stripe(sh); 5237 schedule(); 5238 goto again; 5239 } 5240 } 5241 set_bit(STRIPE_DISCARD, &sh->state); 5242 finish_wait(&conf->wait_for_overlap, &w); 5243 sh->overwrite_disks = 0; 5244 for (d = 0; d < conf->raid_disks; d++) { 5245 if (d == sh->pd_idx || d == sh->qd_idx) 5246 continue; 5247 sh->dev[d].towrite = bi; 5248 set_bit(R5_OVERWRITE, &sh->dev[d].flags); 5249 raid5_inc_bi_active_stripes(bi); 5250 sh->overwrite_disks++; 5251 } 5252 spin_unlock_irq(&sh->stripe_lock); 5253 if (conf->mddev->bitmap) { 5254 for (d = 0; 5255 d < conf->raid_disks - conf->max_degraded; 5256 d++) 5257 bitmap_startwrite(mddev->bitmap, 5258 sh->sector, 5259 STRIPE_SECTORS, 5260 0); 
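/* remember which bitmap batch this stripe belongs to; STRIPE_BIT_DELAY holds the stripe back until that batch has been written out */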
5261 sh->bm_seq = conf->seq_flush + 1; 5262 set_bit(STRIPE_BIT_DELAY, &sh->state); 5263 } 5264 5265 set_bit(STRIPE_HANDLE, &sh->state); 5266 clear_bit(STRIPE_DELAYED, &sh->state); 5267 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5268 atomic_inc(&conf->preread_active_stripes); 5269 release_stripe_plug(mddev, sh); 5270 } 5271 5272 remaining = raid5_dec_bi_active_stripes(bi); 5273 if (remaining == 0) { 5274 md_write_end(mddev); 5275 bio_endio(bi); 5276 } 5277 } 5278 5279 static void raid5_make_request(struct mddev *mddev, struct bio * bi) 5280 { 5281 struct r5conf *conf = mddev->private; 5282 int dd_idx; 5283 sector_t new_sector; 5284 sector_t logical_sector, last_sector; 5285 struct stripe_head *sh; 5286 const int rw = bio_data_dir(bi); 5287 int remaining; 5288 DEFINE_WAIT(w); 5289 bool do_prepare; 5290 bool do_flush = false; 5291 5292 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { 5293 int ret = r5l_handle_flush_request(conf->log, bi); 5294 5295 if (ret == 0) 5296 return; 5297 if (ret == -ENODEV) { 5298 md_flush_request(mddev, bi); 5299 return; 5300 } 5301 /* ret == -EAGAIN, fallback */ 5302 /* 5303 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, 5304 * we need to flush journal device 5305 */ 5306 do_flush = bi->bi_opf & REQ_PREFLUSH; 5307 } 5308 5309 md_write_start(mddev, bi); 5310 5311 /* 5312 * If array is degraded, better not do chunk aligned read because 5313 * later we might have to read it again in order to reconstruct 5314 * data on failed drives. 5315 */ 5316 if (rw == READ && mddev->degraded == 0 && 5317 !r5c_is_writeback(conf->log) && 5318 mddev->reshape_position == MaxSector) { 5319 bi = chunk_aligned_read(mddev, bi); 5320 if (!bi) 5321 return; 5322 } 5323 5324 if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { 5325 make_discard_request(mddev, bi); 5326 return; 5327 } 5328 5329 logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); 5330 last_sector = bio_end_sector(bi); 5331 bi->bi_next = NULL; 5332 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 5333 5334 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 5335 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 5336 int previous; 5337 int seq; 5338 5339 do_prepare = false; 5340 retry: 5341 seq = read_seqcount_begin(&conf->gen_lock); 5342 previous = 0; 5343 if (do_prepare) 5344 prepare_to_wait(&conf->wait_for_overlap, &w, 5345 TASK_UNINTERRUPTIBLE); 5346 if (unlikely(conf->reshape_progress != MaxSector)) { 5347 /* spinlock is needed as reshape_progress may be 5348 * 64bit on a 32bit platform, and so it might be 5349 * possible to see a half-updated value 5350 * Of course reshape_progress could change after 5351 * the lock is dropped, so once we get a reference 5352 * to the stripe that we think it is, we will have 5353 * to check again. 5354 */ 5355 spin_lock_irq(&conf->device_lock); 5356 if (mddev->reshape_backwards 5357 ? logical_sector < conf->reshape_progress 5358 : logical_sector >= conf->reshape_progress) { 5359 previous = 1; 5360 } else { 5361 if (mddev->reshape_backwards 5362 ? 
logical_sector < conf->reshape_safe 5363 : logical_sector >= conf->reshape_safe) { 5364 spin_unlock_irq(&conf->device_lock); 5365 schedule(); 5366 do_prepare = true; 5367 goto retry; 5368 } 5369 } 5370 spin_unlock_irq(&conf->device_lock); 5371 } 5372 5373 new_sector = raid5_compute_sector(conf, logical_sector, 5374 previous, 5375 &dd_idx, NULL); 5376 pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", 5377 (unsigned long long)new_sector, 5378 (unsigned long long)logical_sector); 5379 5380 sh = raid5_get_active_stripe(conf, new_sector, previous, 5381 (bi->bi_opf & REQ_RAHEAD), 0); 5382 if (sh) { 5383 if (unlikely(previous)) { 5384 /* expansion might have moved on while waiting for a 5385 * stripe, so we must do the range check again. 5386 * Expansion could still move past after this 5387 * test, but as we are holding a reference to 5388 * 'sh', we know that if that happens, 5389 * STRIPE_EXPANDING will get set and the expansion 5390 * won't proceed until we finish with the stripe. 5391 */ 5392 int must_retry = 0; 5393 spin_lock_irq(&conf->device_lock); 5394 if (mddev->reshape_backwards 5395 ? logical_sector >= conf->reshape_progress 5396 : logical_sector < conf->reshape_progress) 5397 /* mismatch, need to try again */ 5398 must_retry = 1; 5399 spin_unlock_irq(&conf->device_lock); 5400 if (must_retry) { 5401 raid5_release_stripe(sh); 5402 schedule(); 5403 do_prepare = true; 5404 goto retry; 5405 } 5406 } 5407 if (read_seqcount_retry(&conf->gen_lock, seq)) { 5408 /* Might have got the wrong stripe_head 5409 * by accident 5410 */ 5411 raid5_release_stripe(sh); 5412 goto retry; 5413 } 5414 5415 if (rw == WRITE && 5416 logical_sector >= mddev->suspend_lo && 5417 logical_sector < mddev->suspend_hi) { 5418 raid5_release_stripe(sh); 5419 /* As the suspend_* range is controlled by 5420 * userspace, we want an interruptible 5421 * wait. 5422 */ 5423 flush_signals(current); 5424 prepare_to_wait(&conf->wait_for_overlap, 5425 &w, TASK_INTERRUPTIBLE); 5426 if (logical_sector >= mddev->suspend_lo && 5427 logical_sector < mddev->suspend_hi) { 5428 schedule(); 5429 do_prepare = true; 5430 } 5431 goto retry; 5432 } 5433 5434 if (test_bit(STRIPE_EXPANDING, &sh->state) || 5435 !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { 5436 /* Stripe is busy expanding or 5437 * add failed due to overlap. 
Flush everything 5438 * and wait a while 5439 */ 5440 md_wakeup_thread(mddev->thread); 5441 raid5_release_stripe(sh); 5442 schedule(); 5443 do_prepare = true; 5444 goto retry; 5445 } 5446 if (do_flush) { 5447 set_bit(STRIPE_R5C_PREFLUSH, &sh->state); 5448 /* we only need flush for one stripe */ 5449 do_flush = false; 5450 } 5451 5452 set_bit(STRIPE_HANDLE, &sh->state); 5453 clear_bit(STRIPE_DELAYED, &sh->state); 5454 if ((!sh->batch_head || sh == sh->batch_head) && 5455 (bi->bi_opf & REQ_SYNC) && 5456 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 5457 atomic_inc(&conf->preread_active_stripes); 5458 release_stripe_plug(mddev, sh); 5459 } else { 5460 /* cannot get stripe for read-ahead, just give-up */ 5461 bi->bi_error = -EIO; 5462 break; 5463 } 5464 } 5465 finish_wait(&conf->wait_for_overlap, &w); 5466 5467 remaining = raid5_dec_bi_active_stripes(bi); 5468 if (remaining == 0) { 5469 5470 if ( rw == WRITE ) 5471 md_write_end(mddev); 5472 5473 trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), 5474 bi, 0); 5475 bio_endio(bi); 5476 } 5477 } 5478 5479 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); 5480 5481 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) 5482 { 5483 /* reshaping is quite different to recovery/resync so it is 5484 * handled quite separately ... here. 5485 * 5486 * On each call to sync_request, we gather one chunk worth of 5487 * destination stripes and flag them as expanding. 5488 * Then we find all the source stripes and request reads. 5489 * As the reads complete, handle_stripe will copy the data 5490 * into the destination stripe and release that stripe. 5491 */ 5492 struct r5conf *conf = mddev->private; 5493 struct stripe_head *sh; 5494 sector_t first_sector, last_sector; 5495 int raid_disks = conf->previous_raid_disks; 5496 int data_disks = raid_disks - conf->max_degraded; 5497 int new_data_disks = conf->raid_disks - conf->max_degraded; 5498 int i; 5499 int dd_idx; 5500 sector_t writepos, readpos, safepos; 5501 sector_t stripe_addr; 5502 int reshape_sectors; 5503 struct list_head stripes; 5504 sector_t retn; 5505 5506 if (sector_nr == 0) { 5507 /* If restarting in the middle, skip the initial sectors */ 5508 if (mddev->reshape_backwards && 5509 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 5510 sector_nr = raid5_size(mddev, 0, 0) 5511 - conf->reshape_progress; 5512 } else if (mddev->reshape_backwards && 5513 conf->reshape_progress == MaxSector) { 5514 /* shouldn't happen, but just in case, finish up.*/ 5515 sector_nr = MaxSector; 5516 } else if (!mddev->reshape_backwards && 5517 conf->reshape_progress > 0) 5518 sector_nr = conf->reshape_progress; 5519 sector_div(sector_nr, new_data_disks); 5520 if (sector_nr) { 5521 mddev->curr_resync_completed = sector_nr; 5522 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5523 *skipped = 1; 5524 retn = sector_nr; 5525 goto finish; 5526 } 5527 } 5528 5529 /* We need to process a full chunk at a time. 5530 * If old and new chunk sizes differ, we need to process the 5531 * largest of these 5532 */ 5533 5534 reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); 5535 5536 /* We update the metadata at least every 10 seconds, or when 5537 * the data about to be copied would over-write the source of 5538 * the data at the front of the range. i.e. 
one new_stripe 5539 * along from reshape_progress new_maps to after where 5540 * reshape_safe old_maps to 5541 */ 5542 writepos = conf->reshape_progress; 5543 sector_div(writepos, new_data_disks); 5544 readpos = conf->reshape_progress; 5545 sector_div(readpos, data_disks); 5546 safepos = conf->reshape_safe; 5547 sector_div(safepos, data_disks); 5548 if (mddev->reshape_backwards) { 5549 BUG_ON(writepos < reshape_sectors); 5550 writepos -= reshape_sectors; 5551 readpos += reshape_sectors; 5552 safepos += reshape_sectors; 5553 } else { 5554 writepos += reshape_sectors; 5555 /* readpos and safepos are worst-case calculations. 5556 * A negative number is overly pessimistic, and causes 5557 * obvious problems for unsigned storage. So clip to 0. 5558 */ 5559 readpos -= min_t(sector_t, reshape_sectors, readpos); 5560 safepos -= min_t(sector_t, reshape_sectors, safepos); 5561 } 5562 5563 /* Having calculated the 'writepos' possibly use it 5564 * to set 'stripe_addr' which is where we will write to. 5565 */ 5566 if (mddev->reshape_backwards) { 5567 BUG_ON(conf->reshape_progress == 0); 5568 stripe_addr = writepos; 5569 BUG_ON((mddev->dev_sectors & 5570 ~((sector_t)reshape_sectors - 1)) 5571 - reshape_sectors - stripe_addr 5572 != sector_nr); 5573 } else { 5574 BUG_ON(writepos != sector_nr + reshape_sectors); 5575 stripe_addr = sector_nr; 5576 } 5577 5578 /* 'writepos' is the most advanced device address we might write. 5579 * 'readpos' is the least advanced device address we might read. 5580 * 'safepos' is the least address recorded in the metadata as having 5581 * been reshaped. 5582 * If there is a min_offset_diff, these are adjusted either by 5583 * increasing the safepos/readpos if diff is negative, or 5584 * increasing writepos if diff is positive. 5585 * If 'readpos' is then behind 'writepos', there is no way that we can 5586 * ensure safety in the face of a crash - that must be done by userspace 5587 * making a backup of the data. So in that case there is no particular 5588 * rush to update metadata. 5589 * Otherwise if 'safepos' is behind 'writepos', then we really need to 5590 * update the metadata to advance 'safepos' to match 'readpos' so that 5591 * we can be safe in the event of a crash. 5592 * So we insist on updating metadata if safepos is behind writepos and 5593 * readpos is beyond writepos. 5594 * In any case, update the metadata every 10 seconds. 5595 * Maybe that number should be configurable, but I'm not sure it is 5596 * worth it.... maybe it could be a multiple of safemode_delay??? 5597 */ 5598 if (conf->min_offset_diff < 0) { 5599 safepos += -conf->min_offset_diff; 5600 readpos += -conf->min_offset_diff; 5601 } else 5602 writepos += conf->min_offset_diff; 5603 5604 if ((mddev->reshape_backwards 5605 ? (safepos > writepos && readpos < writepos) 5606 : (safepos < writepos && readpos > writepos)) || 5607 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 5608 /* Cannot proceed until we've updated the superblock... 
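 * (quiesce in-flight reshape stripes, record reshape_position, then advance
 * reshape_safe once the superblock write has completed)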
*/ 5609 wait_event(conf->wait_for_overlap, 5610 atomic_read(&conf->reshape_stripes)==0 5611 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5612 if (atomic_read(&conf->reshape_stripes) != 0) 5613 return 0; 5614 mddev->reshape_position = conf->reshape_progress; 5615 mddev->curr_resync_completed = sector_nr; 5616 conf->reshape_checkpoint = jiffies; 5617 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 5618 md_wakeup_thread(mddev->thread); 5619 wait_event(mddev->sb_wait, mddev->sb_flags == 0 || 5620 test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 5621 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5622 return 0; 5623 spin_lock_irq(&conf->device_lock); 5624 conf->reshape_safe = mddev->reshape_position; 5625 spin_unlock_irq(&conf->device_lock); 5626 wake_up(&conf->wait_for_overlap); 5627 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5628 } 5629 5630 INIT_LIST_HEAD(&stripes); 5631 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 5632 int j; 5633 int skipped_disk = 0; 5634 sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 5635 set_bit(STRIPE_EXPANDING, &sh->state); 5636 atomic_inc(&conf->reshape_stripes); 5637 /* If any of this stripe is beyond the end of the old 5638 * array, then we need to zero those blocks 5639 */ 5640 for (j=sh->disks; j--;) { 5641 sector_t s; 5642 if (j == sh->pd_idx) 5643 continue; 5644 if (conf->level == 6 && 5645 j == sh->qd_idx) 5646 continue; 5647 s = raid5_compute_blocknr(sh, j, 0); 5648 if (s < raid5_size(mddev, 0, 0)) { 5649 skipped_disk = 1; 5650 continue; 5651 } 5652 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 5653 set_bit(R5_Expanded, &sh->dev[j].flags); 5654 set_bit(R5_UPTODATE, &sh->dev[j].flags); 5655 } 5656 if (!skipped_disk) { 5657 set_bit(STRIPE_EXPAND_READY, &sh->state); 5658 set_bit(STRIPE_HANDLE, &sh->state); 5659 } 5660 list_add(&sh->lru, &stripes); 5661 } 5662 spin_lock_irq(&conf->device_lock); 5663 if (mddev->reshape_backwards) 5664 conf->reshape_progress -= reshape_sectors * new_data_disks; 5665 else 5666 conf->reshape_progress += reshape_sectors * new_data_disks; 5667 spin_unlock_irq(&conf->device_lock); 5668 /* Ok, those stripe are ready. We can start scheduling 5669 * reads on the source stripes. 5670 * The source stripes are determined by mapping the first and last 5671 * block on the destination stripes. 5672 */ 5673 first_sector = 5674 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 5675 1, &dd_idx, NULL); 5676 last_sector = 5677 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 5678 * new_data_disks - 1), 5679 1, &dd_idx, NULL); 5680 if (last_sector >= mddev->dev_sectors) 5681 last_sector = mddev->dev_sectors - 1; 5682 while (first_sector <= last_sector) { 5683 sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); 5684 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 5685 set_bit(STRIPE_HANDLE, &sh->state); 5686 raid5_release_stripe(sh); 5687 first_sector += STRIPE_SECTORS; 5688 } 5689 /* Now that the sources are clearly marked, we can release 5690 * the destination stripes 5691 */ 5692 while (!list_empty(&stripes)) { 5693 sh = list_entry(stripes.next, struct stripe_head, lru); 5694 list_del_init(&sh->lru); 5695 raid5_release_stripe(sh); 5696 } 5697 /* If this takes us to the resync_max point where we have to pause, 5698 * then we need to write out the superblock. 
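 *
 * Concretely (see the condition just below): either curr_resync_completed
 * has already passed resync_max, or this call has covered at least half
 * of what remained between curr_resync_completed and resync_max, so a
 * checkpoint is written before md_do_sync pauses at resync_max.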
5699 */
5700 sector_nr += reshape_sectors;
5701 retn = reshape_sectors;
5702 finish:
5703 if (mddev->curr_resync_completed > mddev->resync_max ||
5704 (sector_nr - mddev->curr_resync_completed) * 2
5705 >= mddev->resync_max - mddev->curr_resync_completed) {
5706 /* Cannot proceed until we've updated the superblock... */
5707 wait_event(conf->wait_for_overlap,
5708 atomic_read(&conf->reshape_stripes) == 0
5709 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5710 if (atomic_read(&conf->reshape_stripes) != 0)
5711 goto ret;
5712 mddev->reshape_position = conf->reshape_progress;
5713 mddev->curr_resync_completed = sector_nr;
5714 conf->reshape_checkpoint = jiffies;
5715 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5716 md_wakeup_thread(mddev->thread);
5717 wait_event(mddev->sb_wait,
5718 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
5719 || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5720 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5721 goto ret;
5722 spin_lock_irq(&conf->device_lock);
5723 conf->reshape_safe = mddev->reshape_position;
5724 spin_unlock_irq(&conf->device_lock);
5725 wake_up(&conf->wait_for_overlap);
5726 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5727 }
5728 ret:
5729 return retn;
5730 }
5731
5732 static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
5733 int *skipped)
5734 {
5735 struct r5conf *conf = mddev->private;
5736 struct stripe_head *sh;
5737 sector_t max_sector = mddev->dev_sectors;
5738 sector_t sync_blocks;
5739 int still_degraded = 0;
5740 int i;
5741
5742 if (sector_nr >= max_sector) {
5743 /* just being told to finish up .. nothing much to do */
5744
5745 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
5746 end_reshape(conf);
5747 return 0;
5748 }
5749
5750 if (mddev->curr_resync < max_sector) /* aborted */
5751 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
5752 &sync_blocks, 1);
5753 else /* completed sync */
5754 conf->fullsync = 0;
5755 bitmap_close_sync(mddev->bitmap);
5756
5757 return 0;
5758 }
5759
5760 /* Allow raid5_quiesce to complete */
5761 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
5762
5763 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5764 return reshape_request(mddev, sector_nr, skipped);
5765
5766 /* No need to check resync_max as we never do more than one
5767 * stripe, and as resync_max will always be on a chunk boundary,
5768 * if the check in md_do_sync didn't fire, there is no chance
5769 * of overstepping resync_max here
5770 */
5771
5772 /* if there are too many failed drives and we are trying
5773 * to resync, then assert that we are finished, because there is
5774 * nothing we can do.
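 * Instead we report the rest of the device range as already handled
 * (*skipped = 1 and return the remaining sectors), so md_do_sync winds
 * straight to the end without touching any more stripes.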
5775 */
5776 if (mddev->degraded >= conf->max_degraded &&
5777 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5778 sector_t rv = mddev->dev_sectors - sector_nr;
5779 *skipped = 1;
5780 return rv;
5781 }
5782 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
5783 !conf->fullsync &&
5784 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
5785 sync_blocks >= STRIPE_SECTORS) {
5786 /* we can skip this block, and probably more */
5787 sync_blocks /= STRIPE_SECTORS;
5788 *skipped = 1;
5789 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
5790 }
5791
5792 bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
5793
5794 sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
5795 if (sh == NULL) {
5796 sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
5797 /* make sure we don't swamp the stripe cache if someone else
5798 * is trying to get access
5799 */
5800 schedule_timeout_uninterruptible(1);
5801 }
5802 /* Need to check if array will still be degraded after recovery/resync
5803 * Note in case of > 1 drive failures it's possible we're rebuilding
5804 * one drive while leaving another faulty drive in the array.
5805 */
5806 rcu_read_lock();
5807 for (i = 0; i < conf->raid_disks; i++) {
5808 struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
5809
5810 if (rdev == NULL || test_bit(Faulty, &rdev->flags))
5811 still_degraded = 1;
5812 }
5813 rcu_read_unlock();
5814
5815 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
5816
5817 set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
5818 set_bit(STRIPE_HANDLE, &sh->state);
5819
5820 raid5_release_stripe(sh);
5821
5822 return STRIPE_SECTORS;
5823 }
5824
5825 static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
5826 {
5827 /* We may not be able to submit a whole bio at once as there
5828 * may not be enough stripe_heads available.
5829 * We cannot pre-allocate enough stripe_heads as we may need
5830 * more than exist in the cache (if we allow ever larger chunks).
5831 * So we do one stripe head at a time and record in
5832 * ->bi_hw_segments how many have been done.
5833 *
5834 * We *know* that this entire raid_bio is in one chunk, so
5835 * it will use only one 'dd_idx' and only need one call to raid5_compute_sector.
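 *
 * A rough illustration with invented numbers: with 4KiB stripe pages a
 * 256KiB aligned read spans 64 stripe units.  If only the first 40
 * stripe_heads can be obtained, 40 is recorded in the bio, the bio is
 * parked on conf->retry_read_aligned, and the next pass skips those 40
 * units and carries on from unit 41.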
5836 */ 5837 struct stripe_head *sh; 5838 int dd_idx; 5839 sector_t sector, logical_sector, last_sector; 5840 int scnt = 0; 5841 int remaining; 5842 int handled = 0; 5843 5844 logical_sector = raid_bio->bi_iter.bi_sector & 5845 ~((sector_t)STRIPE_SECTORS-1); 5846 sector = raid5_compute_sector(conf, logical_sector, 5847 0, &dd_idx, NULL); 5848 last_sector = bio_end_sector(raid_bio); 5849 5850 for (; logical_sector < last_sector; 5851 logical_sector += STRIPE_SECTORS, 5852 sector += STRIPE_SECTORS, 5853 scnt++) { 5854 5855 if (scnt < raid5_bi_processed_stripes(raid_bio)) 5856 /* already done this stripe */ 5857 continue; 5858 5859 sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); 5860 5861 if (!sh) { 5862 /* failed to get a stripe - must wait */ 5863 raid5_set_bi_processed_stripes(raid_bio, scnt); 5864 conf->retry_read_aligned = raid_bio; 5865 return handled; 5866 } 5867 5868 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { 5869 raid5_release_stripe(sh); 5870 raid5_set_bi_processed_stripes(raid_bio, scnt); 5871 conf->retry_read_aligned = raid_bio; 5872 return handled; 5873 } 5874 5875 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 5876 handle_stripe(sh); 5877 raid5_release_stripe(sh); 5878 handled++; 5879 } 5880 remaining = raid5_dec_bi_active_stripes(raid_bio); 5881 if (remaining == 0) { 5882 trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), 5883 raid_bio, 0); 5884 bio_endio(raid_bio); 5885 } 5886 if (atomic_dec_and_test(&conf->active_aligned_reads)) 5887 wake_up(&conf->wait_for_quiescent); 5888 return handled; 5889 } 5890 5891 static int handle_active_stripes(struct r5conf *conf, int group, 5892 struct r5worker *worker, 5893 struct list_head *temp_inactive_list) 5894 { 5895 struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; 5896 int i, batch_size = 0, hash; 5897 bool release_inactive = false; 5898 5899 while (batch_size < MAX_STRIPE_BATCH && 5900 (sh = __get_priority_stripe(conf, group)) != NULL) 5901 batch[batch_size++] = sh; 5902 5903 if (batch_size == 0) { 5904 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 5905 if (!list_empty(temp_inactive_list + i)) 5906 break; 5907 if (i == NR_STRIPE_HASH_LOCKS) { 5908 spin_unlock_irq(&conf->device_lock); 5909 r5l_flush_stripe_to_raid(conf->log); 5910 spin_lock_irq(&conf->device_lock); 5911 return batch_size; 5912 } 5913 release_inactive = true; 5914 } 5915 spin_unlock_irq(&conf->device_lock); 5916 5917 release_inactive_stripe_list(conf, temp_inactive_list, 5918 NR_STRIPE_HASH_LOCKS); 5919 5920 r5l_flush_stripe_to_raid(conf->log); 5921 if (release_inactive) { 5922 spin_lock_irq(&conf->device_lock); 5923 return 0; 5924 } 5925 5926 for (i = 0; i < batch_size; i++) 5927 handle_stripe(batch[i]); 5928 r5l_write_stripe_run(conf->log); 5929 5930 cond_resched(); 5931 5932 spin_lock_irq(&conf->device_lock); 5933 for (i = 0; i < batch_size; i++) { 5934 hash = batch[i]->hash_lock_index; 5935 __release_stripe(conf, batch[i], &temp_inactive_list[hash]); 5936 } 5937 return batch_size; 5938 } 5939 5940 static void raid5_do_work(struct work_struct *work) 5941 { 5942 struct r5worker *worker = container_of(work, struct r5worker, work); 5943 struct r5worker_group *group = worker->group; 5944 struct r5conf *conf = group->conf; 5945 int group_id = group - conf->worker_groups; 5946 int handled; 5947 struct blk_plug plug; 5948 5949 pr_debug("+++ raid5worker active\n"); 5950 5951 blk_start_plug(&plug); 5952 handled = 0; 5953 spin_lock_irq(&conf->device_lock); 5954 while (1) { 5955 int batch_size, released; 5956 5957 released = release_stripe_list(conf, 
worker->temp_inactive_list); 5958 5959 batch_size = handle_active_stripes(conf, group_id, worker, 5960 worker->temp_inactive_list); 5961 worker->working = false; 5962 if (!batch_size && !released) 5963 break; 5964 handled += batch_size; 5965 } 5966 pr_debug("%d stripes handled\n", handled); 5967 5968 spin_unlock_irq(&conf->device_lock); 5969 blk_finish_plug(&plug); 5970 5971 pr_debug("--- raid5worker inactive\n"); 5972 } 5973 5974 /* 5975 * This is our raid5 kernel thread. 5976 * 5977 * We scan the hash table for stripes which can be handled now. 5978 * During the scan, completed stripes are saved for us by the interrupt 5979 * handler, so that they will not have to wait for our next wakeup. 5980 */ 5981 static void raid5d(struct md_thread *thread) 5982 { 5983 struct mddev *mddev = thread->mddev; 5984 struct r5conf *conf = mddev->private; 5985 int handled; 5986 struct blk_plug plug; 5987 5988 pr_debug("+++ raid5d active\n"); 5989 5990 md_check_recovery(mddev); 5991 5992 if (!bio_list_empty(&conf->return_bi) && 5993 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 5994 struct bio_list tmp = BIO_EMPTY_LIST; 5995 spin_lock_irq(&conf->device_lock); 5996 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 5997 bio_list_merge(&tmp, &conf->return_bi); 5998 bio_list_init(&conf->return_bi); 5999 } 6000 spin_unlock_irq(&conf->device_lock); 6001 return_io(&tmp); 6002 } 6003 6004 blk_start_plug(&plug); 6005 handled = 0; 6006 spin_lock_irq(&conf->device_lock); 6007 while (1) { 6008 struct bio *bio; 6009 int batch_size, released; 6010 6011 released = release_stripe_list(conf, conf->temp_inactive_list); 6012 if (released) 6013 clear_bit(R5_DID_ALLOC, &conf->cache_state); 6014 6015 if ( 6016 !list_empty(&conf->bitmap_list)) { 6017 /* Now is a good time to flush some bitmap updates */ 6018 conf->seq_flush++; 6019 spin_unlock_irq(&conf->device_lock); 6020 bitmap_unplug(mddev->bitmap); 6021 spin_lock_irq(&conf->device_lock); 6022 conf->seq_write = conf->seq_flush; 6023 activate_bit_delay(conf, conf->temp_inactive_list); 6024 } 6025 raid5_activate_delayed(conf); 6026 6027 while ((bio = remove_bio_from_retry(conf))) { 6028 int ok; 6029 spin_unlock_irq(&conf->device_lock); 6030 ok = retry_aligned_read(conf, bio); 6031 spin_lock_irq(&conf->device_lock); 6032 if (!ok) 6033 break; 6034 handled++; 6035 } 6036 6037 batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, 6038 conf->temp_inactive_list); 6039 if (!batch_size && !released) 6040 break; 6041 handled += batch_size; 6042 6043 if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { 6044 spin_unlock_irq(&conf->device_lock); 6045 md_check_recovery(mddev); 6046 spin_lock_irq(&conf->device_lock); 6047 } 6048 } 6049 pr_debug("%d stripes handled\n", handled); 6050 6051 spin_unlock_irq(&conf->device_lock); 6052 if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && 6053 mutex_trylock(&conf->cache_size_mutex)) { 6054 grow_one_stripe(conf, __GFP_NOWARN); 6055 /* Set flag even if allocation failed. 
This helps 6056 * slow down allocation requests when mem is short 6057 */ 6058 set_bit(R5_DID_ALLOC, &conf->cache_state); 6059 mutex_unlock(&conf->cache_size_mutex); 6060 } 6061 6062 r5l_flush_stripe_to_raid(conf->log); 6063 6064 async_tx_issue_pending_all(); 6065 blk_finish_plug(&plug); 6066 6067 pr_debug("--- raid5d inactive\n"); 6068 } 6069 6070 static ssize_t 6071 raid5_show_stripe_cache_size(struct mddev *mddev, char *page) 6072 { 6073 struct r5conf *conf; 6074 int ret = 0; 6075 spin_lock(&mddev->lock); 6076 conf = mddev->private; 6077 if (conf) 6078 ret = sprintf(page, "%d\n", conf->min_nr_stripes); 6079 spin_unlock(&mddev->lock); 6080 return ret; 6081 } 6082 6083 int 6084 raid5_set_cache_size(struct mddev *mddev, int size) 6085 { 6086 struct r5conf *conf = mddev->private; 6087 int err; 6088 6089 if (size <= 16 || size > 32768) 6090 return -EINVAL; 6091 6092 conf->min_nr_stripes = size; 6093 mutex_lock(&conf->cache_size_mutex); 6094 while (size < conf->max_nr_stripes && 6095 drop_one_stripe(conf)) 6096 ; 6097 mutex_unlock(&conf->cache_size_mutex); 6098 6099 6100 err = md_allow_write(mddev); 6101 if (err) 6102 return err; 6103 6104 mutex_lock(&conf->cache_size_mutex); 6105 while (size > conf->max_nr_stripes) 6106 if (!grow_one_stripe(conf, GFP_KERNEL)) 6107 break; 6108 mutex_unlock(&conf->cache_size_mutex); 6109 6110 return 0; 6111 } 6112 EXPORT_SYMBOL(raid5_set_cache_size); 6113 6114 static ssize_t 6115 raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) 6116 { 6117 struct r5conf *conf; 6118 unsigned long new; 6119 int err; 6120 6121 if (len >= PAGE_SIZE) 6122 return -EINVAL; 6123 if (kstrtoul(page, 10, &new)) 6124 return -EINVAL; 6125 err = mddev_lock(mddev); 6126 if (err) 6127 return err; 6128 conf = mddev->private; 6129 if (!conf) 6130 err = -ENODEV; 6131 else 6132 err = raid5_set_cache_size(mddev, new); 6133 mddev_unlock(mddev); 6134 6135 return err ?: len; 6136 } 6137 6138 static struct md_sysfs_entry 6139 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 6140 raid5_show_stripe_cache_size, 6141 raid5_store_stripe_cache_size); 6142 6143 static ssize_t 6144 raid5_show_rmw_level(struct mddev *mddev, char *page) 6145 { 6146 struct r5conf *conf = mddev->private; 6147 if (conf) 6148 return sprintf(page, "%d\n", conf->rmw_level); 6149 else 6150 return 0; 6151 } 6152 6153 static ssize_t 6154 raid5_store_rmw_level(struct mddev *mddev, const char *page, size_t len) 6155 { 6156 struct r5conf *conf = mddev->private; 6157 unsigned long new; 6158 6159 if (!conf) 6160 return -ENODEV; 6161 6162 if (len >= PAGE_SIZE) 6163 return -EINVAL; 6164 6165 if (kstrtoul(page, 10, &new)) 6166 return -EINVAL; 6167 6168 if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome) 6169 return -EINVAL; 6170 6171 if (new != PARITY_DISABLE_RMW && 6172 new != PARITY_ENABLE_RMW && 6173 new != PARITY_PREFER_RMW) 6174 return -EINVAL; 6175 6176 conf->rmw_level = new; 6177 return len; 6178 } 6179 6180 static struct md_sysfs_entry 6181 raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, 6182 raid5_show_rmw_level, 6183 raid5_store_rmw_level); 6184 6185 6186 static ssize_t 6187 raid5_show_preread_threshold(struct mddev *mddev, char *page) 6188 { 6189 struct r5conf *conf; 6190 int ret = 0; 6191 spin_lock(&mddev->lock); 6192 conf = mddev->private; 6193 if (conf) 6194 ret = sprintf(page, "%d\n", conf->bypass_threshold); 6195 spin_unlock(&mddev->lock); 6196 return ret; 6197 } 6198 6199 static ssize_t 6200 raid5_store_preread_threshold(struct mddev *mddev, const char *page, 
size_t len) 6201 { 6202 struct r5conf *conf; 6203 unsigned long new; 6204 int err; 6205 6206 if (len >= PAGE_SIZE) 6207 return -EINVAL; 6208 if (kstrtoul(page, 10, &new)) 6209 return -EINVAL; 6210 6211 err = mddev_lock(mddev); 6212 if (err) 6213 return err; 6214 conf = mddev->private; 6215 if (!conf) 6216 err = -ENODEV; 6217 else if (new > conf->min_nr_stripes) 6218 err = -EINVAL; 6219 else 6220 conf->bypass_threshold = new; 6221 mddev_unlock(mddev); 6222 return err ?: len; 6223 } 6224 6225 static struct md_sysfs_entry 6226 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 6227 S_IRUGO | S_IWUSR, 6228 raid5_show_preread_threshold, 6229 raid5_store_preread_threshold); 6230 6231 static ssize_t 6232 raid5_show_skip_copy(struct mddev *mddev, char *page) 6233 { 6234 struct r5conf *conf; 6235 int ret = 0; 6236 spin_lock(&mddev->lock); 6237 conf = mddev->private; 6238 if (conf) 6239 ret = sprintf(page, "%d\n", conf->skip_copy); 6240 spin_unlock(&mddev->lock); 6241 return ret; 6242 } 6243 6244 static ssize_t 6245 raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 6246 { 6247 struct r5conf *conf; 6248 unsigned long new; 6249 int err; 6250 6251 if (len >= PAGE_SIZE) 6252 return -EINVAL; 6253 if (kstrtoul(page, 10, &new)) 6254 return -EINVAL; 6255 new = !!new; 6256 6257 err = mddev_lock(mddev); 6258 if (err) 6259 return err; 6260 conf = mddev->private; 6261 if (!conf) 6262 err = -ENODEV; 6263 else if (new != conf->skip_copy) { 6264 mddev_suspend(mddev); 6265 conf->skip_copy = new; 6266 if (new) 6267 mddev->queue->backing_dev_info.capabilities |= 6268 BDI_CAP_STABLE_WRITES; 6269 else 6270 mddev->queue->backing_dev_info.capabilities &= 6271 ~BDI_CAP_STABLE_WRITES; 6272 mddev_resume(mddev); 6273 } 6274 mddev_unlock(mddev); 6275 return err ?: len; 6276 } 6277 6278 static struct md_sysfs_entry 6279 raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 6280 raid5_show_skip_copy, 6281 raid5_store_skip_copy); 6282 6283 static ssize_t 6284 stripe_cache_active_show(struct mddev *mddev, char *page) 6285 { 6286 struct r5conf *conf = mddev->private; 6287 if (conf) 6288 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 6289 else 6290 return 0; 6291 } 6292 6293 static struct md_sysfs_entry 6294 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 6295 6296 static ssize_t 6297 raid5_show_group_thread_cnt(struct mddev *mddev, char *page) 6298 { 6299 struct r5conf *conf; 6300 int ret = 0; 6301 spin_lock(&mddev->lock); 6302 conf = mddev->private; 6303 if (conf) 6304 ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); 6305 spin_unlock(&mddev->lock); 6306 return ret; 6307 } 6308 6309 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6310 int *group_cnt, 6311 int *worker_cnt_per_group, 6312 struct r5worker_group **worker_groups); 6313 static ssize_t 6314 raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) 6315 { 6316 struct r5conf *conf; 6317 unsigned long new; 6318 int err; 6319 struct r5worker_group *new_groups, *old_groups; 6320 int group_cnt, worker_cnt_per_group; 6321 6322 if (len >= PAGE_SIZE) 6323 return -EINVAL; 6324 if (kstrtoul(page, 10, &new)) 6325 return -EINVAL; 6326 6327 err = mddev_lock(mddev); 6328 if (err) 6329 return err; 6330 conf = mddev->private; 6331 if (!conf) 6332 err = -ENODEV; 6333 else if (new != conf->worker_cnt_per_group) { 6334 mddev_suspend(mddev); 6335 6336 old_groups = conf->worker_groups; 6337 if (old_groups) 6338 flush_workqueue(raid5_wq); 6339 6340 err = alloc_thread_groups(conf, 
new, 6341 &group_cnt, &worker_cnt_per_group, 6342 &new_groups); 6343 if (!err) { 6344 spin_lock_irq(&conf->device_lock); 6345 conf->group_cnt = group_cnt; 6346 conf->worker_cnt_per_group = worker_cnt_per_group; 6347 conf->worker_groups = new_groups; 6348 spin_unlock_irq(&conf->device_lock); 6349 6350 if (old_groups) 6351 kfree(old_groups[0].workers); 6352 kfree(old_groups); 6353 } 6354 mddev_resume(mddev); 6355 } 6356 mddev_unlock(mddev); 6357 6358 return err ?: len; 6359 } 6360 6361 static struct md_sysfs_entry 6362 raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, 6363 raid5_show_group_thread_cnt, 6364 raid5_store_group_thread_cnt); 6365 6366 static struct attribute *raid5_attrs[] = { 6367 &raid5_stripecache_size.attr, 6368 &raid5_stripecache_active.attr, 6369 &raid5_preread_bypass_threshold.attr, 6370 &raid5_group_thread_cnt.attr, 6371 &raid5_skip_copy.attr, 6372 &raid5_rmw_level.attr, 6373 &r5c_journal_mode.attr, 6374 NULL, 6375 }; 6376 static struct attribute_group raid5_attrs_group = { 6377 .name = NULL, 6378 .attrs = raid5_attrs, 6379 }; 6380 6381 static int alloc_thread_groups(struct r5conf *conf, int cnt, 6382 int *group_cnt, 6383 int *worker_cnt_per_group, 6384 struct r5worker_group **worker_groups) 6385 { 6386 int i, j, k; 6387 ssize_t size; 6388 struct r5worker *workers; 6389 6390 *worker_cnt_per_group = cnt; 6391 if (cnt == 0) { 6392 *group_cnt = 0; 6393 *worker_groups = NULL; 6394 return 0; 6395 } 6396 *group_cnt = num_possible_nodes(); 6397 size = sizeof(struct r5worker) * cnt; 6398 workers = kzalloc(size * *group_cnt, GFP_NOIO); 6399 *worker_groups = kzalloc(sizeof(struct r5worker_group) * 6400 *group_cnt, GFP_NOIO); 6401 if (!*worker_groups || !workers) { 6402 kfree(workers); 6403 kfree(*worker_groups); 6404 return -ENOMEM; 6405 } 6406 6407 for (i = 0; i < *group_cnt; i++) { 6408 struct r5worker_group *group; 6409 6410 group = &(*worker_groups)[i]; 6411 INIT_LIST_HEAD(&group->handle_list); 6412 group->conf = conf; 6413 group->workers = workers + i * cnt; 6414 6415 for (j = 0; j < cnt; j++) { 6416 struct r5worker *worker = group->workers + j; 6417 worker->group = group; 6418 INIT_WORK(&worker->work, raid5_do_work); 6419 6420 for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) 6421 INIT_LIST_HEAD(worker->temp_inactive_list + k); 6422 } 6423 } 6424 6425 return 0; 6426 } 6427 6428 static void free_thread_groups(struct r5conf *conf) 6429 { 6430 if (conf->worker_groups) 6431 kfree(conf->worker_groups[0].workers); 6432 kfree(conf->worker_groups); 6433 conf->worker_groups = NULL; 6434 } 6435 6436 static sector_t 6437 raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) 6438 { 6439 struct r5conf *conf = mddev->private; 6440 6441 if (!sectors) 6442 sectors = mddev->dev_sectors; 6443 if (!raid_disks) 6444 /* size is defined by the smallest of previous and new size */ 6445 raid_disks = min(conf->raid_disks, conf->previous_raid_disks); 6446 6447 sectors &= ~((sector_t)conf->chunk_sectors - 1); 6448 sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); 6449 return sectors * (raid_disks - conf->max_degraded); 6450 } 6451 6452 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6453 { 6454 safe_put_page(percpu->spare_page); 6455 if (percpu->scribble) 6456 flex_array_free(percpu->scribble); 6457 percpu->spare_page = NULL; 6458 percpu->scribble = NULL; 6459 } 6460 6461 static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) 6462 { 6463 if (conf->level == 6 && !percpu->spare_page) 6464 percpu->spare_page = 
alloc_page(GFP_KERNEL); 6465 if (!percpu->scribble) 6466 percpu->scribble = scribble_alloc(max(conf->raid_disks, 6467 conf->previous_raid_disks), 6468 max(conf->chunk_sectors, 6469 conf->prev_chunk_sectors) 6470 / STRIPE_SECTORS, 6471 GFP_KERNEL); 6472 6473 if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { 6474 free_scratch_buffer(conf, percpu); 6475 return -ENOMEM; 6476 } 6477 6478 return 0; 6479 } 6480 6481 static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) 6482 { 6483 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6484 6485 free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); 6486 return 0; 6487 } 6488 6489 static void raid5_free_percpu(struct r5conf *conf) 6490 { 6491 if (!conf->percpu) 6492 return; 6493 6494 cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6495 free_percpu(conf->percpu); 6496 } 6497 6498 static void free_conf(struct r5conf *conf) 6499 { 6500 int i; 6501 6502 if (conf->log) 6503 r5l_exit_log(conf->log); 6504 if (conf->shrinker.nr_deferred) 6505 unregister_shrinker(&conf->shrinker); 6506 6507 free_thread_groups(conf); 6508 shrink_stripes(conf); 6509 raid5_free_percpu(conf); 6510 for (i = 0; i < conf->pool_size; i++) 6511 if (conf->disks[i].extra_page) 6512 put_page(conf->disks[i].extra_page); 6513 kfree(conf->disks); 6514 kfree(conf->stripe_hashtbl); 6515 kfree(conf); 6516 } 6517 6518 static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) 6519 { 6520 struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); 6521 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 6522 6523 if (alloc_scratch_buffer(conf, percpu)) { 6524 pr_warn("%s: failed memory allocation for cpu%u\n", 6525 __func__, cpu); 6526 return -ENOMEM; 6527 } 6528 return 0; 6529 } 6530 6531 static int raid5_alloc_percpu(struct r5conf *conf) 6532 { 6533 int err = 0; 6534 6535 conf->percpu = alloc_percpu(struct raid5_percpu); 6536 if (!conf->percpu) 6537 return -ENOMEM; 6538 6539 err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); 6540 if (!err) { 6541 conf->scribble_disks = max(conf->raid_disks, 6542 conf->previous_raid_disks); 6543 conf->scribble_sectors = max(conf->chunk_sectors, 6544 conf->prev_chunk_sectors); 6545 } 6546 return err; 6547 } 6548 6549 static unsigned long raid5_cache_scan(struct shrinker *shrink, 6550 struct shrink_control *sc) 6551 { 6552 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6553 unsigned long ret = SHRINK_STOP; 6554 6555 if (mutex_trylock(&conf->cache_size_mutex)) { 6556 ret= 0; 6557 while (ret < sc->nr_to_scan && 6558 conf->max_nr_stripes > conf->min_nr_stripes) { 6559 if (drop_one_stripe(conf) == 0) { 6560 ret = SHRINK_STOP; 6561 break; 6562 } 6563 ret++; 6564 } 6565 mutex_unlock(&conf->cache_size_mutex); 6566 } 6567 return ret; 6568 } 6569 6570 static unsigned long raid5_cache_count(struct shrinker *shrink, 6571 struct shrink_control *sc) 6572 { 6573 struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); 6574 6575 if (conf->max_nr_stripes < conf->min_nr_stripes) 6576 /* unlikely, but not impossible */ 6577 return 0; 6578 return conf->max_nr_stripes - conf->min_nr_stripes; 6579 } 6580 6581 static struct r5conf *setup_conf(struct mddev *mddev) 6582 { 6583 struct r5conf *conf; 6584 int raid_disk, memory, max_disks; 6585 struct md_rdev *rdev; 6586 struct disk_info *disk; 6587 char pers_name[6]; 6588 int i; 6589 int group_cnt, worker_cnt_per_group; 6590 struct r5worker_group *new_group; 6591 6592 if 
(mddev->new_level != 5 6593 && mddev->new_level != 4 6594 && mddev->new_level != 6) { 6595 pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", 6596 mdname(mddev), mddev->new_level); 6597 return ERR_PTR(-EIO); 6598 } 6599 if ((mddev->new_level == 5 6600 && !algorithm_valid_raid5(mddev->new_layout)) || 6601 (mddev->new_level == 6 6602 && !algorithm_valid_raid6(mddev->new_layout))) { 6603 pr_warn("md/raid:%s: layout %d not supported\n", 6604 mdname(mddev), mddev->new_layout); 6605 return ERR_PTR(-EIO); 6606 } 6607 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 6608 pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", 6609 mdname(mddev), mddev->raid_disks); 6610 return ERR_PTR(-EINVAL); 6611 } 6612 6613 if (!mddev->new_chunk_sectors || 6614 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 6615 !is_power_of_2(mddev->new_chunk_sectors)) { 6616 pr_warn("md/raid:%s: invalid chunk size %d\n", 6617 mdname(mddev), mddev->new_chunk_sectors << 9); 6618 return ERR_PTR(-EINVAL); 6619 } 6620 6621 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); 6622 if (conf == NULL) 6623 goto abort; 6624 /* Don't enable multi-threading by default*/ 6625 if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, 6626 &new_group)) { 6627 conf->group_cnt = group_cnt; 6628 conf->worker_cnt_per_group = worker_cnt_per_group; 6629 conf->worker_groups = new_group; 6630 } else 6631 goto abort; 6632 spin_lock_init(&conf->device_lock); 6633 seqcount_init(&conf->gen_lock); 6634 mutex_init(&conf->cache_size_mutex); 6635 init_waitqueue_head(&conf->wait_for_quiescent); 6636 init_waitqueue_head(&conf->wait_for_stripe); 6637 init_waitqueue_head(&conf->wait_for_overlap); 6638 INIT_LIST_HEAD(&conf->handle_list); 6639 INIT_LIST_HEAD(&conf->hold_list); 6640 INIT_LIST_HEAD(&conf->delayed_list); 6641 INIT_LIST_HEAD(&conf->bitmap_list); 6642 bio_list_init(&conf->return_bi); 6643 init_llist_head(&conf->released_stripes); 6644 atomic_set(&conf->active_stripes, 0); 6645 atomic_set(&conf->preread_active_stripes, 0); 6646 atomic_set(&conf->active_aligned_reads, 0); 6647 conf->bypass_threshold = BYPASS_THRESHOLD; 6648 conf->recovery_disabled = mddev->recovery_disabled - 1; 6649 6650 conf->raid_disks = mddev->raid_disks; 6651 if (mddev->reshape_position == MaxSector) 6652 conf->previous_raid_disks = mddev->raid_disks; 6653 else 6654 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 6655 max_disks = max(conf->raid_disks, conf->previous_raid_disks); 6656 6657 conf->disks = kzalloc(max_disks * sizeof(struct disk_info), 6658 GFP_KERNEL); 6659 6660 if (!conf->disks) 6661 goto abort; 6662 6663 for (i = 0; i < max_disks; i++) { 6664 conf->disks[i].extra_page = alloc_page(GFP_KERNEL); 6665 if (!conf->disks[i].extra_page) 6666 goto abort; 6667 } 6668 6669 conf->mddev = mddev; 6670 6671 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 6672 goto abort; 6673 6674 /* We init hash_locks[0] separately to that it can be used 6675 * as the reference lock in the spin_lock_nest_lock() call 6676 * in lock_all_device_hash_locks_irq in order to convince 6677 * lockdep that we know what we are doing. 
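 *
 * Without the nest annotation lockdep would see NR_STRIPE_HASH_LOCKS
 * locks of the same class being taken recursively and report a false
 * positive; nesting every other hash lock against hash_locks[0] tells
 * it they are always taken in one fixed order.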
6678 */ 6679 spin_lock_init(conf->hash_locks); 6680 for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) 6681 spin_lock_init(conf->hash_locks + i); 6682 6683 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6684 INIT_LIST_HEAD(conf->inactive_list + i); 6685 6686 for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) 6687 INIT_LIST_HEAD(conf->temp_inactive_list + i); 6688 6689 atomic_set(&conf->r5c_cached_full_stripes, 0); 6690 INIT_LIST_HEAD(&conf->r5c_full_stripe_list); 6691 atomic_set(&conf->r5c_cached_partial_stripes, 0); 6692 INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); 6693 6694 conf->level = mddev->new_level; 6695 conf->chunk_sectors = mddev->new_chunk_sectors; 6696 if (raid5_alloc_percpu(conf) != 0) 6697 goto abort; 6698 6699 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 6700 6701 rdev_for_each(rdev, mddev) { 6702 raid_disk = rdev->raid_disk; 6703 if (raid_disk >= max_disks 6704 || raid_disk < 0 || test_bit(Journal, &rdev->flags)) 6705 continue; 6706 disk = conf->disks + raid_disk; 6707 6708 if (test_bit(Replacement, &rdev->flags)) { 6709 if (disk->replacement) 6710 goto abort; 6711 disk->replacement = rdev; 6712 } else { 6713 if (disk->rdev) 6714 goto abort; 6715 disk->rdev = rdev; 6716 } 6717 6718 if (test_bit(In_sync, &rdev->flags)) { 6719 char b[BDEVNAME_SIZE]; 6720 pr_info("md/raid:%s: device %s operational as raid disk %d\n", 6721 mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 6722 } else if (rdev->saved_raid_disk != raid_disk) 6723 /* Cannot rely on bitmap to complete recovery */ 6724 conf->fullsync = 1; 6725 } 6726 6727 conf->level = mddev->new_level; 6728 if (conf->level == 6) { 6729 conf->max_degraded = 2; 6730 if (raid6_call.xor_syndrome) 6731 conf->rmw_level = PARITY_ENABLE_RMW; 6732 else 6733 conf->rmw_level = PARITY_DISABLE_RMW; 6734 } else { 6735 conf->max_degraded = 1; 6736 conf->rmw_level = PARITY_ENABLE_RMW; 6737 } 6738 conf->algorithm = mddev->new_layout; 6739 conf->reshape_progress = mddev->reshape_position; 6740 if (conf->reshape_progress != MaxSector) { 6741 conf->prev_chunk_sectors = mddev->chunk_sectors; 6742 conf->prev_algo = mddev->layout; 6743 } else { 6744 conf->prev_chunk_sectors = conf->chunk_sectors; 6745 conf->prev_algo = conf->algorithm; 6746 } 6747 6748 conf->min_nr_stripes = NR_STRIPES; 6749 if (mddev->reshape_position != MaxSector) { 6750 int stripes = max_t(int, 6751 ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, 6752 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); 6753 conf->min_nr_stripes = max(NR_STRIPES, stripes); 6754 if (conf->min_nr_stripes != NR_STRIPES) 6755 pr_info("md/raid:%s: force stripe size %d for reshape\n", 6756 mdname(mddev), conf->min_nr_stripes); 6757 } 6758 memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + 6759 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 6760 atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); 6761 if (grow_stripes(conf, conf->min_nr_stripes)) { 6762 pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", 6763 mdname(mddev), memory); 6764 goto abort; 6765 } else 6766 pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); 6767 /* 6768 * Losing a stripe head costs more than the time to refill it, 6769 * it reduces the queue depth and so can hurt throughput. 6770 * So set it rather large, scaled by number of devices. 
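 *
 * For instance (illustrative, assuming DEFAULT_SEEKS == 2): an 8-device
 * array below gets shrinker.seeks = 2 * 8 * 4 = 64, i.e. the VM treats a
 * cached stripe_head as roughly 32 times costlier to recreate than a
 * default-cost cache object.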
6771 */ 6772 conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; 6773 conf->shrinker.scan_objects = raid5_cache_scan; 6774 conf->shrinker.count_objects = raid5_cache_count; 6775 conf->shrinker.batch = 128; 6776 conf->shrinker.flags = 0; 6777 if (register_shrinker(&conf->shrinker)) { 6778 pr_warn("md/raid:%s: couldn't register shrinker.\n", 6779 mdname(mddev)); 6780 goto abort; 6781 } 6782 6783 sprintf(pers_name, "raid%d", mddev->new_level); 6784 conf->thread = md_register_thread(raid5d, mddev, pers_name); 6785 if (!conf->thread) { 6786 pr_warn("md/raid:%s: couldn't allocate thread.\n", 6787 mdname(mddev)); 6788 goto abort; 6789 } 6790 6791 return conf; 6792 6793 abort: 6794 if (conf) { 6795 free_conf(conf); 6796 return ERR_PTR(-EIO); 6797 } else 6798 return ERR_PTR(-ENOMEM); 6799 } 6800 6801 static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) 6802 { 6803 switch (algo) { 6804 case ALGORITHM_PARITY_0: 6805 if (raid_disk < max_degraded) 6806 return 1; 6807 break; 6808 case ALGORITHM_PARITY_N: 6809 if (raid_disk >= raid_disks - max_degraded) 6810 return 1; 6811 break; 6812 case ALGORITHM_PARITY_0_6: 6813 if (raid_disk == 0 || 6814 raid_disk == raid_disks - 1) 6815 return 1; 6816 break; 6817 case ALGORITHM_LEFT_ASYMMETRIC_6: 6818 case ALGORITHM_RIGHT_ASYMMETRIC_6: 6819 case ALGORITHM_LEFT_SYMMETRIC_6: 6820 case ALGORITHM_RIGHT_SYMMETRIC_6: 6821 if (raid_disk == raid_disks - 1) 6822 return 1; 6823 } 6824 return 0; 6825 } 6826 6827 static int raid5_run(struct mddev *mddev) 6828 { 6829 struct r5conf *conf; 6830 int working_disks = 0; 6831 int dirty_parity_disks = 0; 6832 struct md_rdev *rdev; 6833 struct md_rdev *journal_dev = NULL; 6834 sector_t reshape_offset = 0; 6835 int i; 6836 long long min_offset_diff = 0; 6837 int first = 1; 6838 6839 if (mddev->recovery_cp != MaxSector) 6840 pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", 6841 mdname(mddev)); 6842 6843 rdev_for_each(rdev, mddev) { 6844 long long diff; 6845 6846 if (test_bit(Journal, &rdev->flags)) { 6847 journal_dev = rdev; 6848 continue; 6849 } 6850 if (rdev->raid_disk < 0) 6851 continue; 6852 diff = (rdev->new_data_offset - rdev->data_offset); 6853 if (first) { 6854 min_offset_diff = diff; 6855 first = 0; 6856 } else if (mddev->reshape_backwards && 6857 diff < min_offset_diff) 6858 min_offset_diff = diff; 6859 else if (!mddev->reshape_backwards && 6860 diff > min_offset_diff) 6861 min_offset_diff = diff; 6862 } 6863 6864 if (mddev->reshape_position != MaxSector) { 6865 /* Check that we can continue the reshape. 6866 * Difficulties arise if the stripe we would write to 6867 * next is at or after the stripe we would read from next. 6868 * For a reshape that changes the number of devices, this 6869 * is only possible for a very short time, and mdadm makes 6870 * sure that time appears to have past before assembling 6871 * the array. So we fail if that time hasn't passed. 6872 * For a reshape that keeps the number of devices the same 6873 * mdadm must be monitoring the reshape can keeping the 6874 * critical areas read-only and backed up. It will start 6875 * the array in read-only mode, so we check for that. 6876 */ 6877 sector_t here_new, here_old; 6878 int old_disks; 6879 int max_degraded = (mddev->level == 6 ? 
2 : 1); 6880 int chunk_sectors; 6881 int new_data_disks; 6882 6883 if (journal_dev) { 6884 pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", 6885 mdname(mddev)); 6886 return -EINVAL; 6887 } 6888 6889 if (mddev->new_level != mddev->level) { 6890 pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", 6891 mdname(mddev)); 6892 return -EINVAL; 6893 } 6894 old_disks = mddev->raid_disks - mddev->delta_disks; 6895 /* reshape_position must be on a new-stripe boundary, and one 6896 * further up in new geometry must map after here in old 6897 * geometry. 6898 * If the chunk sizes are different, then as we perform reshape 6899 * in units of the largest of the two, reshape_position needs 6900 * be a multiple of the largest chunk size times new data disks. 6901 */ 6902 here_new = mddev->reshape_position; 6903 chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); 6904 new_data_disks = mddev->raid_disks - max_degraded; 6905 if (sector_div(here_new, chunk_sectors * new_data_disks)) { 6906 pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", 6907 mdname(mddev)); 6908 return -EINVAL; 6909 } 6910 reshape_offset = here_new * chunk_sectors; 6911 /* here_new is the stripe we will write to */ 6912 here_old = mddev->reshape_position; 6913 sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); 6914 /* here_old is the first stripe that we might need to read 6915 * from */ 6916 if (mddev->delta_disks == 0) { 6917 /* We cannot be sure it is safe to start an in-place 6918 * reshape. It is only safe if user-space is monitoring 6919 * and taking constant backups. 6920 * mdadm always starts a situation like this in 6921 * readonly mode so it can take control before 6922 * allowing any writes. So just check for that. 6923 */ 6924 if (abs(min_offset_diff) >= mddev->chunk_sectors && 6925 abs(min_offset_diff) >= mddev->new_chunk_sectors) 6926 /* not really in-place - so OK */; 6927 else if (mddev->ro == 0) { 6928 pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", 6929 mdname(mddev)); 6930 return -EINVAL; 6931 } 6932 } else if (mddev->reshape_backwards 6933 ? 
(here_new * chunk_sectors + min_offset_diff <= 6934 here_old * chunk_sectors) 6935 : (here_new * chunk_sectors >= 6936 here_old * chunk_sectors + (-min_offset_diff))) { 6937 /* Reading from the same stripe as writing to - bad */ 6938 pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", 6939 mdname(mddev)); 6940 return -EINVAL; 6941 } 6942 pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); 6943 /* OK, we should be able to continue; */ 6944 } else { 6945 BUG_ON(mddev->level != mddev->new_level); 6946 BUG_ON(mddev->layout != mddev->new_layout); 6947 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 6948 BUG_ON(mddev->delta_disks != 0); 6949 } 6950 6951 if (mddev->private == NULL) 6952 conf = setup_conf(mddev); 6953 else 6954 conf = mddev->private; 6955 6956 if (IS_ERR(conf)) 6957 return PTR_ERR(conf); 6958 6959 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 6960 if (!journal_dev) { 6961 pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", 6962 mdname(mddev)); 6963 mddev->ro = 1; 6964 set_disk_ro(mddev->gendisk, 1); 6965 } else if (mddev->recovery_cp == MaxSector) 6966 set_bit(MD_JOURNAL_CLEAN, &mddev->flags); 6967 } 6968 6969 conf->min_offset_diff = min_offset_diff; 6970 mddev->thread = conf->thread; 6971 conf->thread = NULL; 6972 mddev->private = conf; 6973 6974 for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; 6975 i++) { 6976 rdev = conf->disks[i].rdev; 6977 if (!rdev && conf->disks[i].replacement) { 6978 /* The replacement is all we have yet */ 6979 rdev = conf->disks[i].replacement; 6980 conf->disks[i].replacement = NULL; 6981 clear_bit(Replacement, &rdev->flags); 6982 conf->disks[i].rdev = rdev; 6983 } 6984 if (!rdev) 6985 continue; 6986 if (conf->disks[i].replacement && 6987 conf->reshape_progress != MaxSector) { 6988 /* replacements and reshape simply do not mix. */ 6989 pr_warn("md: cannot handle concurrent replacement and reshape.\n"); 6990 goto abort; 6991 } 6992 if (test_bit(In_sync, &rdev->flags)) { 6993 working_disks++; 6994 continue; 6995 } 6996 /* This disc is not fully in-sync. However if it 6997 * just stored parity (beyond the recovery_offset), 6998 * when we don't need to be concerned about the 6999 * array being dirty. 7000 * When reshape goes 'backwards', we never have 7001 * partially completed devices, so we only need 7002 * to worry about reshape going forwards. 7003 */ 7004 /* Hack because v0.91 doesn't store recovery_offset properly. */ 7005 if (mddev->major_version == 0 && 7006 mddev->minor_version > 90) 7007 rdev->recovery_offset = reshape_offset; 7008 7009 if (rdev->recovery_offset < reshape_offset) { 7010 /* We need to check old and new layout */ 7011 if (!only_parity(rdev->raid_disk, 7012 conf->algorithm, 7013 conf->raid_disks, 7014 conf->max_degraded)) 7015 continue; 7016 } 7017 if (!only_parity(rdev->raid_disk, 7018 conf->prev_algo, 7019 conf->previous_raid_disks, 7020 conf->max_degraded)) 7021 continue; 7022 dirty_parity_disks++; 7023 } 7024 7025 /* 7026 * 0 for a fully functional array, 1 or 2 for a degraded array. 
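 * (mddev->degraded counts member devices that are missing or not yet
 * In_sync; conf->max_degraded is 1 for raid4/5 and 2 for raid6, and the
 * has_failed() check below refuses to run an array that has lost more
 * than that.)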
7027 */ 7028 mddev->degraded = calc_degraded(conf); 7029 7030 if (has_failed(conf)) { 7031 pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", 7032 mdname(mddev), mddev->degraded, conf->raid_disks); 7033 goto abort; 7034 } 7035 7036 /* device size must be a multiple of chunk size */ 7037 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 7038 mddev->resync_max_sectors = mddev->dev_sectors; 7039 7040 if (mddev->degraded > dirty_parity_disks && 7041 mddev->recovery_cp != MaxSector) { 7042 if (mddev->ok_start_degraded) 7043 pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", 7044 mdname(mddev)); 7045 else { 7046 pr_crit("md/raid:%s: cannot start dirty degraded array.\n", 7047 mdname(mddev)); 7048 goto abort; 7049 } 7050 } 7051 7052 pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", 7053 mdname(mddev), conf->level, 7054 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 7055 mddev->new_layout); 7056 7057 print_raid5_conf(conf); 7058 7059 if (conf->reshape_progress != MaxSector) { 7060 conf->reshape_safe = conf->reshape_progress; 7061 atomic_set(&conf->reshape_stripes, 0); 7062 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7063 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7064 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7065 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7066 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 7067 "reshape"); 7068 } 7069 7070 /* Ok, everything is just fine now */ 7071 if (mddev->to_remove == &raid5_attrs_group) 7072 mddev->to_remove = NULL; 7073 else if (mddev->kobj.sd && 7074 sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 7075 pr_warn("raid5: failed to create sysfs attributes for %s\n", 7076 mdname(mddev)); 7077 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 7078 7079 if (mddev->queue) { 7080 int chunk_size; 7081 bool discard_supported = true; 7082 /* read-ahead size must cover two whole stripes, which 7083 * is 2 * (datadisks) * chunksize where 'n' is the 7084 * number of raid devices 7085 */ 7086 int data_disks = conf->previous_raid_disks - conf->max_degraded; 7087 int stripe = data_disks * 7088 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 7089 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 7090 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 7091 7092 chunk_size = mddev->chunk_sectors << 9; 7093 blk_queue_io_min(mddev->queue, chunk_size); 7094 blk_queue_io_opt(mddev->queue, chunk_size * 7095 (conf->raid_disks - conf->max_degraded)); 7096 mddev->queue->limits.raid_partial_stripes_expensive = 1; 7097 /* 7098 * We can only discard a whole stripe. 
It doesn't make sense to 7099 * discard data disk but write parity disk 7100 */ 7101 stripe = stripe * PAGE_SIZE; 7102 /* Round up to power of 2, as discard handling 7103 * currently assumes that */ 7104 while ((stripe-1) & stripe) 7105 stripe = (stripe | (stripe-1)) + 1; 7106 mddev->queue->limits.discard_alignment = stripe; 7107 mddev->queue->limits.discard_granularity = stripe; 7108 7109 /* 7110 * We use 16-bit counter of active stripes in bi_phys_segments 7111 * (minus one for over-loaded initialization) 7112 */ 7113 blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); 7114 blk_queue_max_discard_sectors(mddev->queue, 7115 0xfffe * STRIPE_SECTORS); 7116 7117 /* 7118 * unaligned part of discard request will be ignored, so can't 7119 * guarantee discard_zeroes_data 7120 */ 7121 mddev->queue->limits.discard_zeroes_data = 0; 7122 7123 blk_queue_max_write_same_sectors(mddev->queue, 0); 7124 7125 rdev_for_each(rdev, mddev) { 7126 disk_stack_limits(mddev->gendisk, rdev->bdev, 7127 rdev->data_offset << 9); 7128 disk_stack_limits(mddev->gendisk, rdev->bdev, 7129 rdev->new_data_offset << 9); 7130 /* 7131 * discard_zeroes_data is required, otherwise data 7132 * could be lost. Consider a scenario: discard a stripe 7133 * (the stripe could be inconsistent if 7134 * discard_zeroes_data is 0); write one disk of the 7135 * stripe (the stripe could be inconsistent again 7136 * depending on which disks are used to calculate 7137 * parity); the disk is broken; The stripe data of this 7138 * disk is lost. 7139 */ 7140 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || 7141 !bdev_get_queue(rdev->bdev)-> 7142 limits.discard_zeroes_data) 7143 discard_supported = false; 7144 /* Unfortunately, discard_zeroes_data is not currently 7145 * a guarantee - just a hint. So we only allow DISCARD 7146 * if the sysadmin has confirmed that only safe devices 7147 * are in use by setting a module parameter. 
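 *
 * For example (illustrative, not a recommendation): an administrator who
 * has verified the discard behaviour of every member device can boot with
 *   raid456.devices_handle_discard_safely=Y
 * (the same override the message below suggests) before assembling the
 * array; the checks further down will then leave the DISCARD queue flag
 * enabled, provided the granularity limits also pass.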
7148 */ 7149 if (!devices_handle_discard_safely) { 7150 if (discard_supported) { 7151 pr_info("md/raid456: discard support disabled due to uncertainty.\n"); 7152 pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); 7153 } 7154 discard_supported = false; 7155 } 7156 } 7157 7158 if (discard_supported && 7159 mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && 7160 mddev->queue->limits.discard_granularity >= stripe) 7161 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 7162 mddev->queue); 7163 else 7164 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 7165 mddev->queue); 7166 7167 blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); 7168 } 7169 7170 if (journal_dev) { 7171 char b[BDEVNAME_SIZE]; 7172 7173 pr_debug("md/raid:%s: using device %s as journal\n", 7174 mdname(mddev), bdevname(journal_dev->bdev, b)); 7175 if (r5l_init_log(conf, journal_dev)) 7176 goto abort; 7177 } 7178 7179 return 0; 7180 abort: 7181 md_unregister_thread(&mddev->thread); 7182 print_raid5_conf(conf); 7183 free_conf(conf); 7184 mddev->private = NULL; 7185 pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); 7186 return -EIO; 7187 } 7188 7189 static void raid5_free(struct mddev *mddev, void *priv) 7190 { 7191 struct r5conf *conf = priv; 7192 7193 free_conf(conf); 7194 mddev->to_remove = &raid5_attrs_group; 7195 } 7196 7197 static void raid5_status(struct seq_file *seq, struct mddev *mddev) 7198 { 7199 struct r5conf *conf = mddev->private; 7200 int i; 7201 7202 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 7203 conf->chunk_sectors / 2, mddev->layout); 7204 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 7205 rcu_read_lock(); 7206 for (i = 0; i < conf->raid_disks; i++) { 7207 struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); 7208 seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 7209 } 7210 rcu_read_unlock(); 7211 seq_printf (seq, "]"); 7212 } 7213 7214 static void print_raid5_conf (struct r5conf *conf) 7215 { 7216 int i; 7217 struct disk_info *tmp; 7218 7219 pr_debug("RAID conf printout:\n"); 7220 if (!conf) { 7221 pr_debug("(conf==NULL)\n"); 7222 return; 7223 } 7224 pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, 7225 conf->raid_disks, 7226 conf->raid_disks - conf->mddev->degraded); 7227 7228 for (i = 0; i < conf->raid_disks; i++) { 7229 char b[BDEVNAME_SIZE]; 7230 tmp = conf->disks + i; 7231 if (tmp->rdev) 7232 pr_debug(" disk %d, o:%d, dev:%s\n", 7233 i, !test_bit(Faulty, &tmp->rdev->flags), 7234 bdevname(tmp->rdev->bdev, b)); 7235 } 7236 } 7237 7238 static int raid5_spare_active(struct mddev *mddev) 7239 { 7240 int i; 7241 struct r5conf *conf = mddev->private; 7242 struct disk_info *tmp; 7243 int count = 0; 7244 unsigned long flags; 7245 7246 for (i = 0; i < conf->raid_disks; i++) { 7247 tmp = conf->disks + i; 7248 if (tmp->replacement 7249 && tmp->replacement->recovery_offset == MaxSector 7250 && !test_bit(Faulty, &tmp->replacement->flags) 7251 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 7252 /* Replacement has just become active. */ 7253 if (!tmp->rdev 7254 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 7255 count++; 7256 if (tmp->rdev) { 7257 /* Replaced device not technically faulty, 7258 * but we need to be sure it gets removed 7259 * and never re-added. 
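 * Marking it Faulty below is the simplest way to guarantee that: the
 * normal hot-remove path can then take it out, and the stale device can
 * never be chosen again in place of its replacement.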
7260 */ 7261 set_bit(Faulty, &tmp->rdev->flags); 7262 sysfs_notify_dirent_safe( 7263 tmp->rdev->sysfs_state); 7264 } 7265 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 7266 } else if (tmp->rdev 7267 && tmp->rdev->recovery_offset == MaxSector 7268 && !test_bit(Faulty, &tmp->rdev->flags) 7269 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 7270 count++; 7271 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 7272 } 7273 } 7274 spin_lock_irqsave(&conf->device_lock, flags); 7275 mddev->degraded = calc_degraded(conf); 7276 spin_unlock_irqrestore(&conf->device_lock, flags); 7277 print_raid5_conf(conf); 7278 return count; 7279 } 7280 7281 static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 7282 { 7283 struct r5conf *conf = mddev->private; 7284 int err = 0; 7285 int number = rdev->raid_disk; 7286 struct md_rdev **rdevp; 7287 struct disk_info *p = conf->disks + number; 7288 7289 print_raid5_conf(conf); 7290 if (test_bit(Journal, &rdev->flags) && conf->log) { 7291 struct r5l_log *log; 7292 /* 7293 * we can't wait pending write here, as this is called in 7294 * raid5d, wait will deadlock. 7295 */ 7296 if (atomic_read(&mddev->writes_pending)) 7297 return -EBUSY; 7298 log = conf->log; 7299 conf->log = NULL; 7300 synchronize_rcu(); 7301 r5l_exit_log(log); 7302 return 0; 7303 } 7304 if (rdev == p->rdev) 7305 rdevp = &p->rdev; 7306 else if (rdev == p->replacement) 7307 rdevp = &p->replacement; 7308 else 7309 return 0; 7310 7311 if (number >= conf->raid_disks && 7312 conf->reshape_progress == MaxSector) 7313 clear_bit(In_sync, &rdev->flags); 7314 7315 if (test_bit(In_sync, &rdev->flags) || 7316 atomic_read(&rdev->nr_pending)) { 7317 err = -EBUSY; 7318 goto abort; 7319 } 7320 /* Only remove non-faulty devices if recovery 7321 * isn't possible. 7322 */ 7323 if (!test_bit(Faulty, &rdev->flags) && 7324 mddev->recovery_disabled != conf->recovery_disabled && 7325 !has_failed(conf) && 7326 (!p->replacement || p->replacement == rdev) && 7327 number < conf->raid_disks) { 7328 err = -EBUSY; 7329 goto abort; 7330 } 7331 *rdevp = NULL; 7332 if (!test_bit(RemoveSynchronized, &rdev->flags)) { 7333 synchronize_rcu(); 7334 if (atomic_read(&rdev->nr_pending)) { 7335 /* lost the race, try later */ 7336 err = -EBUSY; 7337 *rdevp = rdev; 7338 } 7339 } 7340 if (p->replacement) { 7341 /* We must have just cleared 'rdev' */ 7342 p->rdev = p->replacement; 7343 clear_bit(Replacement, &p->replacement->flags); 7344 smp_mb(); /* Make sure other CPUs may see both as identical 7345 * but will never see neither - if they are careful 7346 */ 7347 p->replacement = NULL; 7348 clear_bit(WantReplacement, &rdev->flags); 7349 } else 7350 /* We might have just removed the Replacement as faulty- 7351 * clear the bit just in case 7352 */ 7353 clear_bit(WantReplacement, &rdev->flags); 7354 abort: 7355 7356 print_raid5_conf(conf); 7357 return err; 7358 } 7359 7360 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) 7361 { 7362 struct r5conf *conf = mddev->private; 7363 int err = -EEXIST; 7364 int disk; 7365 struct disk_info *p; 7366 int first = 0; 7367 int last = conf->raid_disks - 1; 7368 7369 if (test_bit(Journal, &rdev->flags)) { 7370 char b[BDEVNAME_SIZE]; 7371 if (conf->log) 7372 return -EBUSY; 7373 7374 rdev->raid_disk = 0; 7375 /* 7376 * The array is in readonly mode if journal is missing, so no 7377 * write requests running. 
We should be safe 7378 */ 7379 r5l_init_log(conf, rdev); 7380 pr_debug("md/raid:%s: using device %s as journal\n", 7381 mdname(mddev), bdevname(rdev->bdev, b)); 7382 return 0; 7383 } 7384 if (mddev->recovery_disabled == conf->recovery_disabled) 7385 return -EBUSY; 7386 7387 if (rdev->saved_raid_disk < 0 && has_failed(conf)) 7388 /* no point adding a device */ 7389 return -EINVAL; 7390 7391 if (rdev->raid_disk >= 0) 7392 first = last = rdev->raid_disk; 7393 7394 /* 7395 * find the disk ... but prefer rdev->saved_raid_disk 7396 * if possible. 7397 */ 7398 if (rdev->saved_raid_disk >= 0 && 7399 rdev->saved_raid_disk >= first && 7400 conf->disks[rdev->saved_raid_disk].rdev == NULL) 7401 first = rdev->saved_raid_disk; 7402 7403 for (disk = first; disk <= last; disk++) { 7404 p = conf->disks + disk; 7405 if (p->rdev == NULL) { 7406 clear_bit(In_sync, &rdev->flags); 7407 rdev->raid_disk = disk; 7408 err = 0; 7409 if (rdev->saved_raid_disk != disk) 7410 conf->fullsync = 1; 7411 rcu_assign_pointer(p->rdev, rdev); 7412 goto out; 7413 } 7414 } 7415 for (disk = first; disk <= last; disk++) { 7416 p = conf->disks + disk; 7417 if (test_bit(WantReplacement, &p->rdev->flags) && 7418 p->replacement == NULL) { 7419 clear_bit(In_sync, &rdev->flags); 7420 set_bit(Replacement, &rdev->flags); 7421 rdev->raid_disk = disk; 7422 err = 0; 7423 conf->fullsync = 1; 7424 rcu_assign_pointer(p->replacement, rdev); 7425 break; 7426 } 7427 } 7428 out: 7429 print_raid5_conf(conf); 7430 return err; 7431 } 7432 7433 static int raid5_resize(struct mddev *mddev, sector_t sectors) 7434 { 7435 /* no resync is happening, and there is enough space 7436 * on all devices, so we can resize. 7437 * We need to make sure resync covers any new space. 7438 * If the array is shrinking we should possibly wait until 7439 * any io in the removed space completes, but it hardly seems 7440 * worth it. 7441 */ 7442 sector_t newsize; 7443 struct r5conf *conf = mddev->private; 7444 7445 if (conf->log) 7446 return -EINVAL; 7447 sectors &= ~((sector_t)conf->chunk_sectors - 1); 7448 newsize = raid5_size(mddev, sectors, mddev->raid_disks); 7449 if (mddev->external_size && 7450 mddev->array_sectors > newsize) 7451 return -EINVAL; 7452 if (mddev->bitmap) { 7453 int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); 7454 if (ret) 7455 return ret; 7456 } 7457 md_set_array_sectors(mddev, newsize); 7458 set_capacity(mddev->gendisk, mddev->array_sectors); 7459 revalidate_disk(mddev->gendisk); 7460 if (sectors > mddev->dev_sectors && 7461 mddev->recovery_cp > mddev->dev_sectors) { 7462 mddev->recovery_cp = mddev->dev_sectors; 7463 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7464 } 7465 mddev->dev_sectors = sectors; 7466 mddev->resync_max_sectors = sectors; 7467 return 0; 7468 } 7469 7470 static int check_stripe_cache(struct mddev *mddev) 7471 { 7472 /* Can only proceed if there are plenty of stripe_heads. 7473 * We need a minimum of one full stripe,, and for sensible progress 7474 * it is best to have about 4 times that. 7475 * If we require 4 times, then the default 256 4K stripe_heads will 7476 * allow for chunk sizes up to 256K, which is probably OK. 7477 * If the chunk size is greater, user-space should request more 7478 * stripe_heads first. 7479 */ 7480 struct r5conf *conf = mddev->private; 7481 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 7482 > conf->min_nr_stripes || 7483 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 7484 > conf->min_nr_stripes) { 7485 pr_warn("md/raid:%s: reshape: not enough stripes. 

static int check_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (conf->log)
		return -EINVAL;
	if (mddev->delta_disks == 0 &&
	    mddev->new_layout == mddev->layout &&
	    mddev->new_chunk_sectors == mddev->chunk_sectors)
		return 0; /* nothing to do */
	if (has_failed(conf))
		return -EINVAL;
	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
		/* We might be able to shrink, but the devices must
		 * be made bigger first.
		 * For raid6, 4 is the minimum size.
		 * Otherwise 2 is the minimum.
		 */
		int min = 2;
		if (mddev->level == 6)
			min = 4;
		if (mddev->raid_disks + mddev->delta_disks < min)
			return -EINVAL;
	}

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
	    mddev->delta_disks > 0)
		if (resize_chunks(conf,
				  conf->previous_raid_disks
				  + max(0, mddev->delta_disks),
				  max(mddev->new_chunk_sectors,
				      mddev->chunk_sectors)
				  ) < 0)
			return -ENOMEM;
	return resize_stripes(conf, (conf->previous_raid_disks
				     + mddev->delta_disks));
}

static int raid5_start_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	unsigned long flags;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (!check_stripe_cache(mddev))
		return -ENOSPC;

	if (has_failed(conf))
		return -EINVAL;

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
	}

	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
		/* Not enough devices even to make a degraded array
		 * of that size
		 */
		return -EINVAL;

	/* Refuse to reduce size of the array.  Any reductions in
	 * array size must be through explicit setting of array_size
	 * attribute.
	 */
	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
	    < mddev->array_sectors) {
		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
			mdname(mddev));
		return -EINVAL;
	}

	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	write_seqcount_begin(&conf->gen_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
	conf->prev_chunk_sectors = conf->chunk_sectors;
	conf->chunk_sectors = mddev->new_chunk_sectors;
	conf->prev_algo = conf->algorithm;
	conf->algorithm = mddev->new_layout;
	conf->generation++;
	/* Code that selects data_offset needs to see the generation update
	 * if reshape_progress has been set - so a memory barrier is needed.
	 */
	smp_mb();
	if (mddev->reshape_backwards)
		conf->reshape_progress = raid5_size(mddev, 0, 0);
	else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	write_seqcount_end(&conf->gen_lock);
	spin_unlock_irq(&conf->device_lock);
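
	/*
	 * The new geometry is published at this point.  Readers such as
	 * the make_request path sample conf->gen_lock with
	 * read_seqcount_begin() and redo their sector mapping via
	 * read_seqcount_retry() if they raced with the update above.
	 */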

	/* Now make sure any requests that proceeded on the assumption
	 * the reshape wasn't running - like Discard or Read - have
	 * completed.
	 */
	mddev_suspend(mddev);
	mddev_resume(mddev);

	/* Add some new drives, as many as will fit.
	 * We know there are enough to make the newly sized array work.
	 * Don't add devices if we are reducing the number of
	 * devices in the array.  This is because it is not possible
	 * to correctly record the "partially reconstructed" state of
	 * such devices during the reshape and confusion could result.
	 */
	if (mddev->delta_disks >= 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid5_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk
					    >= conf->previous_raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					if (sysfs_link_rdev(mddev, rdev))
						/* Failure here is OK */;
				}
			} else if (rdev->raid_disk >= conf->previous_raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}

		/* When a reshape changes the number of devices,
		 * ->degraded is measured against the larger of the
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
						"reshape");
	if (!mddev->sync_thread) {
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		write_seqcount_begin(&conf->gen_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
		mddev->new_chunk_sectors =
			conf->chunk_sectors = conf->prev_chunk_sectors;
		mddev->new_layout = conf->algorithm = conf->prev_algo;
		rdev_for_each(rdev, mddev)
			rdev->new_data_offset = rdev->data_offset;
		smp_wmb();
		conf->generation--;
		conf->reshape_progress = MaxSector;
		mddev->reshape_position = MaxSector;
		write_seqcount_end(&conf->gen_lock);
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}

/* This is called from the reshape thread and should make any
 * changes needed in 'conf'
 */
static void end_reshape(struct r5conf *conf)
{

	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
		struct md_rdev *rdev;

		spin_lock_irq(&conf->device_lock);
		conf->previous_raid_disks = conf->raid_disks;
		rdev_for_each(rdev, conf->mddev)
			rdev->data_offset = rdev->new_data_offset;
		smp_wmb();
		conf->reshape_progress = MaxSector;
		conf->mddev->reshape_position = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (datadisks) * chunksize, where datadisks is the
		 * number of data devices, i.e. raid_disks - max_degraded.
		 */
		if (conf->mddev->queue) {
			int data_disks = conf->raid_disks - conf->max_degraded;
			int stripe = data_disks * ((conf->chunk_sectors << 9)
						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}
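
/*
 * Read-ahead example for end_reshape(), assuming 4K pages: a 6-device
 * raid6 (4 data disks) with a 512K chunk gives
 * stripe = 4 * (512K / 4K) = 512 pages, so ra_pages is raised to at
 * least 1024 pages (4 MiB) to cover two full stripes.
 */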

/* This is called from the raid5d thread with mddev_lock held.
 * It makes config changes to the device.
 */
static void raid5_finish_reshape(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {

		if (mddev->delta_disks > 0) {
			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
			if (mddev->queue) {
				set_capacity(mddev->gendisk, mddev->array_sectors);
				revalidate_disk(mddev->gendisk);
			}
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
			mddev->degraded = calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks;
			     d < conf->raid_disks - mddev->delta_disks;
			     d++) {
				struct md_rdev *rdev = conf->disks[d].rdev;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
				rdev = conf->disks[d].replacement;
				if (rdev)
					clear_bit(In_sync, &rdev->flags);
			}
		}
		mddev->layout = conf->algorithm;
		mddev->chunk_sectors = conf->chunk_sectors;
		mddev->reshape_position = MaxSector;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
}

static void raid5_quiesce(struct mddev *mddev, int state)
{
	struct r5conf *conf = mddev->private;

	switch(state) {
	case 2: /* resume for a suspend */
		wake_up(&conf->wait_for_overlap);
		break;

	case 1: /* stop all writes */
		lock_all_device_hash_locks_irq(conf);
		/* '2' tells resync/reshape to pause so that all
		 * active stripes can drain
		 */
		r5c_flush_cache(conf, INT_MAX);
		conf->quiesce = 2;
		wait_event_cmd(conf->wait_for_quiescent,
			       atomic_read(&conf->active_stripes) == 0 &&
			       atomic_read(&conf->active_aligned_reads) == 0,
			       unlock_all_device_hash_locks_irq(conf),
			       lock_all_device_hash_locks_irq(conf));
		conf->quiesce = 1;
		unlock_all_device_hash_locks_irq(conf);
		/* allow reshape to continue */
		wake_up(&conf->wait_for_overlap);
		break;

	case 0: /* re-enable writes */
		lock_all_device_hash_locks_irq(conf);
		conf->quiesce = 0;
		wake_up(&conf->wait_for_quiescent);
		wake_up(&conf->wait_for_overlap);
		unlock_all_device_hash_locks_irq(conf);
		break;
	}
	r5l_quiesce(conf->log, state);
}

static void *raid45_takeover_raid0(struct mddev *mddev, int level)
{
	struct r0conf *raid0_conf = mddev->private;
	sector_t sectors;

	/* for raid0 takeover only one zone is supported */
	if (raid0_conf->nr_strip_zones > 1) {
		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}

	sectors = raid0_conf->strip_zone[0].zone_end;
	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
	mddev->dev_sectors = sectors;
	mddev->new_level = level;
	mddev->new_layout = ALGORITHM_PARITY_N;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->raid_disks += 1;
	mddev->delta_disks = 1;
	/* make sure it will not be marked as dirty */
	mddev->recovery_cp = MaxSector;

	return setup_conf(mddev);
}
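
/*
 * The raid0 takeover above produces a degraded raid4/5: one slot is added
 * for parity (raid_disks += 1, delta_disks = 1) with no device in it, and
 * recovery_cp is set to MaxSector so the array is not treated as dirty.
 * For example, a single-zone 4-disk raid0 would come back as a 5-device
 * raid5 with the parity device missing after something like
 * "mdadm --grow /dev/md0 --level=5" (illustrative command and name).
 */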

static void *raid5_takeover_raid1(struct mddev *mddev)
{
	int chunksect;
	void *ret;

	if (mddev->raid_disks != 2 ||
	    mddev->degraded > 1)
		return ERR_PTR(-EINVAL);

	/* Should check if there are write-behind devices? */

	chunksect = 64*2; /* 64K by default */

	/* The array must be an exact multiple of chunksize */
	while (chunksect && (mddev->array_sectors & (chunksect-1)))
		chunksect >>= 1;

	if ((chunksect<<9) < STRIPE_SIZE)
		/* array size does not allow a suitable chunk size */
		return ERR_PTR(-EINVAL);

	mddev->new_level = 5;
	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
	mddev->new_chunk_sectors = chunksect;

	ret = setup_conf(mddev);
	if (!IS_ERR(ret))
		mddev_clear_unsupported_flags(mddev,
					      UNSUPPORTED_MDDEV_FLAGS);
	return ret;
}

static void *raid5_takeover_raid6(struct mddev *mddev)
{
	int new_layout;

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
		break;
	case ALGORITHM_LEFT_SYMMETRIC_6:
		new_layout = ALGORITHM_LEFT_SYMMETRIC;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC_6:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
		break;
	case ALGORITHM_PARITY_0_6:
		new_layout = ALGORITHM_PARITY_0;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 5;
	mddev->new_layout = new_layout;
	mddev->delta_disks = -1;
	mddev->raid_disks -= 1;
	return setup_conf(mddev);
}
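
/*
 * The *_6 layouts handled above are the raid5 layouts with a Q block
 * appended on the last device, so dropping that device (delta_disks = -1)
 * leaves a valid raid5 with the corresponding layout.  For example, a
 * raid6 in ALGORITHM_LEFT_SYMMETRIC_6 (as created by raid6_takeover()
 * below) can be converted back with something like
 * "mdadm --grow /dev/md0 --level=5" (illustrative command and name).
 */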

static int raid5_check_reshape(struct mddev *mddev)
{
	/* For a 2-drive array, the layout and chunk size can be changed
	 * immediately as no restriping is needed.
	 * For larger arrays we record the new value - after validation
	 * to be used by a reshape pass.
	 */
	struct r5conf *conf = mddev->private;
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE>>9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */

	if (mddev->raid_disks == 2) {
		/* can make the change immediately */
		if (mddev->new_layout >= 0) {
			conf->algorithm = mddev->new_layout;
			mddev->layout = mddev->new_layout;
		}
		if (new_chunk > 0) {
			conf->chunk_sectors = new_chunk;
			mddev->chunk_sectors = new_chunk;
		}
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
	}
	return check_reshape(mddev);
}

static int raid6_check_reshape(struct mddev *mddev)
{
	int new_chunk = mddev->new_chunk_sectors;

	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
		return -EINVAL;
	if (new_chunk > 0) {
		if (!is_power_of_2(new_chunk))
			return -EINVAL;
		if (new_chunk < (PAGE_SIZE >> 9))
			return -EINVAL;
		if (mddev->array_sectors & (new_chunk-1))
			/* not factor of array size */
			return -EINVAL;
	}

	/* They look valid */
	return check_reshape(mddev);
}

static void *raid5_takeover(struct mddev *mddev)
{
	/* raid5 can take over:
	 *  raid0 - if there is only one strip zone - make it a raid4 layout
	 *  raid1 - if there are two drives.  We need to know the chunk size
	 *  raid4 - trivial - just use a raid4 layout.
	 *  raid6 - Providing it is a *_6 layout
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 5);
	if (mddev->level == 1)
		return raid5_takeover_raid1(mddev);
	if (mddev->level == 4) {
		mddev->new_layout = ALGORITHM_PARITY_N;
		mddev->new_level = 5;
		return setup_conf(mddev);
	}
	if (mddev->level == 6)
		return raid5_takeover_raid6(mddev);

	return ERR_PTR(-EINVAL);
}

static void *raid4_takeover(struct mddev *mddev)
{
	/* raid4 can take over:
	 *  raid0 - if there is only one strip zone
	 *  raid5 - if layout is right
	 */
	if (mddev->level == 0)
		return raid45_takeover_raid0(mddev, 4);
	if (mddev->level == 5 &&
	    mddev->layout == ALGORITHM_PARITY_N) {
		mddev->new_layout = 0;
		mddev->new_level = 4;
		return setup_conf(mddev);
	}
	return ERR_PTR(-EINVAL);
}
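
/*
 * Takeover summary for the helpers above: raid0 (single zone) can become
 * raid4 or raid5, raid1 (two drives) can become raid5, raid4 and raid5
 * with ALGORITHM_PARITY_N convert into each other trivially, a raid6
 * using one of the *_6 layouts can drop back to raid5, and
 * raid6_takeover() below goes the other way by appending a Q device to
 * a raid5.
 */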

static struct md_personality raid5_personality;

static void *raid6_takeover(struct mddev *mddev)
{
	/* Currently can only take over a raid5.  We map the
	 * personality to an equivalent raid6 personality
	 * with the Q block at the end.
	 */
	int new_layout;

	if (mddev->pers != &raid5_personality)
		return ERR_PTR(-EINVAL);
	if (mddev->degraded > 1)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks > 253)
		return ERR_PTR(-EINVAL);
	if (mddev->raid_disks < 3)
		return ERR_PTR(-EINVAL);

	switch (mddev->layout) {
	case ALGORITHM_LEFT_ASYMMETRIC:
		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_ASYMMETRIC:
		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
		break;
	case ALGORITHM_LEFT_SYMMETRIC:
		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
		break;
	case ALGORITHM_RIGHT_SYMMETRIC:
		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
		break;
	case ALGORITHM_PARITY_0:
		new_layout = ALGORITHM_PARITY_0_6;
		break;
	case ALGORITHM_PARITY_N:
		new_layout = ALGORITHM_PARITY_N;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}
	mddev->new_level = 6;
	mddev->new_layout = new_layout;
	mddev->delta_disks = 1;
	mddev->raid_disks += 1;
	return setup_conf(mddev);
}

static struct md_personality raid6_personality =
{
	.name		= "raid6",
	.level		= 6,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid6_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid6_takeover,
	.congested	= raid5_congested,
};
static struct md_personality raid5_personality =
{
	.name		= "raid5",
	.level		= 5,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid5_takeover,
	.congested	= raid5_congested,
};

static struct md_personality raid4_personality =
{
	.name		= "raid4",
	.level		= 4,
	.owner		= THIS_MODULE,
	.make_request	= raid5_make_request,
	.run		= raid5_run,
	.free		= raid5_free,
	.status		= raid5_status,
	.error_handler	= raid5_error,
	.hot_add_disk	= raid5_add_disk,
	.hot_remove_disk= raid5_remove_disk,
	.spare_active	= raid5_spare_active,
	.sync_request	= raid5_sync_request,
	.resize		= raid5_resize,
	.size		= raid5_size,
	.check_reshape	= raid5_check_reshape,
	.start_reshape	= raid5_start_reshape,
	.finish_reshape	= raid5_finish_reshape,
	.quiesce	= raid5_quiesce,
	.takeover	= raid4_takeover,
	.congested	= raid5_congested,
};
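
/*
 * All three personalities above share one implementation; they differ
 * only in .name, .level, .check_reshape (raid6 uses raid6_check_reshape)
 * and .takeover.  raid5_init() below registers them and sets up the
 * shared raid5_wq workqueue and the CPU hotplug callbacks.
 */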

static int __init raid5_init(void)
{
	int ret;

	raid5_wq = alloc_workqueue("raid5wq",
		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
	if (!raid5_wq)
		return -ENOMEM;

	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
				      "md/raid5:prepare",
				      raid456_cpu_up_prepare,
				      raid456_cpu_dead);
	if (ret) {
		destroy_workqueue(raid5_wq);
		return ret;
	}
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
	return 0;
}

static void raid5_exit(void)
{
	unregister_md_personality(&raid6_personality);
	unregister_md_personality(&raid5_personality);
	unregister_md_personality(&raid4_personality);
	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
	destroy_workqueue(raid5_wq);
}

module_init(raid5_init);
module_exit(raid5_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
MODULE_ALIAS("md-personality-4"); /* RAID5 */
MODULE_ALIAS("md-raid5");
MODULE_ALIAS("md-raid4");
MODULE_ALIAS("md-level-5");
MODULE_ALIAS("md-level-4");
MODULE_ALIAS("md-personality-8"); /* RAID6 */
MODULE_ALIAS("md-raid6");
MODULE_ALIAS("md-level-6");

/* This used to be two separate modules, they were: */
MODULE_ALIAS("raid5");
MODULE_ALIAS("raid6");