/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
 * bitmapped intelligence in resync:
 *
 *    - bitmap marked during normal i/o
 *    - bitmap used to skip nondirty blocks during sync
 *
 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
 * - persistent bitmap code
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dm-bio-list.h"
#include <linux/raid/raid1.h>
#include <linux/raid/bitmap.h>

#define DEBUG 0
#if DEBUG
#define PRINTK(x...) printk(x)
#else
#define PRINTK(x...)
43 #endif 44 45 /* 46 * Number of guaranteed r1bios in case of extreme VM load: 47 */ 48 #define NR_RAID1_BIOS 256 49 50 static mdk_personality_t raid1_personality; 51 52 static void unplug_slaves(mddev_t *mddev); 53 54 55 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 56 { 57 struct pool_info *pi = data; 58 r1bio_t *r1_bio; 59 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 60 61 /* allocate a r1bio with room for raid_disks entries in the bios array */ 62 r1_bio = kmalloc(size, gfp_flags); 63 if (r1_bio) 64 memset(r1_bio, 0, size); 65 else 66 unplug_slaves(pi->mddev); 67 68 return r1_bio; 69 } 70 71 static void r1bio_pool_free(void *r1_bio, void *data) 72 { 73 kfree(r1_bio); 74 } 75 76 #define RESYNC_BLOCK_SIZE (64*1024) 77 //#define RESYNC_BLOCK_SIZE PAGE_SIZE 78 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 79 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 80 #define RESYNC_WINDOW (2048*1024) 81 82 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 83 { 84 struct pool_info *pi = data; 85 struct page *page; 86 r1bio_t *r1_bio; 87 struct bio *bio; 88 int i, j; 89 90 r1_bio = r1bio_pool_alloc(gfp_flags, pi); 91 if (!r1_bio) { 92 unplug_slaves(pi->mddev); 93 return NULL; 94 } 95 96 /* 97 * Allocate bios : 1 for reading, n-1 for writing 98 */ 99 for (j = pi->raid_disks ; j-- ; ) { 100 bio = bio_alloc(gfp_flags, RESYNC_PAGES); 101 if (!bio) 102 goto out_free_bio; 103 r1_bio->bios[j] = bio; 104 } 105 /* 106 * Allocate RESYNC_PAGES data pages and attach them to 107 * the first bio; 108 */ 109 bio = r1_bio->bios[0]; 110 for (i = 0; i < RESYNC_PAGES; i++) { 111 page = alloc_page(gfp_flags); 112 if (unlikely(!page)) 113 goto out_free_pages; 114 115 bio->bi_io_vec[i].bv_page = page; 116 } 117 118 r1_bio->master_bio = NULL; 119 120 return r1_bio; 121 122 out_free_pages: 123 for ( ; i > 0 ; i--) 124 __free_page(bio->bi_io_vec[i-1].bv_page); 125 out_free_bio: 126 while ( ++j < pi->raid_disks ) 127 bio_put(r1_bio->bios[j]); 128 
r1bio_pool_free(r1_bio, data); 129 return NULL; 130 } 131 132 static void r1buf_pool_free(void *__r1_bio, void *data) 133 { 134 struct pool_info *pi = data; 135 int i; 136 r1bio_t *r1bio = __r1_bio; 137 struct bio *bio = r1bio->bios[0]; 138 139 for (i = 0; i < RESYNC_PAGES; i++) { 140 __free_page(bio->bi_io_vec[i].bv_page); 141 bio->bi_io_vec[i].bv_page = NULL; 142 } 143 for (i=0 ; i < pi->raid_disks; i++) 144 bio_put(r1bio->bios[i]); 145 146 r1bio_pool_free(r1bio, data); 147 } 148 149 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) 150 { 151 int i; 152 153 for (i = 0; i < conf->raid_disks; i++) { 154 struct bio **bio = r1_bio->bios + i; 155 if (*bio) 156 bio_put(*bio); 157 *bio = NULL; 158 } 159 } 160 161 static inline void free_r1bio(r1bio_t *r1_bio) 162 { 163 unsigned long flags; 164 165 conf_t *conf = mddev_to_conf(r1_bio->mddev); 166 167 /* 168 * Wake up any possible resync thread that waits for the device 169 * to go idle. 170 */ 171 spin_lock_irqsave(&conf->resync_lock, flags); 172 if (!--conf->nr_pending) { 173 wake_up(&conf->wait_idle); 174 wake_up(&conf->wait_resume); 175 } 176 spin_unlock_irqrestore(&conf->resync_lock, flags); 177 178 put_all_bios(conf, r1_bio); 179 mempool_free(r1_bio, conf->r1bio_pool); 180 } 181 182 static inline void put_buf(r1bio_t *r1_bio) 183 { 184 conf_t *conf = mddev_to_conf(r1_bio->mddev); 185 unsigned long flags; 186 187 mempool_free(r1_bio, conf->r1buf_pool); 188 189 spin_lock_irqsave(&conf->resync_lock, flags); 190 if (!conf->barrier) 191 BUG(); 192 --conf->barrier; 193 wake_up(&conf->wait_resume); 194 wake_up(&conf->wait_idle); 195 196 if (!--conf->nr_pending) { 197 wake_up(&conf->wait_idle); 198 wake_up(&conf->wait_resume); 199 } 200 spin_unlock_irqrestore(&conf->resync_lock, flags); 201 } 202 203 static void reschedule_retry(r1bio_t *r1_bio) 204 { 205 unsigned long flags; 206 mddev_t *mddev = r1_bio->mddev; 207 conf_t *conf = mddev_to_conf(mddev); 208 209 spin_lock_irqsave(&conf->device_lock, flags); 210 
list_add(&r1_bio->retry_list, &conf->retry_list); 211 spin_unlock_irqrestore(&conf->device_lock, flags); 212 213 md_wakeup_thread(mddev->thread); 214 } 215 216 /* 217 * raid_end_bio_io() is called when we have finished servicing a mirrored 218 * operation and are ready to return a success/failure code to the buffer 219 * cache layer. 220 */ 221 static void raid_end_bio_io(r1bio_t *r1_bio) 222 { 223 struct bio *bio = r1_bio->master_bio; 224 225 /* if nobody has done the final endio yet, do it now */ 226 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 227 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", 228 (bio_data_dir(bio) == WRITE) ? "write" : "read", 229 (unsigned long long) bio->bi_sector, 230 (unsigned long long) bio->bi_sector + 231 (bio->bi_size >> 9) - 1); 232 233 bio_endio(bio, bio->bi_size, 234 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); 235 } 236 free_r1bio(r1_bio); 237 } 238 239 /* 240 * Update disk head position estimator based on IRQ completion info. 241 */ 242 static inline void update_head_pos(int disk, r1bio_t *r1_bio) 243 { 244 conf_t *conf = mddev_to_conf(r1_bio->mddev); 245 246 conf->mirrors[disk].head_position = 247 r1_bio->sector + (r1_bio->sectors); 248 } 249 250 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) 251 { 252 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 253 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 254 int mirror; 255 conf_t *conf = mddev_to_conf(r1_bio->mddev); 256 257 if (bio->bi_size) 258 return 1; 259 260 mirror = r1_bio->read_disk; 261 /* 262 * this branch is our 'one mirror IO has finished' event handler: 263 */ 264 if (!uptodate) 265 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 266 else 267 /* 268 * Set R1BIO_Uptodate in our master bio, so that 269 * we will return a good error code for to the higher 270 * levels even if IO on some other mirrored buffer fails. 
271 * 272 * The 'master' represents the composite IO operation to 273 * user-side. So if something waits for IO, then it will 274 * wait for the 'master' bio. 275 */ 276 set_bit(R1BIO_Uptodate, &r1_bio->state); 277 278 update_head_pos(mirror, r1_bio); 279 280 /* 281 * we have only one bio on the read side 282 */ 283 if (uptodate) 284 raid_end_bio_io(r1_bio); 285 else { 286 /* 287 * oops, read error: 288 */ 289 char b[BDEVNAME_SIZE]; 290 if (printk_ratelimit()) 291 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", 292 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 293 reschedule_retry(r1_bio); 294 } 295 296 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 297 return 0; 298 } 299 300 static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) 301 { 302 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 303 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 304 int mirror, behind; 305 conf_t *conf = mddev_to_conf(r1_bio->mddev); 306 307 if (bio->bi_size) 308 return 1; 309 310 for (mirror = 0; mirror < conf->raid_disks; mirror++) 311 if (r1_bio->bios[mirror] == bio) 312 break; 313 314 /* 315 * this branch is our 'one mirror IO has finished' event handler: 316 */ 317 if (!uptodate) { 318 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 319 /* an I/O failed, we can't clear the bitmap */ 320 set_bit(R1BIO_Degraded, &r1_bio->state); 321 } else 322 /* 323 * Set R1BIO_Uptodate in our master bio, so that 324 * we will return a good error code for to the higher 325 * levels even if IO on some other mirrored buffer fails. 326 * 327 * The 'master' represents the composite IO operation to 328 * user-side. So if something waits for IO, then it will 329 * wait for the 'master' bio. 
330 */ 331 set_bit(R1BIO_Uptodate, &r1_bio->state); 332 333 update_head_pos(mirror, r1_bio); 334 335 behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 336 if (behind) { 337 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 338 atomic_dec(&r1_bio->behind_remaining); 339 340 /* In behind mode, we ACK the master bio once the I/O has safely 341 * reached all non-writemostly disks. Setting the Returned bit 342 * ensures that this gets done only once -- we don't ever want to 343 * return -EIO here, instead we'll wait */ 344 345 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 346 test_bit(R1BIO_Uptodate, &r1_bio->state)) { 347 /* Maybe we can return now */ 348 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 349 struct bio *mbio = r1_bio->master_bio; 350 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", 351 (unsigned long long) mbio->bi_sector, 352 (unsigned long long) mbio->bi_sector + 353 (mbio->bi_size >> 9) - 1); 354 bio_endio(mbio, mbio->bi_size, 0); 355 } 356 } 357 } 358 /* 359 * 360 * Let's see if all mirrored write operations have finished 361 * already. 362 */ 363 if (atomic_dec_and_test(&r1_bio->remaining)) { 364 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 365 /* free extra copy of the data pages */ 366 int i = bio->bi_vcnt; 367 while (i--) 368 __free_page(bio->bi_io_vec[i].bv_page); 369 } 370 /* clear the bitmap if all writes complete successfully */ 371 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 372 r1_bio->sectors, 373 !test_bit(R1BIO_Degraded, &r1_bio->state), 374 behind); 375 md_write_end(r1_bio->mddev); 376 raid_end_bio_io(r1_bio); 377 } 378 379 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 380 return 0; 381 } 382 383 384 /* 385 * This routine returns the disk from which the requested read should 386 * be done. There is a per-array 'next expected sequential IO' sector 387 * number - if this matches on the next IO then we use the last disk. 
388 * There is also a per-disk 'last know head position' sector that is 389 * maintained from IRQ contexts, both the normal and the resync IO 390 * completion handlers update this position correctly. If there is no 391 * perfect sequential match then we pick the disk whose head is closest. 392 * 393 * If there are 2 mirrors in the same 2 devices, performance degrades 394 * because position is mirror, not device based. 395 * 396 * The rdev for the device selected will have nr_pending incremented. 397 */ 398 static int read_balance(conf_t *conf, r1bio_t *r1_bio) 399 { 400 const unsigned long this_sector = r1_bio->sector; 401 int new_disk = conf->last_used, disk = new_disk; 402 int wonly_disk = -1; 403 const int sectors = r1_bio->sectors; 404 sector_t new_distance, current_distance; 405 mdk_rdev_t *rdev; 406 407 rcu_read_lock(); 408 /* 409 * Check if we can balance. We can balance on the whole 410 * device if no resync is going on, or below the resync window. 411 * We take the first readable disk when above the resync window. 
412 */ 413 retry: 414 if (conf->mddev->recovery_cp < MaxSector && 415 (this_sector + sectors >= conf->next_resync)) { 416 /* Choose the first operation device, for consistancy */ 417 new_disk = 0; 418 419 for (rdev = conf->mirrors[new_disk].rdev; 420 !rdev || !rdev->in_sync 421 || test_bit(WriteMostly, &rdev->flags); 422 rdev = conf->mirrors[++new_disk].rdev) { 423 424 if (rdev && rdev->in_sync) 425 wonly_disk = new_disk; 426 427 if (new_disk == conf->raid_disks - 1) { 428 new_disk = wonly_disk; 429 break; 430 } 431 } 432 goto rb_out; 433 } 434 435 436 /* make sure the disk is operational */ 437 for (rdev = conf->mirrors[new_disk].rdev; 438 !rdev || !rdev->in_sync || 439 test_bit(WriteMostly, &rdev->flags); 440 rdev = conf->mirrors[new_disk].rdev) { 441 442 if (rdev && rdev->in_sync) 443 wonly_disk = new_disk; 444 445 if (new_disk <= 0) 446 new_disk = conf->raid_disks; 447 new_disk--; 448 if (new_disk == disk) { 449 new_disk = wonly_disk; 450 break; 451 } 452 } 453 454 if (new_disk < 0) 455 goto rb_out; 456 457 disk = new_disk; 458 /* now disk == new_disk == starting point for search */ 459 460 /* 461 * Don't change to another disk for sequential reads: 462 */ 463 if (conf->next_seq_sect == this_sector) 464 goto rb_out; 465 if (this_sector == conf->mirrors[new_disk].head_position) 466 goto rb_out; 467 468 current_distance = abs(this_sector - conf->mirrors[disk].head_position); 469 470 /* Find the disk whose head is closest */ 471 472 do { 473 if (disk <= 0) 474 disk = conf->raid_disks; 475 disk--; 476 477 rdev = conf->mirrors[disk].rdev; 478 479 if (!rdev || 480 !rdev->in_sync || 481 test_bit(WriteMostly, &rdev->flags)) 482 continue; 483 484 if (!atomic_read(&rdev->nr_pending)) { 485 new_disk = disk; 486 break; 487 } 488 new_distance = abs(this_sector - conf->mirrors[disk].head_position); 489 if (new_distance < current_distance) { 490 current_distance = new_distance; 491 new_disk = disk; 492 } 493 } while (disk != conf->last_used); 494 495 rb_out: 496 497 498 if 
(new_disk >= 0) { 499 rdev = conf->mirrors[new_disk].rdev; 500 if (!rdev) 501 goto retry; 502 atomic_inc(&rdev->nr_pending); 503 if (!rdev->in_sync) { 504 /* cannot risk returning a device that failed 505 * before we inc'ed nr_pending 506 */ 507 atomic_dec(&rdev->nr_pending); 508 goto retry; 509 } 510 conf->next_seq_sect = this_sector + sectors; 511 conf->last_used = new_disk; 512 } 513 rcu_read_unlock(); 514 515 return new_disk; 516 } 517 518 static void unplug_slaves(mddev_t *mddev) 519 { 520 conf_t *conf = mddev_to_conf(mddev); 521 int i; 522 523 rcu_read_lock(); 524 for (i=0; i<mddev->raid_disks; i++) { 525 mdk_rdev_t *rdev = conf->mirrors[i].rdev; 526 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 527 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 528 529 atomic_inc(&rdev->nr_pending); 530 rcu_read_unlock(); 531 532 if (r_queue->unplug_fn) 533 r_queue->unplug_fn(r_queue); 534 535 rdev_dec_pending(rdev, mddev); 536 rcu_read_lock(); 537 } 538 } 539 rcu_read_unlock(); 540 } 541 542 static void raid1_unplug(request_queue_t *q) 543 { 544 mddev_t *mddev = q->queuedata; 545 546 unplug_slaves(mddev); 547 md_wakeup_thread(mddev->thread); 548 } 549 550 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, 551 sector_t *error_sector) 552 { 553 mddev_t *mddev = q->queuedata; 554 conf_t *conf = mddev_to_conf(mddev); 555 int i, ret = 0; 556 557 rcu_read_lock(); 558 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 559 mdk_rdev_t *rdev = conf->mirrors[i].rdev; 560 if (rdev && !rdev->faulty) { 561 struct block_device *bdev = rdev->bdev; 562 request_queue_t *r_queue = bdev_get_queue(bdev); 563 564 if (!r_queue->issue_flush_fn) 565 ret = -EOPNOTSUPP; 566 else { 567 atomic_inc(&rdev->nr_pending); 568 rcu_read_unlock(); 569 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, 570 error_sector); 571 rdev_dec_pending(rdev, mddev); 572 rcu_read_lock(); 573 } 574 } 575 } 576 rcu_read_unlock(); 577 return ret; 578 } 579 580 /* 581 * Throttle 
resync depth, so that we can both get proper overlapping of 582 * requests, but are still able to handle normal requests quickly. 583 */ 584 #define RESYNC_DEPTH 32 585 586 static void device_barrier(conf_t *conf, sector_t sect) 587 { 588 spin_lock_irq(&conf->resync_lock); 589 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 590 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 591 592 if (!conf->barrier++) { 593 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 594 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 595 if (conf->nr_pending) 596 BUG(); 597 } 598 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 599 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 600 conf->next_resync = sect; 601 spin_unlock_irq(&conf->resync_lock); 602 } 603 604 /* duplicate the data pages for behind I/O */ 605 static struct page **alloc_behind_pages(struct bio *bio) 606 { 607 int i; 608 struct bio_vec *bvec; 609 struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), 610 GFP_NOIO); 611 if (unlikely(!pages)) 612 goto do_sync_io; 613 614 memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); 615 616 bio_for_each_segment(bvec, bio, i) { 617 pages[i] = alloc_page(GFP_NOIO); 618 if (unlikely(!pages[i])) 619 goto do_sync_io; 620 memcpy(kmap(pages[i]) + bvec->bv_offset, 621 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); 622 kunmap(pages[i]); 623 kunmap(bvec->bv_page); 624 } 625 626 return pages; 627 628 do_sync_io: 629 if (pages) 630 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 631 __free_page(pages[i]); 632 kfree(pages); 633 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 634 return NULL; 635 } 636 637 static int make_request(request_queue_t *q, struct bio * bio) 638 { 639 mddev_t *mddev = q->queuedata; 640 conf_t *conf = mddev_to_conf(mddev); 641 mirror_info_t *mirror; 642 r1bio_t *r1_bio; 643 struct bio *read_bio; 644 int i, targets = 0, disks; 645 mdk_rdev_t *rdev; 646 struct 
bitmap *bitmap = mddev->bitmap; 647 unsigned long flags; 648 struct bio_list bl; 649 struct page **behind_pages = NULL; 650 const int rw = bio_data_dir(bio); 651 652 if (unlikely(bio_barrier(bio))) { 653 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 654 return 0; 655 } 656 657 /* 658 * Register the new request and wait if the reconstruction 659 * thread has put up a bar for new requests. 660 * Continue immediately if no resync is active currently. 661 */ 662 md_write_start(mddev, bio); /* wait on superblock update early */ 663 664 spin_lock_irq(&conf->resync_lock); 665 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); 666 conf->nr_pending++; 667 spin_unlock_irq(&conf->resync_lock); 668 669 disk_stat_inc(mddev->gendisk, ios[rw]); 670 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 671 672 /* 673 * make_request() can abort the operation when READA is being 674 * used and no empty request is available. 675 * 676 */ 677 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 678 679 r1_bio->master_bio = bio; 680 r1_bio->sectors = bio->bi_size >> 9; 681 r1_bio->state = 0; 682 r1_bio->mddev = mddev; 683 r1_bio->sector = bio->bi_sector; 684 685 if (rw == READ) { 686 /* 687 * read balancing logic: 688 */ 689 int rdisk = read_balance(conf, r1_bio); 690 691 if (rdisk < 0) { 692 /* couldn't find anywhere to read from */ 693 raid_end_bio_io(r1_bio); 694 return 0; 695 } 696 mirror = conf->mirrors + rdisk; 697 698 r1_bio->read_disk = rdisk; 699 700 read_bio = bio_clone(bio, GFP_NOIO); 701 702 r1_bio->bios[rdisk] = read_bio; 703 704 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; 705 read_bio->bi_bdev = mirror->rdev->bdev; 706 read_bio->bi_end_io = raid1_end_read_request; 707 read_bio->bi_rw = READ; 708 read_bio->bi_private = r1_bio; 709 710 generic_make_request(read_bio); 711 return 0; 712 } 713 714 /* 715 * WRITE: 716 */ 717 /* first select target devices under spinlock and 718 * inc refcount on their rdev. 
Record them by setting 719 * bios[x] to bio 720 */ 721 disks = conf->raid_disks; 722 #if 0 723 { static int first=1; 724 if (first) printk("First Write sector %llu disks %d\n", 725 (unsigned long long)r1_bio->sector, disks); 726 first = 0; 727 } 728 #endif 729 rcu_read_lock(); 730 for (i = 0; i < disks; i++) { 731 if ((rdev=conf->mirrors[i].rdev) != NULL && 732 !rdev->faulty) { 733 atomic_inc(&rdev->nr_pending); 734 if (rdev->faulty) { 735 atomic_dec(&rdev->nr_pending); 736 r1_bio->bios[i] = NULL; 737 } else 738 r1_bio->bios[i] = bio; 739 targets++; 740 } else 741 r1_bio->bios[i] = NULL; 742 } 743 rcu_read_unlock(); 744 745 BUG_ON(targets == 0); /* we never fail the last device */ 746 747 if (targets < conf->raid_disks) { 748 /* array is degraded, we will not clear the bitmap 749 * on I/O completion (see raid1_end_write_request) */ 750 set_bit(R1BIO_Degraded, &r1_bio->state); 751 } 752 753 /* do behind I/O ? */ 754 if (bitmap && 755 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && 756 (behind_pages = alloc_behind_pages(bio)) != NULL) 757 set_bit(R1BIO_BehindIO, &r1_bio->state); 758 759 atomic_set(&r1_bio->remaining, 0); 760 atomic_set(&r1_bio->behind_remaining, 0); 761 762 bio_list_init(&bl); 763 for (i = 0; i < disks; i++) { 764 struct bio *mbio; 765 if (!r1_bio->bios[i]) 766 continue; 767 768 mbio = bio_clone(bio, GFP_NOIO); 769 r1_bio->bios[i] = mbio; 770 771 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 772 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 773 mbio->bi_end_io = raid1_end_write_request; 774 mbio->bi_rw = WRITE; 775 mbio->bi_private = r1_bio; 776 777 if (behind_pages) { 778 struct bio_vec *bvec; 779 int j; 780 781 /* Yes, I really want the '__' version so that 782 * we clear any unused pointer in the io_vec, rather 783 * than leave them unchanged. 
This is important 784 * because when we come to free the pages, we won't 785 * know the originial bi_idx, so we just free 786 * them all 787 */ 788 __bio_for_each_segment(bvec, mbio, j, 0) 789 bvec->bv_page = behind_pages[j]; 790 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 791 atomic_inc(&r1_bio->behind_remaining); 792 } 793 794 atomic_inc(&r1_bio->remaining); 795 796 bio_list_add(&bl, mbio); 797 } 798 kfree(behind_pages); /* the behind pages are attached to the bios now */ 799 800 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, 801 test_bit(R1BIO_BehindIO, &r1_bio->state)); 802 spin_lock_irqsave(&conf->device_lock, flags); 803 bio_list_merge(&conf->pending_bio_list, &bl); 804 bio_list_init(&bl); 805 806 blk_plug_device(mddev->queue); 807 spin_unlock_irqrestore(&conf->device_lock, flags); 808 809 #if 0 810 while ((bio = bio_list_pop(&bl)) != NULL) 811 generic_make_request(bio); 812 #endif 813 814 return 0; 815 } 816 817 static void status(struct seq_file *seq, mddev_t *mddev) 818 { 819 conf_t *conf = mddev_to_conf(mddev); 820 int i; 821 822 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 823 conf->working_disks); 824 for (i = 0; i < conf->raid_disks; i++) 825 seq_printf(seq, "%s", 826 conf->mirrors[i].rdev && 827 conf->mirrors[i].rdev->in_sync ? "U" : "_"); 828 seq_printf(seq, "]"); 829 } 830 831 832 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 833 { 834 char b[BDEVNAME_SIZE]; 835 conf_t *conf = mddev_to_conf(mddev); 836 837 /* 838 * If it is not operational, then we have already marked it as dead 839 * else if it is the last working disks, ignore the error, let the 840 * next level up know. 841 * else mark the drive as failed 842 */ 843 if (rdev->in_sync 844 && conf->working_disks == 1) 845 /* 846 * Don't fail the drive, act as though we were just a 847 * normal single drive 848 */ 849 return; 850 if (rdev->in_sync) { 851 mddev->degraded++; 852 conf->working_disks--; 853 /* 854 * if recovery is running, make sure it aborts. 
855 */ 856 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 857 } 858 rdev->in_sync = 0; 859 rdev->faulty = 1; 860 mddev->sb_dirty = 1; 861 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" 862 " Operation continuing on %d devices\n", 863 bdevname(rdev->bdev,b), conf->working_disks); 864 } 865 866 static void print_conf(conf_t *conf) 867 { 868 int i; 869 mirror_info_t *tmp; 870 871 printk("RAID1 conf printout:\n"); 872 if (!conf) { 873 printk("(!conf)\n"); 874 return; 875 } 876 printk(" --- wd:%d rd:%d\n", conf->working_disks, 877 conf->raid_disks); 878 879 for (i = 0; i < conf->raid_disks; i++) { 880 char b[BDEVNAME_SIZE]; 881 tmp = conf->mirrors + i; 882 if (tmp->rdev) 883 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 884 i, !tmp->rdev->in_sync, !tmp->rdev->faulty, 885 bdevname(tmp->rdev->bdev,b)); 886 } 887 } 888 889 static void close_sync(conf_t *conf) 890 { 891 spin_lock_irq(&conf->resync_lock); 892 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 893 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 894 spin_unlock_irq(&conf->resync_lock); 895 896 if (conf->barrier) BUG(); 897 if (waitqueue_active(&conf->wait_idle)) BUG(); 898 899 mempool_destroy(conf->r1buf_pool); 900 conf->r1buf_pool = NULL; 901 } 902 903 static int raid1_spare_active(mddev_t *mddev) 904 { 905 int i; 906 conf_t *conf = mddev->private; 907 mirror_info_t *tmp; 908 909 /* 910 * Find all failed disks within the RAID1 configuration 911 * and mark them readable 912 */ 913 for (i = 0; i < conf->raid_disks; i++) { 914 tmp = conf->mirrors + i; 915 if (tmp->rdev 916 && !tmp->rdev->faulty 917 && !tmp->rdev->in_sync) { 918 conf->working_disks++; 919 mddev->degraded--; 920 tmp->rdev->in_sync = 1; 921 } 922 } 923 924 print_conf(conf); 925 return 0; 926 } 927 928 929 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 930 { 931 conf_t *conf = mddev->private; 932 int found = 0; 933 int mirror = 0; 934 mirror_info_t *p; 935 936 if (rdev->saved_raid_disk >= 0 && 937 
conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 938 mirror = rdev->saved_raid_disk; 939 for (mirror=0; mirror < mddev->raid_disks; mirror++) 940 if ( !(p=conf->mirrors+mirror)->rdev) { 941 942 blk_queue_stack_limits(mddev->queue, 943 rdev->bdev->bd_disk->queue); 944 /* as we don't honour merge_bvec_fn, we must never risk 945 * violating it, so limit ->max_sector to one PAGE, as 946 * a one page request is never in violation. 947 */ 948 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 949 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 950 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 951 952 p->head_position = 0; 953 rdev->raid_disk = mirror; 954 found = 1; 955 if (rdev->saved_raid_disk != mirror) 956 conf->fullsync = 1; 957 p->rdev = rdev; 958 break; 959 } 960 961 print_conf(conf); 962 return found; 963 } 964 965 static int raid1_remove_disk(mddev_t *mddev, int number) 966 { 967 conf_t *conf = mddev->private; 968 int err = 0; 969 mdk_rdev_t *rdev; 970 mirror_info_t *p = conf->mirrors+ number; 971 972 print_conf(conf); 973 rdev = p->rdev; 974 if (rdev) { 975 if (rdev->in_sync || 976 atomic_read(&rdev->nr_pending)) { 977 err = -EBUSY; 978 goto abort; 979 } 980 p->rdev = NULL; 981 synchronize_rcu(); 982 if (atomic_read(&rdev->nr_pending)) { 983 /* lost the race, try later */ 984 err = -EBUSY; 985 p->rdev = rdev; 986 } 987 } 988 abort: 989 990 print_conf(conf); 991 return err; 992 } 993 994 995 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 996 { 997 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 998 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 999 conf_t *conf = mddev_to_conf(r1_bio->mddev); 1000 1001 if (bio->bi_size) 1002 return 1; 1003 1004 if (r1_bio->bios[r1_bio->read_disk] != bio) 1005 BUG(); 1006 update_head_pos(r1_bio->read_disk, r1_bio); 1007 /* 1008 * we have read a block, now it needs to be re-written, 1009 * or re-read if the read failed. 
1010 * We don't do much here, just schedule handling by raid1d 1011 */ 1012 if (!uptodate) { 1013 md_error(r1_bio->mddev, 1014 conf->mirrors[r1_bio->read_disk].rdev); 1015 } else 1016 set_bit(R1BIO_Uptodate, &r1_bio->state); 1017 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 1018 reschedule_retry(r1_bio); 1019 return 0; 1020 } 1021 1022 static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) 1023 { 1024 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1025 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1026 mddev_t *mddev = r1_bio->mddev; 1027 conf_t *conf = mddev_to_conf(mddev); 1028 int i; 1029 int mirror=0; 1030 1031 if (bio->bi_size) 1032 return 1; 1033 1034 for (i = 0; i < conf->raid_disks; i++) 1035 if (r1_bio->bios[i] == bio) { 1036 mirror = i; 1037 break; 1038 } 1039 if (!uptodate) 1040 md_error(mddev, conf->mirrors[mirror].rdev); 1041 1042 update_head_pos(mirror, r1_bio); 1043 1044 if (atomic_dec_and_test(&r1_bio->remaining)) { 1045 md_done_sync(mddev, r1_bio->sectors, uptodate); 1046 put_buf(r1_bio); 1047 } 1048 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); 1049 return 0; 1050 } 1051 1052 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) 1053 { 1054 conf_t *conf = mddev_to_conf(mddev); 1055 int i; 1056 int disks = conf->raid_disks; 1057 struct bio *bio, *wbio; 1058 1059 bio = r1_bio->bios[r1_bio->read_disk]; 1060 1061 /* 1062 if (r1_bio->sector == 0) printk("First sync write startss\n"); 1063 */ 1064 /* 1065 * schedule writes 1066 */ 1067 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1068 /* 1069 * There is no point trying a read-for-reconstruct as 1070 * reconstruct is about to be aborted 1071 */ 1072 char b[BDEVNAME_SIZE]; 1073 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1074 " for block %llu\n", 1075 bdevname(bio->bi_bdev,b), 1076 (unsigned long long)r1_bio->sector); 1077 md_done_sync(mddev, r1_bio->sectors, 0); 1078 put_buf(r1_bio); 1079 return; 1080 } 1081 
1082 atomic_set(&r1_bio->remaining, 1); 1083 for (i = 0; i < disks ; i++) { 1084 wbio = r1_bio->bios[i]; 1085 if (wbio->bi_end_io != end_sync_write) 1086 continue; 1087 1088 atomic_inc(&conf->mirrors[i].rdev->nr_pending); 1089 atomic_inc(&r1_bio->remaining); 1090 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1091 1092 generic_make_request(wbio); 1093 } 1094 1095 if (atomic_dec_and_test(&r1_bio->remaining)) { 1096 /* if we're here, all write(s) have completed, so clean up */ 1097 md_done_sync(mddev, r1_bio->sectors, 1); 1098 put_buf(r1_bio); 1099 } 1100 } 1101 1102 /* 1103 * This is a kernel thread which: 1104 * 1105 * 1. Retries failed read operations on working mirrors. 1106 * 2. Updates the raid superblock when problems encounter. 1107 * 3. Performs writes following reads for array syncronising. 1108 */ 1109 1110 static void raid1d(mddev_t *mddev) 1111 { 1112 r1bio_t *r1_bio; 1113 struct bio *bio; 1114 unsigned long flags; 1115 conf_t *conf = mddev_to_conf(mddev); 1116 struct list_head *head = &conf->retry_list; 1117 int unplug=0; 1118 mdk_rdev_t *rdev; 1119 1120 md_check_recovery(mddev); 1121 1122 for (;;) { 1123 char b[BDEVNAME_SIZE]; 1124 spin_lock_irqsave(&conf->device_lock, flags); 1125 1126 if (conf->pending_bio_list.head) { 1127 bio = bio_list_get(&conf->pending_bio_list); 1128 blk_remove_plug(mddev->queue); 1129 spin_unlock_irqrestore(&conf->device_lock, flags); 1130 /* flush any pending bitmap writes to disk before proceeding w/ I/O */ 1131 if (bitmap_unplug(mddev->bitmap) != 0) 1132 printk("%s: bitmap file write failed!\n", mdname(mddev)); 1133 1134 while (bio) { /* submit pending writes */ 1135 struct bio *next = bio->bi_next; 1136 bio->bi_next = NULL; 1137 generic_make_request(bio); 1138 bio = next; 1139 } 1140 unplug = 1; 1141 1142 continue; 1143 } 1144 1145 if (list_empty(head)) 1146 break; 1147 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1148 list_del(head->prev); 1149 spin_unlock_irqrestore(&conf->device_lock, flags); 
1150 1151 mddev = r1_bio->mddev; 1152 conf = mddev_to_conf(mddev); 1153 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1154 sync_request_write(mddev, r1_bio); 1155 unplug = 1; 1156 } else { 1157 int disk; 1158 bio = r1_bio->bios[r1_bio->read_disk]; 1159 if ((disk=read_balance(conf, r1_bio)) == -1) { 1160 printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1161 " read error for block %llu\n", 1162 bdevname(bio->bi_bdev,b), 1163 (unsigned long long)r1_bio->sector); 1164 raid_end_bio_io(r1_bio); 1165 } else { 1166 r1_bio->bios[r1_bio->read_disk] = NULL; 1167 r1_bio->read_disk = disk; 1168 bio_put(bio); 1169 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1170 r1_bio->bios[r1_bio->read_disk] = bio; 1171 rdev = conf->mirrors[disk].rdev; 1172 if (printk_ratelimit()) 1173 printk(KERN_ERR "raid1: %s: redirecting sector %llu to" 1174 " another mirror\n", 1175 bdevname(rdev->bdev,b), 1176 (unsigned long long)r1_bio->sector); 1177 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1178 bio->bi_bdev = rdev->bdev; 1179 bio->bi_end_io = raid1_end_read_request; 1180 bio->bi_rw = READ; 1181 bio->bi_private = r1_bio; 1182 unplug = 1; 1183 generic_make_request(bio); 1184 } 1185 } 1186 } 1187 spin_unlock_irqrestore(&conf->device_lock, flags); 1188 if (unplug) 1189 unplug_slaves(mddev); 1190 } 1191 1192 1193 static int init_resync(conf_t *conf) 1194 { 1195 int buffs; 1196 1197 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 1198 if (conf->r1buf_pool) 1199 BUG(); 1200 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, 1201 conf->poolinfo); 1202 if (!conf->r1buf_pool) 1203 return -ENOMEM; 1204 conf->next_resync = 0; 1205 return 0; 1206 } 1207 1208 /* 1209 * perform a "sync" on one "block" 1210 * 1211 * We need to make sure that no normal I/O request - particularly write 1212 * requests - conflict with active sync requests. 1213 * 1214 * This is achieved by tracking pending requests and a 'barrier' concept 1215 * that can be installed to exclude normal IO requests. 
1216 */ 1217 1218 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1219 { 1220 conf_t *conf = mddev_to_conf(mddev); 1221 mirror_info_t *mirror; 1222 r1bio_t *r1_bio; 1223 struct bio *bio; 1224 sector_t max_sector, nr_sectors; 1225 int disk; 1226 int i; 1227 int wonly; 1228 int write_targets = 0; 1229 int sync_blocks; 1230 int still_degraded = 0; 1231 1232 if (!conf->r1buf_pool) 1233 { 1234 /* 1235 printk("sync start - bitmap %p\n", mddev->bitmap); 1236 */ 1237 if (init_resync(conf)) 1238 return 0; 1239 } 1240 1241 max_sector = mddev->size << 1; 1242 if (sector_nr >= max_sector) { 1243 /* If we aborted, we need to abort the 1244 * sync on the 'current' bitmap chunk (there will 1245 * only be one in raid1 resync. 1246 * We can find the current addess in mddev->curr_resync 1247 */ 1248 if (mddev->curr_resync < max_sector) /* aborted */ 1249 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 1250 &sync_blocks, 1); 1251 else /* completed sync */ 1252 conf->fullsync = 0; 1253 1254 bitmap_close_sync(mddev->bitmap); 1255 close_sync(conf); 1256 return 0; 1257 } 1258 1259 /* before building a request, check if we can skip these blocks.. 1260 * This call the bitmap_start_sync doesn't actually record anything 1261 */ 1262 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1263 !conf->fullsync) { 1264 /* We can skip this block, and probably several more */ 1265 *skipped = 1; 1266 return sync_blocks; 1267 } 1268 /* 1269 * If there is non-resync activity waiting for us then 1270 * put in a delay to throttle resync. 1271 */ 1272 if (!go_faster && waitqueue_active(&conf->wait_resume)) 1273 msleep_interruptible(1000); 1274 device_barrier(conf, sector_nr + RESYNC_SECTORS); 1275 1276 /* 1277 * If reconstructing, and >1 working disc, 1278 * could dedicate one to rebuild and others to 1279 * service read requests .. 
1280 */ 1281 disk = conf->last_used; 1282 /* make sure disk is operational */ 1283 wonly = disk; 1284 while (conf->mirrors[disk].rdev == NULL || 1285 !conf->mirrors[disk].rdev->in_sync || 1286 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) 1287 ) { 1288 if (conf->mirrors[disk].rdev && 1289 conf->mirrors[disk].rdev->in_sync) 1290 wonly = disk; 1291 if (disk <= 0) 1292 disk = conf->raid_disks; 1293 disk--; 1294 if (disk == conf->last_used) { 1295 disk = wonly; 1296 break; 1297 } 1298 } 1299 conf->last_used = disk; 1300 atomic_inc(&conf->mirrors[disk].rdev->nr_pending); 1301 1302 1303 mirror = conf->mirrors + disk; 1304 1305 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 1306 1307 spin_lock_irq(&conf->resync_lock); 1308 conf->nr_pending++; 1309 spin_unlock_irq(&conf->resync_lock); 1310 1311 r1_bio->mddev = mddev; 1312 r1_bio->sector = sector_nr; 1313 r1_bio->state = 0; 1314 set_bit(R1BIO_IsSync, &r1_bio->state); 1315 r1_bio->read_disk = disk; 1316 1317 for (i=0; i < conf->raid_disks; i++) { 1318 bio = r1_bio->bios[i]; 1319 1320 /* take from bio_init */ 1321 bio->bi_next = NULL; 1322 bio->bi_flags |= 1 << BIO_UPTODATE; 1323 bio->bi_rw = 0; 1324 bio->bi_vcnt = 0; 1325 bio->bi_idx = 0; 1326 bio->bi_phys_segments = 0; 1327 bio->bi_hw_segments = 0; 1328 bio->bi_size = 0; 1329 bio->bi_end_io = NULL; 1330 bio->bi_private = NULL; 1331 1332 if (i == disk) { 1333 bio->bi_rw = READ; 1334 bio->bi_end_io = end_sync_read; 1335 } else if (conf->mirrors[i].rdev == NULL || 1336 conf->mirrors[i].rdev->faulty) { 1337 still_degraded = 1; 1338 continue; 1339 } else if (!conf->mirrors[i].rdev->in_sync || 1340 sector_nr + RESYNC_SECTORS > mddev->recovery_cp) { 1341 bio->bi_rw = WRITE; 1342 bio->bi_end_io = end_sync_write; 1343 write_targets ++; 1344 } else 1345 /* no need to read or write here */ 1346 continue; 1347 bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; 1348 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1349 bio->bi_private = r1_bio; 1350 } 1351 1352 
if (write_targets == 0) { 1353 /* There is nowhere to write, so all non-sync 1354 * drives must be failed - so we are finished 1355 */ 1356 sector_t rv = max_sector - sector_nr; 1357 *skipped = 1; 1358 put_buf(r1_bio); 1359 rdev_dec_pending(conf->mirrors[disk].rdev, mddev); 1360 return rv; 1361 } 1362 1363 nr_sectors = 0; 1364 sync_blocks = 0; 1365 do { 1366 struct page *page; 1367 int len = PAGE_SIZE; 1368 if (sector_nr + (len>>9) > max_sector) 1369 len = (max_sector - sector_nr) << 9; 1370 if (len == 0) 1371 break; 1372 if (sync_blocks == 0) { 1373 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 1374 &sync_blocks, still_degraded) && 1375 !conf->fullsync) 1376 break; 1377 if (sync_blocks < (PAGE_SIZE>>9)) 1378 BUG(); 1379 if (len > (sync_blocks<<9)) 1380 len = sync_blocks<<9; 1381 } 1382 1383 for (i=0 ; i < conf->raid_disks; i++) { 1384 bio = r1_bio->bios[i]; 1385 if (bio->bi_end_io) { 1386 page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; 1387 if (bio_add_page(bio, page, len, 0) == 0) { 1388 /* stop here */ 1389 r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; 1390 while (i > 0) { 1391 i--; 1392 bio = r1_bio->bios[i]; 1393 if (bio->bi_end_io==NULL) 1394 continue; 1395 /* remove last page from this bio */ 1396 bio->bi_vcnt--; 1397 bio->bi_size -= len; 1398 bio->bi_flags &= ~(1<< BIO_SEG_VALID); 1399 } 1400 goto bio_full; 1401 } 1402 } 1403 } 1404 nr_sectors += len>>9; 1405 sector_nr += len>>9; 1406 sync_blocks -= (len>>9); 1407 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1408 bio_full: 1409 bio = r1_bio->bios[disk]; 1410 r1_bio->sectors = nr_sectors; 1411 1412 md_sync_acct(mirror->rdev->bdev, nr_sectors); 1413 1414 generic_make_request(bio); 1415 1416 return nr_sectors; 1417 } 1418 1419 static int run(mddev_t *mddev) 1420 { 1421 conf_t *conf; 1422 int i, j, disk_idx; 1423 mirror_info_t *disk; 1424 mdk_rdev_t *rdev; 1425 struct list_head *tmp; 1426 1427 if (mddev->level != 1) { 1428 printk("raid1: %s: raid level not set to mirroring 
(%d)\n", 1429 mdname(mddev), mddev->level); 1430 goto out; 1431 } 1432 /* 1433 * copy the already verified devices into our private RAID1 1434 * bookkeeping area. [whatever we allocate in run(), 1435 * should be freed in stop()] 1436 */ 1437 conf = kmalloc(sizeof(conf_t), GFP_KERNEL); 1438 mddev->private = conf; 1439 if (!conf) 1440 goto out_no_mem; 1441 1442 memset(conf, 0, sizeof(*conf)); 1443 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, 1444 GFP_KERNEL); 1445 if (!conf->mirrors) 1446 goto out_no_mem; 1447 1448 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); 1449 1450 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1451 if (!conf->poolinfo) 1452 goto out_no_mem; 1453 conf->poolinfo->mddev = mddev; 1454 conf->poolinfo->raid_disks = mddev->raid_disks; 1455 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 1456 r1bio_pool_free, 1457 conf->poolinfo); 1458 if (!conf->r1bio_pool) 1459 goto out_no_mem; 1460 1461 ITERATE_RDEV(mddev, rdev, tmp) { 1462 disk_idx = rdev->raid_disk; 1463 if (disk_idx >= mddev->raid_disks 1464 || disk_idx < 0) 1465 continue; 1466 disk = conf->mirrors + disk_idx; 1467 1468 disk->rdev = rdev; 1469 1470 blk_queue_stack_limits(mddev->queue, 1471 rdev->bdev->bd_disk->queue); 1472 /* as we don't honour merge_bvec_fn, we must never risk 1473 * violating it, so limit ->max_sector to one PAGE, as 1474 * a one page request is never in violation. 
1475 */ 1476 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1477 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1478 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1479 1480 disk->head_position = 0; 1481 if (!rdev->faulty && rdev->in_sync) 1482 conf->working_disks++; 1483 } 1484 conf->raid_disks = mddev->raid_disks; 1485 conf->mddev = mddev; 1486 spin_lock_init(&conf->device_lock); 1487 INIT_LIST_HEAD(&conf->retry_list); 1488 if (conf->working_disks == 1) 1489 mddev->recovery_cp = MaxSector; 1490 1491 spin_lock_init(&conf->resync_lock); 1492 init_waitqueue_head(&conf->wait_idle); 1493 init_waitqueue_head(&conf->wait_resume); 1494 1495 bio_list_init(&conf->pending_bio_list); 1496 bio_list_init(&conf->flushing_bio_list); 1497 1498 if (!conf->working_disks) { 1499 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 1500 mdname(mddev)); 1501 goto out_free_conf; 1502 } 1503 1504 mddev->degraded = 0; 1505 for (i = 0; i < conf->raid_disks; i++) { 1506 1507 disk = conf->mirrors + i; 1508 1509 if (!disk->rdev) { 1510 disk->head_position = 0; 1511 mddev->degraded++; 1512 } 1513 } 1514 1515 /* 1516 * find the first working one and use it as a starting point 1517 * to read balancing. 
1518 */ 1519 for (j = 0; j < conf->raid_disks && 1520 (!conf->mirrors[j].rdev || 1521 !conf->mirrors[j].rdev->in_sync) ; j++) 1522 /* nothing */; 1523 conf->last_used = j; 1524 1525 1526 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); 1527 if (!mddev->thread) { 1528 printk(KERN_ERR 1529 "raid1: couldn't allocate thread for %s\n", 1530 mdname(mddev)); 1531 goto out_free_conf; 1532 } 1533 if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; 1534 1535 printk(KERN_INFO 1536 "raid1: raid set %s active with %d out of %d mirrors\n", 1537 mdname(mddev), mddev->raid_disks - mddev->degraded, 1538 mddev->raid_disks); 1539 /* 1540 * Ok, everything is just fine now 1541 */ 1542 mddev->array_size = mddev->size; 1543 1544 mddev->queue->unplug_fn = raid1_unplug; 1545 mddev->queue->issue_flush_fn = raid1_issue_flush; 1546 1547 return 0; 1548 1549 out_no_mem: 1550 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", 1551 mdname(mddev)); 1552 1553 out_free_conf: 1554 if (conf) { 1555 if (conf->r1bio_pool) 1556 mempool_destroy(conf->r1bio_pool); 1557 kfree(conf->mirrors); 1558 kfree(conf->poolinfo); 1559 kfree(conf); 1560 mddev->private = NULL; 1561 } 1562 out: 1563 return -EIO; 1564 } 1565 1566 static int stop(mddev_t *mddev) 1567 { 1568 conf_t *conf = mddev_to_conf(mddev); 1569 struct bitmap *bitmap = mddev->bitmap; 1570 int behind_wait = 0; 1571 1572 /* wait for behind writes to complete */ 1573 while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 1574 behind_wait++; 1575 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); 1576 set_current_state(TASK_UNINTERRUPTIBLE); 1577 schedule_timeout(HZ); /* wait a second */ 1578 /* need to kick something here to make sure I/O goes? 
*/ 1579 } 1580 1581 md_unregister_thread(mddev->thread); 1582 mddev->thread = NULL; 1583 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 1584 if (conf->r1bio_pool) 1585 mempool_destroy(conf->r1bio_pool); 1586 kfree(conf->mirrors); 1587 kfree(conf->poolinfo); 1588 kfree(conf); 1589 mddev->private = NULL; 1590 return 0; 1591 } 1592 1593 static int raid1_resize(mddev_t *mddev, sector_t sectors) 1594 { 1595 /* no resync is happening, and there is enough space 1596 * on all devices, so we can resize. 1597 * We need to make sure resync covers any new space. 1598 * If the array is shrinking we should possibly wait until 1599 * any io in the removed space completes, but it hardly seems 1600 * worth it. 1601 */ 1602 mddev->array_size = sectors>>1; 1603 set_capacity(mddev->gendisk, mddev->array_size << 1); 1604 mddev->changed = 1; 1605 if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) { 1606 mddev->recovery_cp = mddev->size << 1; 1607 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1608 } 1609 mddev->size = mddev->array_size; 1610 mddev->resync_max_sectors = sectors; 1611 return 0; 1612 } 1613 1614 static int raid1_reshape(mddev_t *mddev, int raid_disks) 1615 { 1616 /* We need to: 1617 * 1/ resize the r1bio_pool 1618 * 2/ resize conf->mirrors 1619 * 1620 * We allocate a new r1bio_pool if we can. 1621 * Then raise a device barrier and wait until all IO stops. 1622 * Then resize conf->mirrors and swap in the new r1bio pool. 1623 * 1624 * At the same time, we "pack" the devices so that all the missing 1625 * devices have the higher raid_disk numbers. 
1626 */ 1627 mempool_t *newpool, *oldpool; 1628 struct pool_info *newpoolinfo; 1629 mirror_info_t *newmirrors; 1630 conf_t *conf = mddev_to_conf(mddev); 1631 int cnt; 1632 1633 int d, d2; 1634 1635 if (raid_disks < conf->raid_disks) { 1636 cnt=0; 1637 for (d= 0; d < conf->raid_disks; d++) 1638 if (conf->mirrors[d].rdev) 1639 cnt++; 1640 if (cnt > raid_disks) 1641 return -EBUSY; 1642 } 1643 1644 newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); 1645 if (!newpoolinfo) 1646 return -ENOMEM; 1647 newpoolinfo->mddev = mddev; 1648 newpoolinfo->raid_disks = raid_disks; 1649 1650 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 1651 r1bio_pool_free, newpoolinfo); 1652 if (!newpool) { 1653 kfree(newpoolinfo); 1654 return -ENOMEM; 1655 } 1656 newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); 1657 if (!newmirrors) { 1658 kfree(newpoolinfo); 1659 mempool_destroy(newpool); 1660 return -ENOMEM; 1661 } 1662 memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks); 1663 1664 spin_lock_irq(&conf->resync_lock); 1665 conf->barrier++; 1666 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 1667 conf->resync_lock, raid1_unplug(mddev->queue)); 1668 spin_unlock_irq(&conf->resync_lock); 1669 1670 /* ok, everything is stopped */ 1671 oldpool = conf->r1bio_pool; 1672 conf->r1bio_pool = newpool; 1673 1674 for (d=d2=0; d < conf->raid_disks; d++) 1675 if (conf->mirrors[d].rdev) { 1676 conf->mirrors[d].rdev->raid_disk = d2; 1677 newmirrors[d2++].rdev = conf->mirrors[d].rdev; 1678 } 1679 kfree(conf->mirrors); 1680 conf->mirrors = newmirrors; 1681 kfree(conf->poolinfo); 1682 conf->poolinfo = newpoolinfo; 1683 1684 mddev->degraded += (raid_disks - conf->raid_disks); 1685 conf->raid_disks = mddev->raid_disks = raid_disks; 1686 1687 conf->last_used = 0; /* just make sure it is in-range */ 1688 spin_lock_irq(&conf->resync_lock); 1689 conf->barrier--; 1690 spin_unlock_irq(&conf->resync_lock); 1691 wake_up(&conf->wait_resume); 1692 
wake_up(&conf->wait_idle); 1693 1694 1695 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1696 md_wakeup_thread(mddev->thread); 1697 1698 mempool_destroy(oldpool); 1699 return 0; 1700 } 1701 1702 static void raid1_quiesce(mddev_t *mddev, int state) 1703 { 1704 conf_t *conf = mddev_to_conf(mddev); 1705 1706 switch(state) { 1707 case 1: 1708 spin_lock_irq(&conf->resync_lock); 1709 conf->barrier++; 1710 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 1711 conf->resync_lock, raid1_unplug(mddev->queue)); 1712 spin_unlock_irq(&conf->resync_lock); 1713 break; 1714 case 0: 1715 spin_lock_irq(&conf->resync_lock); 1716 conf->barrier--; 1717 spin_unlock_irq(&conf->resync_lock); 1718 wake_up(&conf->wait_resume); 1719 wake_up(&conf->wait_idle); 1720 break; 1721 } 1722 if (mddev->thread) { 1723 if (mddev->bitmap) 1724 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; 1725 else 1726 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; 1727 md_wakeup_thread(mddev->thread); 1728 } 1729 } 1730 1731 1732 static mdk_personality_t raid1_personality = 1733 { 1734 .name = "raid1", 1735 .owner = THIS_MODULE, 1736 .make_request = make_request, 1737 .run = run, 1738 .stop = stop, 1739 .status = status, 1740 .error_handler = error, 1741 .hot_add_disk = raid1_add_disk, 1742 .hot_remove_disk= raid1_remove_disk, 1743 .spare_active = raid1_spare_active, 1744 .sync_request = sync_request, 1745 .resize = raid1_resize, 1746 .reshape = raid1_reshape, 1747 .quiesce = raid1_quiesce, 1748 }; 1749 1750 static int __init raid_init(void) 1751 { 1752 return register_md_personality(RAID1, &raid1_personality); 1753 } 1754 1755 static void raid_exit(void) 1756 { 1757 unregister_md_personality(RAID1); 1758 } 1759 1760 module_init(raid_init); 1761 module_exit(raid_exit); 1762 MODULE_LICENSE("GPL"); 1763 MODULE_ALIAS("md-personality-3"); /* RAID1 */ 1764