/*
 * Copyright (C) 2003 Sistina Software Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-list.h"
#include "dm-io.h"
#include "dm-log.h"
#include "kcopyd.h"

#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

#define DM_MSG_PREFIX "raid1"

static struct workqueue_struct *_kmirrord_wq;
static struct work_struct _kmirrord_work;

static inline void wake(void)
{
	queue_work(_kmirrord_wq, &_kmirrord_work);
}

/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions.  Each
 * region can be in one of three states: clean, dirty,
 * nosync.  There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 *   clean_regions: Regions on this list have no io pending to
 *   them, they are in sync, we are no longer interested in them,
 *   they are dull.  rh_update_states() will remove them from the
 *   hash table.
 *
 *   quiesced_regions: These regions have been spun down, ready
 *   for recovery.  rh_recovery_start() will remove regions from
 *   this list and hand them to kmirrord, which will schedule the
 *   recovery io with kcopyd.
 *
 *   recovered_regions: Regions that kcopyd has successfully
 *   recovered.  rh_update_states() will now schedule any delayed
 *   io, up the recovery_count, and remove the region from the
 *   hash.
 *
 * There are 2 locks:
 *   A rw spin lock 'hash_lock' protects just the hash table,
 *   this is never held in write mode from interrupt context,
 *   which I believe means that we only have to disable irqs when
 *   doing a write lock.
 *
 *   An ordinary spin lock 'region_lock' that protects the three
 *   lists in the region_hash, with the 'state', 'list' and
 *   'delayed_bios' fields of the regions.  This is used from irq
 *   context, so all other uses will have to disable local irqs.
 *---------------------------------------------------------------*/
struct mirror_set;
struct region_hash {
	struct mirror_set *ms;
	uint32_t region_size;
	unsigned region_shift;

	/* holds persistent region state */
	struct dirty_log *log;

	/* hash table */
	rwlock_t hash_lock;
	mempool_t *region_pool;
	unsigned int mask;
	unsigned int nr_buckets;
	struct list_head *buckets;

	spinlock_t region_lock;
	struct semaphore recovery_count;
	struct list_head clean_regions;
	struct list_head quiesced_regions;
	struct list_head recovered_regions;
};

enum {
	RH_CLEAN,
	RH_DIRTY,
	RH_NOSYNC,
	RH_RECOVERING
};

struct region {
	struct region_hash *rh;	/* FIXME: can we get rid of this ? */
	region_t key;
	int state;

	struct list_head hash_list;
	struct list_head list;

	atomic_t pending;
	struct bio_list delayed_bios;
};


/*-----------------------------------------------------------------
 * Mirror set structures.
 *---------------------------------------------------------------*/
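/*
 * A single mirror leg: the underlying device plus the offset on that
 * device at which this mirror's data begins.
 */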
struct mirror {
	atomic_t error_count;
	struct dm_dev *dev;
	sector_t offset;
};

struct mirror_set {
	struct dm_target *ti;
	struct list_head list;
	struct region_hash rh;
	struct kcopyd_client *kcopyd_client;

	spinlock_t lock;	/* protects the next two lists */
	struct bio_list reads;
	struct bio_list writes;

	/* recovery */
	region_t nr_regions;
	int in_sync;

	struct mirror *default_mirror;	/* Default mirror */

	unsigned int nr_mirrors;
	struct mirror mirror[0];
};

/*
 * Conversion fns
 */
static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
{
	return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift;
}

static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
{
	return region << rh->region_shift;
}

/* FIXME move this */
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);

#define MIN_REGIONS 64
#define MAX_RECOVERY 1
static int rh_init(struct region_hash *rh, struct mirror_set *ms,
		   struct dirty_log *log, uint32_t region_size,
		   region_t nr_regions)
{
	unsigned int nr_buckets, max_buckets;
	size_t i;

	/*
	 * Calculate a suitable number of buckets for our hash
	 * table.
	 */
	max_buckets = nr_regions >> 6;
	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
		;
	nr_buckets >>= 1;

	rh->ms = ms;
	rh->log = log;
	rh->region_size = region_size;
	rh->region_shift = ffs(region_size) - 1;
	rwlock_init(&rh->hash_lock);
	rh->mask = nr_buckets - 1;
	rh->nr_buckets = nr_buckets;

	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
	if (!rh->buckets) {
		DMERR("unable to allocate region hash memory");
		return -ENOMEM;
	}

	for (i = 0; i < nr_buckets; i++)
		INIT_LIST_HEAD(rh->buckets + i);

	spin_lock_init(&rh->region_lock);
	sema_init(&rh->recovery_count, 0);
	INIT_LIST_HEAD(&rh->clean_regions);
	INIT_LIST_HEAD(&rh->quiesced_regions);
	INIT_LIST_HEAD(&rh->recovered_regions);

	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
						      sizeof(struct region));
	if (!rh->region_pool) {
		vfree(rh->buckets);
		rh->buckets = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void rh_exit(struct region_hash *rh)
{
	unsigned int h;
	struct region *reg, *nreg;

	BUG_ON(!list_empty(&rh->quiesced_regions));
	for (h = 0; h < rh->nr_buckets; h++) {
		list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
			BUG_ON(atomic_read(&reg->pending));
			mempool_free(reg, rh->region_pool);
		}
	}

	if (rh->log)
		dm_destroy_dirty_log(rh->log);
	if (rh->region_pool)
		mempool_destroy(rh->region_pool);
	vfree(rh->buckets);
}

#define RH_HASH_MULT 2654435387U

static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
{
	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
}

static struct region *__rh_lookup(struct region_hash *rh, region_t region)
{
	struct region *reg;

	list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
		if (reg->key == region)
			return reg;

	return NULL;
}

static void __rh_insert(struct region_hash *rh, struct region *reg)
{
	unsigned int h = rh_hash(rh, reg->key);
	list_add(&reg->hash_list, rh->buckets + h);
}

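/*
 * Called with hash_lock held for read.  The lock is dropped while a new
 * region is allocated and then retaken in write mode for the insert, so a
 * racing allocation of the same region may win; the loser is freed.
 */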
static struct region *__rh_alloc(struct region_hash *rh, region_t region)
{
	struct region *reg, *nreg;

	read_unlock(&rh->hash_lock);
	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
	if (unlikely(!nreg))
		nreg = kmalloc(sizeof(struct region), GFP_NOIO);
	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
		RH_CLEAN : RH_NOSYNC;
	nreg->rh = rh;
	nreg->key = region;

	INIT_LIST_HEAD(&nreg->list);

	atomic_set(&nreg->pending, 0);
	bio_list_init(&nreg->delayed_bios);
	write_lock_irq(&rh->hash_lock);

	reg = __rh_lookup(rh, region);
	if (reg)
		/* we lost the race */
		mempool_free(nreg, rh->region_pool);

	else {
		__rh_insert(rh, nreg);
		if (nreg->state == RH_CLEAN) {
			spin_lock(&rh->region_lock);
			list_add(&nreg->list, &rh->clean_regions);
			spin_unlock(&rh->region_lock);
		}
		reg = nreg;
	}
	write_unlock_irq(&rh->hash_lock);
	read_lock(&rh->hash_lock);

	return reg;
}

static inline struct region *__rh_find(struct region_hash *rh, region_t region)
{
	struct region *reg;

	reg = __rh_lookup(rh, region);
	if (!reg)
		reg = __rh_alloc(rh, region);

	return reg;
}

static int rh_state(struct region_hash *rh, region_t region, int may_block)
{
	int r;
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	if (reg)
		return reg->state;

	/*
	 * The region wasn't in the hash, so we fall back to the
	 * dirty log.
	 */
	r = rh->log->type->in_sync(rh->log, region, may_block);

	/*
	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
	 * taken as a RH_NOSYNC
	 */
	return r == 1 ? RH_CLEAN : RH_NOSYNC;
}

static inline int rh_in_sync(struct region_hash *rh,
			     region_t region, int may_block)
{
	int state = rh_state(rh, region, may_block);
	return state == RH_CLEAN || state == RH_DIRTY;
}

static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list))) {
		queue_bio(ms, bio, WRITE);
	}
}

static void rh_update_states(struct region_hash *rh)
{
	struct region *reg, *next;

	LIST_HEAD(clean);
	LIST_HEAD(recovered);

	/*
	 * Quickly grab the lists.
	 */
	write_lock_irq(&rh->hash_lock);
	spin_lock(&rh->region_lock);
	if (!list_empty(&rh->clean_regions)) {
		list_splice(&rh->clean_regions, &clean);
		INIT_LIST_HEAD(&rh->clean_regions);

		list_for_each_entry (reg, &clean, list) {
			rh->log->type->clear_region(rh->log, reg->key);
			list_del(&reg->hash_list);
		}
	}

	if (!list_empty(&rh->recovered_regions)) {
		list_splice(&rh->recovered_regions, &recovered);
		INIT_LIST_HEAD(&rh->recovered_regions);

		list_for_each_entry (reg, &recovered, list)
			list_del(&reg->hash_list);
	}
	spin_unlock(&rh->region_lock);
	write_unlock_irq(&rh->hash_lock);

	/*
	 * All the regions on the recovered and clean lists have
	 * now been pulled out of the system, so no need to do
	 * any more locking.
	 */
	list_for_each_entry_safe (reg, next, &recovered, list) {
		rh->log->type->clear_region(rh->log, reg->key);
		rh->log->type->complete_resync_work(rh->log, reg->key, 1);
		dispatch_bios(rh->ms, &reg->delayed_bios);
		up(&rh->recovery_count);
		mempool_free(reg, rh->region_pool);
	}

	if (!list_empty(&recovered))
		rh->log->type->flush(rh->log);

	list_for_each_entry_safe (reg, next, &clean, list)
		mempool_free(reg, rh->region_pool);
}

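/*
 * Called for each write bio before it is issued: bump the region's pending
 * count and, if the region was clean, mark it dirty in the dirty log.
 */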
static void rh_inc(struct region_hash *rh, region_t region)
{
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);

	spin_lock_irq(&rh->region_lock);
	atomic_inc(&reg->pending);

	if (reg->state == RH_CLEAN) {
		reg->state = RH_DIRTY;
		list_del_init(&reg->list);	/* take off the clean list */
		spin_unlock_irq(&rh->region_lock);

		rh->log->type->mark_region(rh->log, reg->key);
	} else
		spin_unlock_irq(&rh->region_lock);

	read_unlock(&rh->hash_lock);
}

static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
{
	struct bio *bio;

	for (bio = bios->head; bio; bio = bio->bi_next)
		rh_inc(rh, bio_to_region(rh, bio));
}

static void rh_dec(struct region_hash *rh, region_t region)
{
	unsigned long flags;
	struct region *reg;
	int should_wake = 0;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	spin_lock_irqsave(&rh->region_lock, flags);
	if (atomic_dec_and_test(&reg->pending)) {
		/*
		 * There is no pending I/O for this region.
		 * We can move the region to the corresponding list for the
		 * next action.  At this point, the region is not yet
		 * connected to any list.
		 *
		 * If the state is RH_NOSYNC, the region should be kept off
		 * the clean list.
		 * The hash entry for RH_NOSYNC will remain in memory
		 * until the region is recovered or the map is reloaded.
		 */

		/* do nothing for RH_NOSYNC */
		if (reg->state == RH_RECOVERING) {
			list_add_tail(&reg->list, &rh->quiesced_regions);
		} else if (reg->state == RH_DIRTY) {
			reg->state = RH_CLEAN;
			list_add(&reg->list, &rh->clean_regions);
		}
		should_wake = 1;
	}
	spin_unlock_irqrestore(&rh->region_lock, flags);

	if (should_wake)
		wake();
}

/*
 * Starts quiescing a region in preparation for recovery.
 */
static int __rh_recovery_prepare(struct region_hash *rh)
{
	int r;
	struct region *reg;
	region_t region;

	/*
	 * Ask the dirty log what's next.
	 */
	r = rh->log->type->get_resync_work(rh->log, &region);
	if (r <= 0)
		return r;

	/*
	 * Get this region, and start it quiescing by setting the
	 * recovering flag.
	 */
	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);
	read_unlock(&rh->hash_lock);

	spin_lock_irq(&rh->region_lock);
	reg->state = RH_RECOVERING;

	/* Already quiesced ? */
	if (atomic_read(&reg->pending))
		list_del_init(&reg->list);
	else
		list_move(&reg->list, &rh->quiesced_regions);

	spin_unlock_irq(&rh->region_lock);

	return 1;
}

static void rh_recovery_prepare(struct region_hash *rh)
{
	while (!down_trylock(&rh->recovery_count))
		if (__rh_recovery_prepare(rh) <= 0) {
			up(&rh->recovery_count);
			break;
		}
}

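/*
 * Recovery pipeline: rh_recovery_prepare() above quiesces regions,
 * rh_recovery_start() hands them out for copying, and rh_recovery_end()
 * queues the results for rh_update_states().
 */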
/*
 * Returns any quiesced regions.
 */
static struct region *rh_recovery_start(struct region_hash *rh)
{
	struct region *reg = NULL;

	spin_lock_irq(&rh->region_lock);
	if (!list_empty(&rh->quiesced_regions)) {
		reg = list_entry(rh->quiesced_regions.next,
				 struct region, list);
		list_del_init(&reg->list);	/* remove from the quiesced list */
	}
	spin_unlock_irq(&rh->region_lock);

	return reg;
}

/* FIXME: success ignored for now */
static void rh_recovery_end(struct region *reg, int success)
{
	struct region_hash *rh = reg->rh;

	spin_lock_irq(&rh->region_lock);
	list_add(&reg->list, &reg->rh->recovered_regions);
	spin_unlock_irq(&rh->region_lock);

	wake();
}

static void rh_flush(struct region_hash *rh)
{
	rh->log->type->flush(rh->log);
}

static void rh_delay(struct region_hash *rh, struct bio *bio)
{
	struct region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, bio_to_region(rh, bio));
	bio_list_add(&reg->delayed_bios, bio);
	read_unlock(&rh->hash_lock);
}

static void rh_stop_recovery(struct region_hash *rh)
{
	int i;

	/* wait for any recovering regions */
	for (i = 0; i < MAX_RECOVERY; i++)
		down(&rh->recovery_count);
}

static void rh_start_recovery(struct region_hash *rh)
{
	int i;

	for (i = 0; i < MAX_RECOVERY; i++)
		up(&rh->recovery_count);

	wake();
}

/*
 * Every mirror should look like this one.
 */
#define DEFAULT_MIRROR 0

/*
 * This is yucky.  We squirrel the mirror_set struct away inside
 * bi_next for write buffers.  This is safe since the bio
 * doesn't get submitted to the lower levels of the block layer.
 */
static struct mirror_set *bio_get_ms(struct bio *bio)
{
	return (struct mirror_set *) bio->bi_next;
}

static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
{
	bio->bi_next = (struct bio *) ms;
}

/*-----------------------------------------------------------------
 * Recovery.
 *
 * When a mirror is first activated we may find that some regions
 * are in the no-sync state.  We have to recover these by
 * recopying from the default mirror to all the others.
 *---------------------------------------------------------------*/
static void recovery_complete(int read_err, unsigned int write_err,
			      void *context)
{
	struct region *reg = (struct region *) context;

	/* FIXME: better error handling */
	rh_recovery_end(reg, !(read_err || write_err));
}

static int recover(struct mirror_set *ms, struct region *reg)
{
	int r;
	unsigned int i;
	struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
	struct mirror *m;
	unsigned long flags = 0;

	/* fill in the source */
	m = ms->default_mirror;
	from.bdev = m->dev->bdev;
	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
	if (reg->key == (ms->nr_regions - 1)) {
		/*
		 * The final region may be smaller than
		 * region_size.
		 */
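		/*
		 * region_size is a power of two (enforced by
		 * _check_region_size()), so the mask below yields the
		 * length of the partial tail region.
		 */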
		from.count = ms->ti->len & (reg->rh->region_size - 1);
		if (!from.count)
			from.count = reg->rh->region_size;
	} else
		from.count = reg->rh->region_size;

	/* fill in the destinations */
	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
		if (&ms->mirror[i] == ms->default_mirror)
			continue;

		m = ms->mirror + i;
		dest->bdev = m->dev->bdev;
		dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
		dest->count = from.count;
		dest++;
	}

	/* hand to kcopyd */
	set_bit(KCOPYD_IGNORE_ERROR, &flags);
	r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
			recovery_complete, reg);

	return r;
}

static void do_recovery(struct mirror_set *ms)
{
	int r;
	struct region *reg;
	struct dirty_log *log = ms->rh.log;

	/*
	 * Start quiescing some regions.
	 */
	rh_recovery_prepare(&ms->rh);

	/*
	 * Copy any already quiesced regions.
	 */
	while ((reg = rh_recovery_start(&ms->rh))) {
		r = recover(ms, reg);
		if (r)
			rh_recovery_end(reg, 0);
	}

	/*
	 * Update the in sync flag.
	 */
	if (!ms->in_sync &&
	    (log->type->get_sync_count(log) == ms->nr_regions)) {
		/* the sync is complete */
		dm_table_event(ms->ti->table);
		ms->in_sync = 1;
	}
}

/*-----------------------------------------------------------------
 * Reads
 *---------------------------------------------------------------*/
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{
	/* FIXME: add read balancing */
	return ms->default_mirror;
}

/*
 * remap a buffer to a particular mirror.
 */
static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
{
	bio->bi_bdev = m->dev->bdev;
	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
}

static void do_reads(struct mirror_set *ms, struct bio_list *reads)
{
	region_t region;
	struct bio *bio;
	struct mirror *m;

	while ((bio = bio_list_pop(reads))) {
		region = bio_to_region(&ms->rh, bio);

		/*
		 * We can only read balance if the region is in sync.
		 */
		if (rh_in_sync(&ms->rh, region, 0))
			m = choose_mirror(ms, bio->bi_sector);
		else
			m = ms->default_mirror;

		map_bio(ms, m, bio);
		generic_make_request(bio);
	}
}

/*-----------------------------------------------------------------
 * Writes.
 *
 * We do different things with the write io depending on the
 * state of the region that it's in:
 *
 * SYNC:	increment pending, use dm-io to write to *all* mirrors
 * RECOVERING:	delay the io until recovery completes
 * NOSYNC:	increment pending, just write to the default mirror
 *---------------------------------------------------------------*/
static void write_callback(unsigned long error, void *context)
{
	unsigned int i;
	int uptodate = 1;
	struct bio *bio = (struct bio *) context;
	struct mirror_set *ms;

	ms = bio_get_ms(bio);
	bio_set_ms(bio, NULL);

	/*
	 * NOTE: We don't decrement the pending count here,
	 * instead it is done by the target's end_io function.
	 * This way we handle both writes to SYNC and NOSYNC
	 * regions with the same code.
	 */

	if (error) {
		/*
		 * only error the io if all mirrors failed.
		 * FIXME: bogus
		 */
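		/*
		 * NOTE: uptodate is computed below but not yet acted upon;
		 * the bio always completes successfully.
		 */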
		uptodate = 0;
		for (i = 0; i < ms->nr_mirrors; i++)
			if (!test_bit(i, &error)) {
				uptodate = 1;
				break;
			}
	}
	bio_endio(bio, bio->bi_size, 0);
}

static void do_write(struct mirror_set *ms, struct bio *bio)
{
	unsigned int i;
	struct io_region io[KCOPYD_MAX_REGIONS+1];
	struct mirror *m;

	for (i = 0; i < ms->nr_mirrors; i++) {
		m = ms->mirror + i;

		io[i].bdev = m->dev->bdev;
		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
		io[i].count = bio->bi_size >> 9;
	}

	bio_set_ms(bio, ms);
	dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
			 bio->bi_io_vec + bio->bi_idx,
			 write_callback, bio);
}

static void do_writes(struct mirror_set *ms, struct bio_list *writes)
{
	int state;
	struct bio *bio;
	struct bio_list sync, nosync, recover, *this_list = NULL;

	if (!writes->head)
		return;

	/*
	 * Classify each write.
	 */
	bio_list_init(&sync);
	bio_list_init(&nosync);
	bio_list_init(&recover);

	while ((bio = bio_list_pop(writes))) {
		state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
		switch (state) {
		case RH_CLEAN:
		case RH_DIRTY:
			this_list = &sync;
			break;

		case RH_NOSYNC:
			this_list = &nosync;
			break;

		case RH_RECOVERING:
			this_list = &recover;
			break;
		}

		bio_list_add(this_list, bio);
	}

	/*
	 * Increment the pending counts for any regions that will
	 * be written to (writes to recover regions are going to
	 * be delayed).
	 */
	rh_inc_pending(&ms->rh, &sync);
	rh_inc_pending(&ms->rh, &nosync);
	rh_flush(&ms->rh);

	/*
	 * Dispatch io.
	 */
	while ((bio = bio_list_pop(&sync)))
		do_write(ms, bio);

	while ((bio = bio_list_pop(&recover)))
		rh_delay(&ms->rh, bio);

	while ((bio = bio_list_pop(&nosync))) {
		map_bio(ms, ms->default_mirror, bio);
		generic_make_request(bio);
	}
}

/*-----------------------------------------------------------------
 * kmirrord
 *---------------------------------------------------------------*/
static LIST_HEAD(_mirror_sets);
static DECLARE_RWSEM(_mirror_sets_lock);

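/*
 * One pass of the kmirrord worker for a single mirror set: snapshot the
 * queued bios, fold finished regions back into the log, kick off any
 * pending recovery, then issue the reads and writes.
 */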
static void do_mirror(struct mirror_set *ms)
{
	struct bio_list reads, writes;

	spin_lock(&ms->lock);
	reads = ms->reads;
	writes = ms->writes;
	bio_list_init(&ms->reads);
	bio_list_init(&ms->writes);
	spin_unlock(&ms->lock);

	rh_update_states(&ms->rh);
	do_recovery(ms);
	do_reads(ms, &reads);
	do_writes(ms, &writes);
}

static void do_work(void *ignored)
{
	struct mirror_set *ms;

	down_read(&_mirror_sets_lock);
	list_for_each_entry (ms, &_mirror_sets, list)
		do_mirror(ms);
	up_read(&_mirror_sets_lock);
}

/*-----------------------------------------------------------------
 * Target functions
 *---------------------------------------------------------------*/
static struct mirror_set *alloc_context(unsigned int nr_mirrors,
					uint32_t region_size,
					struct dm_target *ti,
					struct dirty_log *dl)
{
	size_t len;
	struct mirror_set *ms = NULL;

	if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
		return NULL;

	len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);

	ms = kmalloc(len, GFP_KERNEL);
	if (!ms) {
		ti->error = "Cannot allocate mirror context";
		return NULL;
	}

	memset(ms, 0, len);
	spin_lock_init(&ms->lock);

	ms->ti = ti;
	ms->nr_mirrors = nr_mirrors;
	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
	ms->in_sync = 0;
	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];

	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
		ti->error = "Error creating dirty region hash";
		kfree(ms);
		return NULL;
	}

	return ms;
}

static void free_context(struct mirror_set *ms, struct dm_target *ti,
			 unsigned int m)
{
	while (m--)
		dm_put_device(ti, ms->mirror[m].dev);

	rh_exit(&ms->rh);
	kfree(ms);
}

static inline int _check_region_size(struct dm_target *ti, uint32_t size)
{
	return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
		 size > ti->len);
}

static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
		      unsigned int mirror, char **argv)
{
	unsigned long long offset;

	if (sscanf(argv[1], "%llu", &offset) != 1) {
		ti->error = "Invalid offset";
		return -EINVAL;
	}

	if (dm_get_device(ti, argv[0], offset, ti->len,
			  dm_table_get_mode(ti->table),
			  &ms->mirror[mirror].dev)) {
		ti->error = "Device lookup failure";
		return -ENXIO;
	}

	ms->mirror[mirror].offset = offset;

	return 0;
}

static int add_mirror_set(struct mirror_set *ms)
{
	down_write(&_mirror_sets_lock);
	list_add_tail(&ms->list, &_mirror_sets);
	up_write(&_mirror_sets_lock);
	wake();

	return 0;
}

static void del_mirror_set(struct mirror_set *ms)
{
	down_write(&_mirror_sets_lock);
	list_del(&ms->list);
	up_write(&_mirror_sets_lock);
}

/*
 * Create dirty log: log_type #log_params <log_params>
 */
static struct dirty_log *create_dirty_log(struct dm_target *ti,
					  unsigned int argc, char **argv,
					  unsigned int *args_used)
{
	unsigned int param_count;
	struct dirty_log *dl;

	if (argc < 2) {
		ti->error = "Insufficient mirror log arguments";
		return NULL;
	}

	if (sscanf(argv[1], "%u", &param_count) != 1) {
		ti->error = "Invalid mirror log argument count";
		return NULL;
	}

	*args_used = 2 + param_count;

	if (argc < *args_used) {
		ti->error = "Insufficient mirror log arguments";
		return NULL;
	}

	dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
	if (!dl) {
		ti->error = "Error creating mirror dirty log";
		return NULL;
	}

	if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
		ti->error = "Invalid region size";
		dm_destroy_dirty_log(dl);
		return NULL;
	}

	return dl;
}

/*
 * Construct a mirror mapping:
 *
 * log_type #log_params <log_params>
 * #mirrors [mirror_path offset]{2,}
 *
 * log_type is "core" or "disk"
 * #log_params is between 1 and 3
 */
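/*
 * For example (hypothetical devices), a two-way mirror with a core log and
 * 1024-sector regions might be loaded with a table line such as:
 *
 *     0 819200 mirror core 1 1024 2 /dev/sda1 0 /dev/sdb1 0
 */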
#define DM_IO_PAGES 64
static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	unsigned int nr_mirrors, m, args_used;
	struct mirror_set *ms;
	struct dirty_log *dl;

	dl = create_dirty_log(ti, argc, argv, &args_used);
	if (!dl)
		return -EINVAL;

	argv += args_used;
	argc -= args_used;

	if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
	    nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
		ti->error = "Invalid number of mirrors";
		dm_destroy_dirty_log(dl);
		return -EINVAL;
	}

	argv++, argc--;

	if (argc != nr_mirrors * 2) {
		ti->error = "Wrong number of mirror arguments";
		dm_destroy_dirty_log(dl);
		return -EINVAL;
	}

	ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
	if (!ms) {
		dm_destroy_dirty_log(dl);
		return -ENOMEM;
	}

	/* Get the mirror parameter sets */
	for (m = 0; m < nr_mirrors; m++) {
		r = get_mirror(ms, ti, m, argv);
		if (r) {
			free_context(ms, ti, m);
			return r;
		}
		argv += 2;
		argc -= 2;
	}

	ti->private = ms;
	ti->split_io = ms->rh.region_size;

	r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
	if (r) {
		free_context(ms, ti, ms->nr_mirrors);
		return r;
	}

	add_mirror_set(ms);
	return 0;
}

static void mirror_dtr(struct dm_target *ti)
{
	struct mirror_set *ms = (struct mirror_set *) ti->private;

	del_mirror_set(ms);
	kcopyd_client_destroy(ms->kcopyd_client);
	free_context(ms, ti, ms->nr_mirrors);
}

static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
{
	int should_wake = 0;
	struct bio_list *bl;

	bl = (rw == WRITE) ? &ms->writes : &ms->reads;
	spin_lock(&ms->lock);
	should_wake = !(bl->head);
	bio_list_add(bl, bio);
	spin_unlock(&ms->lock);

	if (should_wake)
		wake();
}

/*
 * Mirror mapping function
 */
static int mirror_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	int r, rw = bio_rw(bio);
	struct mirror *m;
	struct mirror_set *ms = ti->private;

	map_context->ll = bio_to_region(&ms->rh, bio);

	if (rw == WRITE) {
		queue_bio(ms, bio, rw);
		return 0;
	}

	r = ms->rh.log->type->in_sync(ms->rh.log,
				      bio_to_region(&ms->rh, bio), 0);
	if (r < 0 && r != -EWOULDBLOCK)
		return r;

	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
		r = 0;

	/*
	 * We don't want to fast track a recovery just for a read
	 * ahead, so we just let it fail silently.
	 * FIXME: get rid of this.
	 */
	if (!r && rw == READA)
		return -EIO;

	if (!r) {
		/* Pass this io over to the daemon */
		queue_bio(ms, bio, rw);
		return 0;
	}

	m = choose_mirror(ms, bio->bi_sector);
	if (!m)
		return -EIO;

	map_bio(ms, m, bio);
	return 1;
}

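/*
 * Called by the dm core when the bio completes.  Writes took a pending
 * reference on their region in rh_inc_pending(), so drop it here.
 */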
static int mirror_end_io(struct dm_target *ti, struct bio *bio,
			 int error, union map_info *map_context)
{
	int rw = bio_rw(bio);
	struct mirror_set *ms = (struct mirror_set *) ti->private;
	region_t region = map_context->ll;

	/*
	 * We need to dec pending if this was a write.
	 */
	if (rw == WRITE)
		rh_dec(&ms->rh, region);

	return 0;
}

static void mirror_postsuspend(struct dm_target *ti)
{
	struct mirror_set *ms = (struct mirror_set *) ti->private;
	struct dirty_log *log = ms->rh.log;

	rh_stop_recovery(&ms->rh);
	if (log->type->suspend && log->type->suspend(log))
		/* FIXME: need better error handling */
		DMWARN("log suspend failed");
}

static void mirror_resume(struct dm_target *ti)
{
	struct mirror_set *ms = (struct mirror_set *) ti->private;
	struct dirty_log *log = ms->rh.log;
	if (log->type->resume && log->type->resume(log))
		/* FIXME: need better error handling */
		DMWARN("log resume failed");
	rh_start_recovery(&ms->rh);
}

static int mirror_status(struct dm_target *ti, status_type_t type,
			 char *result, unsigned int maxlen)
{
	unsigned int m, sz;
	struct mirror_set *ms = (struct mirror_set *) ti->private;

	sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%d ", ms->nr_mirrors);
		for (m = 0; m < ms->nr_mirrors; m++)
			DMEMIT("%s ", ms->mirror[m].dev->name);

		DMEMIT("%llu/%llu",
		       (unsigned long long)ms->rh.log->type->get_sync_count(ms->rh.log),
		       (unsigned long long)ms->nr_regions);
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%d ", ms->nr_mirrors);
		for (m = 0; m < ms->nr_mirrors; m++)
			DMEMIT("%s %llu ", ms->mirror[m].dev->name,
			       (unsigned long long)ms->mirror[m].offset);
	}

	return 0;
}

static struct target_type mirror_target = {
	.name	 = "mirror",
	.version = {1, 0, 2},
	.module	 = THIS_MODULE,
	.ctr	 = mirror_ctr,
	.dtr	 = mirror_dtr,
	.map	 = mirror_map,
	.end_io	 = mirror_end_io,
	.postsuspend = mirror_postsuspend,
	.resume	 = mirror_resume,
	.status	 = mirror_status,
};

static int __init dm_mirror_init(void)
{
	int r;

	r = dm_dirty_log_init();
	if (r)
		return r;

	_kmirrord_wq = create_singlethread_workqueue("kmirrord");
	if (!_kmirrord_wq) {
		DMERR("couldn't start kmirrord");
		dm_dirty_log_exit();
		return -ENOMEM;
	}
	INIT_WORK(&_kmirrord_work, do_work, NULL);

	r = dm_register_target(&mirror_target);
	if (r < 0) {
		DMERR("%s: Failed to register mirror target",
		      mirror_target.name);
		dm_dirty_log_exit();
		destroy_workqueue(_kmirrord_wq);
	}

	return r;
}

static void __exit dm_mirror_exit(void)
{
	int r;

	r = dm_unregister_target(&mirror_target);
	if (r < 0)
		DMERR("%s: unregister failed %d", mirror_target.name, r);

	destroy_workqueue(_kmirrord_wq);
	dm_dirty_log_exit();
}

/* Module hooks */
module_init(dm_mirror_init);
module_exit(dm_mirror_exit);

MODULE_DESCRIPTION(DM_NAME " mirror target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");