/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/dm-dirty-log.h>
#include <linux/dm-region-hash.h>

#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include "dm.h"

#define DM_MSG_PREFIX	"region hash"

/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions.  Each
 * region can be in one of three states: clean, dirty,
 * nosync.  There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 * clean_regions: Regions on this list have no io pending to
 * them, they are in sync, we are no longer interested in them,
 * they are dull.  dm_rh_update_states() will remove them from the
 * hash table.
 *
 * quiesced_regions: These regions have been spun down, ready
 * for recovery.  dm_rh_recovery_start() will remove regions from
 * this list and hand them to kmirrord, which will schedule the
 * recovery io with kcopyd.
 *
 * recovered_regions: Regions that kcopyd has successfully
 * recovered.  dm_rh_update_states() will now schedule any delayed
 * io, up the recovery_count, and remove the region from the
 * hash.
 *
 * There are 2 locks:
 * A rw spin lock 'hash_lock' protects just the hash table,
 * this is never held in write mode from interrupt context,
 * which I believe means that we only have to disable irqs when
 * doing a write lock.
 *
 * An ordinary spin lock 'region_lock' that protects the three
 * lists in the region_hash, together with the 'state', 'list' and
 * 'delayed_bios' fields of the regions.  This is used from irq
 * context, so all other uses will have to disable local irqs.
 *---------------------------------------------------------------*/
struct dm_region_hash {
	uint32_t region_size;
	unsigned region_shift;

	/* holds persistent region state */
	struct dm_dirty_log *log;

	/* hash table */
	rwlock_t hash_lock;
	mempool_t *region_pool;
	unsigned mask;
	unsigned nr_buckets;
	unsigned prime;
	unsigned shift;
	struct list_head *buckets;

	unsigned max_recovery; /* Max # of regions to recover in parallel */

	spinlock_t region_lock;
	atomic_t recovery_in_flight;
	struct semaphore recovery_count;
	struct list_head clean_regions;
	struct list_head quiesced_regions;
	struct list_head recovered_regions;
	struct list_head failed_recovered_regions;

	void *context;
	sector_t target_begin;

	/* Callback function to schedule bio writes */
	void (*dispatch_bios)(void *context, struct bio_list *bios);

	/* Callback function to wake up the caller's worker thread. */
	void (*wakeup_workers)(void *context);

	/* Callback function to wake up the caller's recovery waiters. */
	void (*wakeup_all_recovery_waiters)(void *context);
};

struct dm_region {
	struct dm_region_hash *rh;	/* FIXME: can we get rid of this ? */
	region_t key;
	int state;

	struct list_head hash_list;
	struct list_head list;

	atomic_t pending;
	struct bio_list delayed_bios;
};

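/*
 * Rough usage sketch (illustrative only): a mirror-style caller creates
 * the hash with its own callbacks and then drives the write and worker
 * paths through the exported functions below.  The names used here
 * (ms_dispatch_bios, ms_wakeup, ms_wakeup_recovery_waiters, MAX_RECOVERY)
 * are hypothetical stand-ins for whatever the caller provides.
 *
 *	rh = dm_region_hash_create(ms, ms_dispatch_bios, ms_wakeup,
 *				   ms_wakeup_recovery_waiters, ti->begin,
 *				   MAX_RECOVERY, log, region_size,
 *				   nr_regions);
 *
 *	Write path:	dm_rh_inc_pending() before issuing a batch of
 *			writes, dm_rh_dec() from each write's end_io.
 *	Worker thread:	dm_rh_update_states(), dm_rh_recovery_prepare(),
 *			dm_rh_recovery_start() / dm_rh_recovery_end().
 */
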
/*
 * Conversion fns
 */
static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
{
	return sector >> rh->region_shift;
}

sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
{
	return region << rh->region_shift;
}
EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);

region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
{
	return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
}
EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);

void *dm_rh_region_context(struct dm_region *reg)
{
	return reg->rh->context;
}
EXPORT_SYMBOL_GPL(dm_rh_region_context);

region_t dm_rh_get_region_key(struct dm_region *reg)
{
	return reg->key;
}
EXPORT_SYMBOL_GPL(dm_rh_get_region_key);

sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
{
	return rh->region_size;
}
EXPORT_SYMBOL_GPL(dm_rh_get_region_size);

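/*
 * Worked example (illustrative): with region_size = 1024 sectors,
 * dm_region_hash_create() below sets region_shift = ffs(1024) - 1 = 10,
 * so a bio starting 5000 sectors past target_begin maps to region
 * 5000 >> 10 = 4, and dm_rh_region_to_sector(rh, 4) returns 4 << 10 = 4096.
 */
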
/*
 * FIXME: shall we pass in a structure instead of all these args to
 * dm_region_hash_create()????
 */
#define RH_HASH_MULT 2654435387U
#define RH_HASH_SHIFT 12

#define MIN_REGIONS 64
struct dm_region_hash *dm_region_hash_create(
		void *context, void (*dispatch_bios)(void *context,
						     struct bio_list *bios),
		void (*wakeup_workers)(void *context),
		void (*wakeup_all_recovery_waiters)(void *context),
		sector_t target_begin, unsigned max_recovery,
		struct dm_dirty_log *log, uint32_t region_size,
		region_t nr_regions)
{
	struct dm_region_hash *rh;
	unsigned nr_buckets, max_buckets;
	size_t i;

	/*
	 * Calculate a suitable number of buckets for our hash
	 * table.
	 */
	max_buckets = nr_regions >> 6;
	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
		;
	nr_buckets >>= 1;

	rh = kmalloc(sizeof(*rh), GFP_KERNEL);
	if (!rh) {
		DMERR("unable to allocate region hash memory");
		return ERR_PTR(-ENOMEM);
	}

	rh->context = context;
	rh->dispatch_bios = dispatch_bios;
	rh->wakeup_workers = wakeup_workers;
	rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
	rh->target_begin = target_begin;
	rh->max_recovery = max_recovery;
	rh->log = log;
	rh->region_size = region_size;
	rh->region_shift = ffs(region_size) - 1;
	rwlock_init(&rh->hash_lock);
	rh->mask = nr_buckets - 1;
	rh->nr_buckets = nr_buckets;

	rh->shift = RH_HASH_SHIFT;
	rh->prime = RH_HASH_MULT;

	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
	if (!rh->buckets) {
		DMERR("unable to allocate region hash bucket memory");
		kfree(rh);
		return ERR_PTR(-ENOMEM);
	}

	for (i = 0; i < nr_buckets; i++)
		INIT_LIST_HEAD(rh->buckets + i);

	spin_lock_init(&rh->region_lock);
	sema_init(&rh->recovery_count, 0);
	atomic_set(&rh->recovery_in_flight, 0);
	INIT_LIST_HEAD(&rh->clean_regions);
	INIT_LIST_HEAD(&rh->quiesced_regions);
	INIT_LIST_HEAD(&rh->recovered_regions);
	INIT_LIST_HEAD(&rh->failed_recovered_regions);

	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
						      sizeof(struct dm_region));
	if (!rh->region_pool) {
		vfree(rh->buckets);
		kfree(rh);
		rh = ERR_PTR(-ENOMEM);
	}

	return rh;
}
EXPORT_SYMBOL_GPL(dm_region_hash_create);

void dm_region_hash_destroy(struct dm_region_hash *rh)
{
	unsigned h;
	struct dm_region *reg, *nreg;

	BUG_ON(!list_empty(&rh->quiesced_regions));
	for (h = 0; h < rh->nr_buckets; h++) {
		list_for_each_entry_safe(reg, nreg, rh->buckets + h,
					 hash_list) {
			BUG_ON(atomic_read(&reg->pending));
			mempool_free(reg, rh->region_pool);
		}
	}

	if (rh->log)
		dm_dirty_log_destroy(rh->log);

	if (rh->region_pool)
		mempool_destroy(rh->region_pool);

	vfree(rh->buckets);
	kfree(rh);
}
EXPORT_SYMBOL_GPL(dm_region_hash_destroy);

struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
{
	return rh->log;
}
EXPORT_SYMBOL_GPL(dm_rh_dirty_log);

static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
{
	return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
}

static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
{
	struct dm_region *reg;
	struct list_head *bucket = rh->buckets + rh_hash(rh, region);

	list_for_each_entry(reg, bucket, hash_list)
		if (reg->key == region)
			return reg;

	return NULL;
}

static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
{
	list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
}

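/*
 * __rh_find() below is entered with the hash read lock held.  If the
 * region is not yet in the hash it drops that lock, allocates a new
 * region here in __rh_alloc() and re-checks under the write lock,
 * since another CPU may have inserted the same region while the lock
 * was dropped; in that case the freshly allocated region is freed.
 */
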
static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
{
	struct dm_region *reg, *nreg;

	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
	if (unlikely(!nreg))
		nreg = kmalloc(sizeof(*nreg), GFP_NOIO);

	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
		      DM_RH_CLEAN : DM_RH_NOSYNC;
	nreg->rh = rh;
	nreg->key = region;
	INIT_LIST_HEAD(&nreg->list);
	atomic_set(&nreg->pending, 0);
	bio_list_init(&nreg->delayed_bios);

	write_lock_irq(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	if (reg)
		/* We lost the race. */
		mempool_free(nreg, rh->region_pool);
	else {
		__rh_insert(rh, nreg);
		if (nreg->state == DM_RH_CLEAN) {
			spin_lock(&rh->region_lock);
			list_add(&nreg->list, &rh->clean_regions);
			spin_unlock(&rh->region_lock);
		}

		reg = nreg;
	}
	write_unlock_irq(&rh->hash_lock);

	return reg;
}

static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
{
	struct dm_region *reg;

	reg = __rh_lookup(rh, region);
	if (!reg) {
		read_unlock(&rh->hash_lock);
		reg = __rh_alloc(rh, region);
		read_lock(&rh->hash_lock);
	}

	return reg;
}

int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
{
	int r;
	struct dm_region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	if (reg)
		return reg->state;

	/*
	 * The region wasn't in the hash, so we fall back to the
	 * dirty log.
	 */
	r = rh->log->type->in_sync(rh->log, region, may_block);

	/*
	 * Any error from the dirty log (e.g. -EWOULDBLOCK) gets
	 * taken as a DM_RH_NOSYNC
	 */
	return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
}
EXPORT_SYMBOL_GPL(dm_rh_get_state);

static void complete_resync_work(struct dm_region *reg, int success)
{
	struct dm_region_hash *rh = reg->rh;

	rh->log->type->set_region_sync(rh->log, reg->key, success);

	/*
	 * Dispatch the bios before we call 'wake_up_all'.
	 * This is important because if we are suspending,
	 * we want to know that recovery is complete and
	 * the work queue is flushed.  If we wake_up_all
	 * before we dispatch_bios (queue bios and call wake()),
	 * then we risk suspending before the work queue
	 * has been properly flushed.
	 */
	rh->dispatch_bios(rh->context, &reg->delayed_bios);
	if (atomic_dec_and_test(&rh->recovery_in_flight))
		rh->wakeup_all_recovery_waiters(rh->context);
	up(&rh->recovery_count);
}

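/*
 * Summary of region state transitions as implemented in this file
 * (descriptive only):
 *
 *	__rh_alloc():		region enters the hash as DM_RH_CLEAN or
 *				DM_RH_NOSYNC, depending on the dirty log.
 *	rh_inc():		DM_RH_CLEAN -> DM_RH_DIRTY on the first
 *				pending write.
 *	dm_rh_dec():		DM_RH_DIRTY -> DM_RH_CLEAN once the last
 *				pending write completes.
 *	dm_rh_mark_nosync():	any state -> DM_RH_NOSYNC after a failed
 *				mirror write.
 *	__rh_recovery_prepare(): -> DM_RH_RECOVERING when the dirty log
 *				hands out resync work.
 *	dm_rh_update_states():	recovered, failed and clean regions are
 *				dropped from the hash and freed.
 */
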
/* dm_rh_mark_nosync
 * @rh
 * @bio
 * @done
 * @error
 *
 * The bio was written on some mirror(s) but failed on other mirror(s).
 * We can successfully endio the bio but should avoid the region being
 * marked clean by setting the state DM_RH_NOSYNC.
 *
 * This function is _not_ safe in interrupt context!
 */
void dm_rh_mark_nosync(struct dm_region_hash *rh,
		       struct bio *bio, unsigned done, int error)
{
	unsigned long flags;
	struct dm_dirty_log *log = rh->log;
	struct dm_region *reg;
	region_t region = dm_rh_bio_to_region(rh, bio);
	int recovering = 0;

	/* We must inform the log that the sync count has changed. */
	log->type->set_region_sync(log, region, 0);

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);
	read_unlock(&rh->hash_lock);

	/* region hash entry should exist because write was in-flight */
	BUG_ON(!reg);
	BUG_ON(!list_empty(&reg->list));

	spin_lock_irqsave(&rh->region_lock, flags);
	/*
	 * Possible cases:
	 *   1) DM_RH_DIRTY
	 *   2) DM_RH_NOSYNC: was dirty, other preceding writes failed
	 *   3) DM_RH_RECOVERING: flushing pending writes
	 * In any of these cases the region should not be connected to a list.
	 */
	recovering = (reg->state == DM_RH_RECOVERING);
	reg->state = DM_RH_NOSYNC;
	BUG_ON(!list_empty(&reg->list));
	spin_unlock_irqrestore(&rh->region_lock, flags);

	bio_endio(bio, error);
	if (recovering)
		complete_resync_work(reg, 0);
}
EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);

void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
{
	struct dm_region *reg, *next;

	LIST_HEAD(clean);
	LIST_HEAD(recovered);
	LIST_HEAD(failed_recovered);

	/*
	 * Quickly grab the lists.
	 */
	write_lock_irq(&rh->hash_lock);
	spin_lock(&rh->region_lock);
	if (!list_empty(&rh->clean_regions)) {
		list_splice_init(&rh->clean_regions, &clean);

		list_for_each_entry(reg, &clean, list)
			list_del(&reg->hash_list);
	}

	if (!list_empty(&rh->recovered_regions)) {
		list_splice_init(&rh->recovered_regions, &recovered);

		list_for_each_entry(reg, &recovered, list)
			list_del(&reg->hash_list);
	}

	if (!list_empty(&rh->failed_recovered_regions)) {
		list_splice_init(&rh->failed_recovered_regions,
				 &failed_recovered);

		list_for_each_entry(reg, &failed_recovered, list)
			list_del(&reg->hash_list);
	}

	spin_unlock(&rh->region_lock);
	write_unlock_irq(&rh->hash_lock);

	/*
	 * All the regions on the recovered and clean lists have
	 * now been pulled out of the system, so no need to do
	 * any more locking.
	 */
	list_for_each_entry_safe(reg, next, &recovered, list) {
		rh->log->type->clear_region(rh->log, reg->key);
		complete_resync_work(reg, 1);
		mempool_free(reg, rh->region_pool);
	}

	list_for_each_entry_safe(reg, next, &failed_recovered, list) {
		complete_resync_work(reg, errors_handled ? 0 : 1);
		mempool_free(reg, rh->region_pool);
	}

	list_for_each_entry_safe(reg, next, &clean, list) {
		rh->log->type->clear_region(rh->log, reg->key);
		mempool_free(reg, rh->region_pool);
	}

	rh->log->type->flush(rh->log);
}
EXPORT_SYMBOL_GPL(dm_rh_update_states);

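/*
 * Write-path bookkeeping: dm_rh_inc_pending() must be called for a set
 * of writes before they are issued; it takes each region off the clean
 * list and marks it dirty in the log.  dm_rh_dec() is called as each
 * write completes and, once the last pending write for a region is
 * done, moves the region back to the clean list (or onto the quiesced
 * list if recovery is waiting for it) and wakes the worker.
 */
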
static void rh_inc(struct dm_region_hash *rh, region_t region)
{
	struct dm_region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);

	spin_lock_irq(&rh->region_lock);
	atomic_inc(&reg->pending);

	if (reg->state == DM_RH_CLEAN) {
		reg->state = DM_RH_DIRTY;
		list_del_init(&reg->list);	/* take off the clean list */
		spin_unlock_irq(&rh->region_lock);

		rh->log->type->mark_region(rh->log, reg->key);
	} else
		spin_unlock_irq(&rh->region_lock);

	read_unlock(&rh->hash_lock);
}

void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
{
	struct bio *bio;

	for (bio = bios->head; bio; bio = bio->bi_next)
		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
EXPORT_SYMBOL_GPL(dm_rh_inc_pending);

void dm_rh_dec(struct dm_region_hash *rh, region_t region)
{
	unsigned long flags;
	struct dm_region *reg;
	int should_wake = 0;

	read_lock(&rh->hash_lock);
	reg = __rh_lookup(rh, region);
	read_unlock(&rh->hash_lock);

	spin_lock_irqsave(&rh->region_lock, flags);
	if (atomic_dec_and_test(&reg->pending)) {
		/*
		 * There is no pending I/O for this region.
		 * We can move the region to the corresponding list for the
		 * next action.  At this point, the region is not yet
		 * connected to any list.
		 *
		 * If the state is DM_RH_NOSYNC, the region should be kept
		 * off the clean list.
		 * The hash entry for DM_RH_NOSYNC will remain in memory
		 * until the region is recovered or the map is reloaded.
		 */

		/* do nothing for DM_RH_NOSYNC */
		if (reg->state == DM_RH_RECOVERING) {
			list_add_tail(&reg->list, &rh->quiesced_regions);
		} else if (reg->state == DM_RH_DIRTY) {
			reg->state = DM_RH_CLEAN;
			list_add(&reg->list, &rh->clean_regions);
		}
		should_wake = 1;
	}
	spin_unlock_irqrestore(&rh->region_lock, flags);

	if (should_wake)
		rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_dec);

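/*
 * Recovery flow for the functions below: the caller's worker calls
 * dm_rh_recovery_prepare() to pull work from the dirty log and start
 * quiescing those regions, dm_rh_recovery_start() to fetch a quiesced
 * region whose recovery io it then schedules with kcopyd, and
 * dm_rh_recovery_end() when the copy finishes.  dm_rh_update_states()
 * later dispatches the delayed bios and releases the region.
 */
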
/*
 * Start quiescing a region in preparation for recovery.
 */
static int __rh_recovery_prepare(struct dm_region_hash *rh)
{
	int r;
	region_t region;
	struct dm_region *reg;

	/*
	 * Ask the dirty log what's next.
	 */
	r = rh->log->type->get_resync_work(rh->log, &region);
	if (r <= 0)
		return r;

	/*
	 * Get this region, and start it quiescing by setting the
	 * recovering flag.
	 */
	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, region);
	read_unlock(&rh->hash_lock);

	spin_lock_irq(&rh->region_lock);
	reg->state = DM_RH_RECOVERING;

	/* Already quiesced ? */
	if (atomic_read(&reg->pending))
		list_del_init(&reg->list);
	else
		list_move(&reg->list, &rh->quiesced_regions);

	spin_unlock_irq(&rh->region_lock);

	return 1;
}

void dm_rh_recovery_prepare(struct dm_region_hash *rh)
{
	/* Extra reference to avoid race with dm_rh_stop_recovery */
	atomic_inc(&rh->recovery_in_flight);

	while (!down_trylock(&rh->recovery_count)) {
		atomic_inc(&rh->recovery_in_flight);
		if (__rh_recovery_prepare(rh) <= 0) {
			atomic_dec(&rh->recovery_in_flight);
			up(&rh->recovery_count);
			break;
		}
	}

	/* Drop the extra reference */
	if (atomic_dec_and_test(&rh->recovery_in_flight))
		rh->wakeup_all_recovery_waiters(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);

/*
 * Return a quiesced region, if one is available.
 */
struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
{
	struct dm_region *reg = NULL;

	spin_lock_irq(&rh->region_lock);
	if (!list_empty(&rh->quiesced_regions)) {
		reg = list_entry(rh->quiesced_regions.next,
				 struct dm_region, list);
		list_del_init(&reg->list);	/* remove from the quiesced list */
	}
	spin_unlock_irq(&rh->region_lock);

	return reg;
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_start);

void dm_rh_recovery_end(struct dm_region *reg, int success)
{
	struct dm_region_hash *rh = reg->rh;

	spin_lock_irq(&rh->region_lock);
	if (success)
		list_add(&reg->list, &reg->rh->recovered_regions);
	else {
		reg->state = DM_RH_NOSYNC;
		list_add(&reg->list, &reg->rh->failed_recovered_regions);
	}
	spin_unlock_irq(&rh->region_lock);

	rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_end);

/* Return recovery in flight count. */
int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
{
	return atomic_read(&rh->recovery_in_flight);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);

int dm_rh_flush(struct dm_region_hash *rh)
{
	return rh->log->type->flush(rh->log);
}
EXPORT_SYMBOL_GPL(dm_rh_flush);

void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
{
	struct dm_region *reg;

	read_lock(&rh->hash_lock);
	reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
	bio_list_add(&reg->delayed_bios, bio);
	read_unlock(&rh->hash_lock);
}
EXPORT_SYMBOL_GPL(dm_rh_delay);

void dm_rh_stop_recovery(struct dm_region_hash *rh)
{
	int i;

	/* wait for any recovering regions */
	for (i = 0; i < rh->max_recovery; i++)
		down(&rh->recovery_count);
}
EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);

void dm_rh_start_recovery(struct dm_region_hash *rh)
{
	int i;

	for (i = 0; i < rh->max_recovery; i++)
		up(&rh->recovery_count);

	rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_start_recovery);

MODULE_DESCRIPTION(DM_NAME " region hash");
MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");