/*
 * Copyright (C) 2003 Sistina Software Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/dm-dirty-log.h>
#include <linux/dm-region-hash.h>

#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include "dm.h"
#include "dm-bio-list.h"

#define DM_MSG_PREFIX "region hash"

/*-----------------------------------------------------------------
 * Region hash
 *
 * The mirror splits itself up into discrete regions. Each
 * region can be in one of three states: clean, dirty,
 * nosync. There is no need to put clean regions in the hash.
 *
 * In addition to being present in the hash table a region _may_
 * be present on one of three lists.
 *
 *   clean_regions: Regions on this list have no io pending to
 *   them, they are in sync, we are no longer interested in them,
 *   they are dull. dm_rh_update_states() will remove them from the
 *   hash table.
 *
 *   quiesced_regions: These regions have been spun down, ready
 *   for recovery. dm_rh_recovery_start() will remove regions from
 *   this list and hand them to kmirrord, which will schedule the
 *   recovery io with kcopyd.
 *
 *   recovered_regions: Regions that kcopyd has successfully
 *   recovered. dm_rh_update_states() will now schedule any delayed
 *   io, up the recovery_count, and remove the region from the
 *   hash.
 *
 * There are 2 locks:
 *   A rw spin lock 'hash_lock' protects just the hash table,
 *   this is never held in write mode from interrupt context,
 *   which I believe means that we only have to disable irqs when
 *   doing a write lock.
 *
 *   An ordinary spin lock 'region_lock' that protects the three
 *   lists in the region_hash, along with the 'state', 'list' and
 *   'delayed_bios' fields of the regions. This is used from irq
 *   context, so all other uses will have to suspend local irqs.
 *---------------------------------------------------------------*/
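/*
 * Illustrative sketch (not part of this interface): a typical write path in
 * a caller looks roughly like the following.  'ms', the 'writes' bio_list
 * and the endio hook are hypothetical caller-side details; the region hash
 * only requires that every write is announced with dm_rh_inc_pending()
 * before being issued and balanced by a dm_rh_dec() when it completes.
 *
 *      dm_rh_inc_pending(ms->rh, &writes);     // mark regions dirty
 *      issue the queued bios;                  // e.g. generic_make_request()
 *      ...
 *      // later, from the caller's write endio path:
 *      dm_rh_dec(ms->rh, dm_rh_bio_to_region(ms->rh, bio));
 */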
struct dm_region_hash {
        uint32_t region_size;
        unsigned region_shift;

        /* holds persistent region state */
        struct dm_dirty_log *log;

        /* hash table */
        rwlock_t hash_lock;
        mempool_t *region_pool;
        unsigned mask;
        unsigned nr_buckets;
        unsigned prime;
        unsigned shift;
        struct list_head *buckets;

        unsigned max_recovery; /* Max # of regions to recover in parallel */

        spinlock_t region_lock;
        atomic_t recovery_in_flight;
        struct semaphore recovery_count;
        struct list_head clean_regions;
        struct list_head quiesced_regions;
        struct list_head recovered_regions;
        struct list_head failed_recovered_regions;

        void *context;
        sector_t target_begin;

        /* Callback function to schedule bio writes */
        void (*dispatch_bios)(void *context, struct bio_list *bios);

        /* Callback function to wake up the caller's worker thread. */
        void (*wakeup_workers)(void *context);

        /* Callback function to wake up the caller's recovery waiters. */
        void (*wakeup_all_recovery_waiters)(void *context);
};

struct dm_region {
        struct dm_region_hash *rh;      /* FIXME: can we get rid of this ? */
        region_t key;
        int state;

        struct list_head hash_list;
        struct list_head list;

        atomic_t pending;
        struct bio_list delayed_bios;
};

/*
 * Conversion fns
 */
static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
{
        return sector >> rh->region_shift;
}

sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
{
        return region << rh->region_shift;
}
EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);

region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
{
        return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
}
EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
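/*
 * Worked example (illustrative only): with a power-of-two region_size of
 * 1024 sectors (512 KiB), region_shift is ffs(1024) - 1 = 10.  A bio whose
 * bi_sector is 5000 on a target beginning at sector 0 therefore maps to
 * region (5000 - 0) >> 10 = 4, and dm_rh_region_to_sector(rh, 4) gives back
 * the region's first sector, 4096.
 */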
void *dm_rh_region_context(struct dm_region *reg)
{
        return reg->rh->context;
}
EXPORT_SYMBOL_GPL(dm_rh_region_context);

region_t dm_rh_get_region_key(struct dm_region *reg)
{
        return reg->key;
}
EXPORT_SYMBOL_GPL(dm_rh_get_region_key);

sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
{
        return rh->region_size;
}
EXPORT_SYMBOL_GPL(dm_rh_get_region_size);

/*
 * FIXME: shall we pass in a structure instead of all these args to
 * dm_region_hash_create()????
 */
#define RH_HASH_MULT 2654435387U
#define RH_HASH_SHIFT 12

#define MIN_REGIONS 64
struct dm_region_hash *dm_region_hash_create(
                void *context, void (*dispatch_bios)(void *context,
                                                     struct bio_list *bios),
                void (*wakeup_workers)(void *context),
                void (*wakeup_all_recovery_waiters)(void *context),
                sector_t target_begin, unsigned max_recovery,
                struct dm_dirty_log *log, uint32_t region_size,
                region_t nr_regions)
{
        struct dm_region_hash *rh;
        unsigned nr_buckets, max_buckets;
        size_t i;

        /*
         * Calculate a suitable number of buckets for our hash
         * table.
         */
        max_buckets = nr_regions >> 6;
        for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
                ;
        nr_buckets >>= 1;

        rh = kmalloc(sizeof(*rh), GFP_KERNEL);
        if (!rh) {
                DMERR("unable to allocate region hash memory");
                return ERR_PTR(-ENOMEM);
        }

        rh->context = context;
        rh->dispatch_bios = dispatch_bios;
        rh->wakeup_workers = wakeup_workers;
        rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
        rh->target_begin = target_begin;
        rh->max_recovery = max_recovery;
        rh->log = log;
        rh->region_size = region_size;
        rh->region_shift = ffs(region_size) - 1;
        rwlock_init(&rh->hash_lock);
        rh->mask = nr_buckets - 1;
        rh->nr_buckets = nr_buckets;

        rh->shift = RH_HASH_SHIFT;
        rh->prime = RH_HASH_MULT;

        rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
        if (!rh->buckets) {
                DMERR("unable to allocate region hash bucket memory");
                kfree(rh);
                return ERR_PTR(-ENOMEM);
        }

        for (i = 0; i < nr_buckets; i++)
                INIT_LIST_HEAD(rh->buckets + i);

        spin_lock_init(&rh->region_lock);
        sema_init(&rh->recovery_count, 0);
        atomic_set(&rh->recovery_in_flight, 0);
        INIT_LIST_HEAD(&rh->clean_regions);
        INIT_LIST_HEAD(&rh->quiesced_regions);
        INIT_LIST_HEAD(&rh->recovered_regions);
        INIT_LIST_HEAD(&rh->failed_recovered_regions);

        rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
                                                      sizeof(struct dm_region));
        if (!rh->region_pool) {
                vfree(rh->buckets);
                kfree(rh);
                rh = ERR_PTR(-ENOMEM);
        }

        return rh;
}
EXPORT_SYMBOL_GPL(dm_region_hash_create);
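/*
 * Illustrative sketch (not part of this interface): a target constructor
 * would typically wire the region hash up roughly like this.  The caller
 * side shown here -- 'struct mirror_set'/'ms', dispatch_bios(),
 * wakeup_mirrord(), wakeup_all_recovery_waiters(), MAX_RECOVERY and the
 * dirty log 'dl' -- is hypothetical; only the dm_region_hash_create() call
 * itself is defined by this file.  region_size is expected to be a power of
 * two, since region_shift is derived from it with ffs() and the sector
 * conversions above are pure shifts.
 *
 *      ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord,
 *                                     wakeup_all_recovery_waiters,
 *                                     ms->ti->begin, MAX_RECOVERY,
 *                                     dl, region_size, ms->nr_regions);
 *      if (IS_ERR(ms->rh))
 *              return PTR_ERR(ms->rh);
 */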
void dm_region_hash_destroy(struct dm_region_hash *rh)
{
        unsigned h;
        struct dm_region *reg, *nreg;

        BUG_ON(!list_empty(&rh->quiesced_regions));
        for (h = 0; h < rh->nr_buckets; h++) {
                list_for_each_entry_safe(reg, nreg, rh->buckets + h,
                                         hash_list) {
                        BUG_ON(atomic_read(&reg->pending));
                        mempool_free(reg, rh->region_pool);
                }
        }

        if (rh->log)
                dm_dirty_log_destroy(rh->log);

        if (rh->region_pool)
                mempool_destroy(rh->region_pool);

        vfree(rh->buckets);
        kfree(rh);
}
EXPORT_SYMBOL_GPL(dm_region_hash_destroy);

struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
{
        return rh->log;
}
EXPORT_SYMBOL_GPL(dm_rh_dirty_log);

static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
{
        return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
}

static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
{
        struct dm_region *reg;
        struct list_head *bucket = rh->buckets + rh_hash(rh, region);

        list_for_each_entry(reg, bucket, hash_list)
                if (reg->key == region)
                        return reg;

        return NULL;
}

static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
{
        list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
}

static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
{
        struct dm_region *reg, *nreg;

        nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
        if (unlikely(!nreg))
                nreg = kmalloc(sizeof(*nreg), GFP_NOIO);

        nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
                      DM_RH_CLEAN : DM_RH_NOSYNC;
        nreg->rh = rh;
        nreg->key = region;
        INIT_LIST_HEAD(&nreg->list);
        atomic_set(&nreg->pending, 0);
        bio_list_init(&nreg->delayed_bios);

        write_lock_irq(&rh->hash_lock);
        reg = __rh_lookup(rh, region);
        if (reg)
                /* We lost the race. */
                mempool_free(nreg, rh->region_pool);
        else {
                __rh_insert(rh, nreg);
                if (nreg->state == DM_RH_CLEAN) {
                        spin_lock(&rh->region_lock);
                        list_add(&nreg->list, &rh->clean_regions);
                        spin_unlock(&rh->region_lock);
                }

                reg = nreg;
        }
        write_unlock_irq(&rh->hash_lock);

        return reg;
}

static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
{
        struct dm_region *reg;

        reg = __rh_lookup(rh, region);
        if (!reg) {
                read_unlock(&rh->hash_lock);
                reg = __rh_alloc(rh, region);
                read_lock(&rh->hash_lock);
        }

        return reg;
}

int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
{
        int r;
        struct dm_region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_lookup(rh, region);
        read_unlock(&rh->hash_lock);

        if (reg)
                return reg->state;

        /*
         * The region wasn't in the hash, so we fall back to the
         * dirty log.
         */
        r = rh->log->type->in_sync(rh->log, region, may_block);

        /*
         * Any error from the dirty log (eg. -EWOULDBLOCK) gets
         * taken as a DM_RH_NOSYNC
         */
        return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
}
EXPORT_SYMBOL_GPL(dm_rh_get_state);
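/*
 * Illustrative sketch (not part of this interface): a caller's read path
 * might consult dm_rh_get_state() to decide whether a region may be read
 * from any device or only from a known-good one.  choose_any_mirror() and
 * the default_mirror fallback are hypothetical caller-side details.
 *
 *      int state = dm_rh_get_state(ms->rh,
 *                                  dm_rh_bio_to_region(ms->rh, bio), 1);
 *
 *      if (state == DM_RH_CLEAN || state == DM_RH_DIRTY)
 *              m = choose_any_mirror(ms);      // region is in sync
 *      else
 *              m = ms->default_mirror;         // nosync: primary only
 */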
static void complete_resync_work(struct dm_region *reg, int success)
{
        struct dm_region_hash *rh = reg->rh;

        rh->log->type->set_region_sync(rh->log, reg->key, success);

        /*
         * Dispatch the bios before we call 'wake_up_all'.
         * This is important because if we are suspending,
         * we want to know that recovery is complete and
         * the work queue is flushed. If we wake_up_all
         * before we dispatch_bios (queue bios and call wake()),
         * then we risk suspending before the work queue
         * has been properly flushed.
         */
        rh->dispatch_bios(rh->context, &reg->delayed_bios);
        if (atomic_dec_and_test(&rh->recovery_in_flight))
                rh->wakeup_all_recovery_waiters(rh->context);
        up(&rh->recovery_count);
}

/* dm_rh_mark_nosync
 * @rh
 * @bio
 * @done
 * @error
 *
 * The bio was written on some mirror(s) but failed on other mirror(s).
 * We can successfully endio the bio but should avoid the region being
 * marked clean by setting the state DM_RH_NOSYNC.
 *
 * This function is _not_ safe in interrupt context!
 */
void dm_rh_mark_nosync(struct dm_region_hash *rh,
                       struct bio *bio, unsigned done, int error)
{
        unsigned long flags;
        struct dm_dirty_log *log = rh->log;
        struct dm_region *reg;
        region_t region = dm_rh_bio_to_region(rh, bio);
        int recovering = 0;

        /* We must inform the log that the sync count has changed. */
        log->type->set_region_sync(log, region, 0);

        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, region);
        read_unlock(&rh->hash_lock);

        /* region hash entry should exist because write was in-flight */
        BUG_ON(!reg);
        BUG_ON(!list_empty(&reg->list));

        spin_lock_irqsave(&rh->region_lock, flags);
        /*
         * Possible cases:
         *   1) DM_RH_DIRTY
         *   2) DM_RH_NOSYNC: was dirty, other preceding writes failed
         *   3) DM_RH_RECOVERING: flushing pending writes
         * In any case, the region should not be on any list.
         */
        recovering = (reg->state == DM_RH_RECOVERING);
        reg->state = DM_RH_NOSYNC;
        BUG_ON(!list_empty(&reg->list));
        spin_unlock_irqrestore(&rh->region_lock, flags);

        bio_endio(bio, error);
        if (recovering)
                complete_resync_work(reg, 0);
}
EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
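/*
 * Illustrative sketch (not part of this interface): the caller's worker
 * thread (kmirrord in the mirror target) typically drives the region hash
 * in this order on each wakeup.  do_mirror(), errors_handled(), do_reads(),
 * do_writes() and do_recovery() are hypothetical caller-side helpers; the
 * ordering matters because dm_rh_update_states() is what dispatches delayed
 * bios and retires clean/recovered regions before new work is queued.
 *
 *      static void do_mirror(struct work_struct *work)
 *      {
 *              ...
 *              dm_rh_update_states(ms->rh, errors_handled(ms));
 *              do_recovery(ms);
 *              do_reads(ms, &reads);
 *              do_writes(ms, &writes);
 *              ...
 *      }
 */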
void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
{
        struct dm_region *reg, *next;

        LIST_HEAD(clean);
        LIST_HEAD(recovered);
        LIST_HEAD(failed_recovered);

        /*
         * Quickly grab the lists.
         */
        write_lock_irq(&rh->hash_lock);
        spin_lock(&rh->region_lock);
        if (!list_empty(&rh->clean_regions)) {
                list_splice_init(&rh->clean_regions, &clean);

                list_for_each_entry(reg, &clean, list)
                        list_del(&reg->hash_list);
        }

        if (!list_empty(&rh->recovered_regions)) {
                list_splice_init(&rh->recovered_regions, &recovered);

                list_for_each_entry(reg, &recovered, list)
                        list_del(&reg->hash_list);
        }

        if (!list_empty(&rh->failed_recovered_regions)) {
                list_splice_init(&rh->failed_recovered_regions,
                                 &failed_recovered);

                list_for_each_entry(reg, &failed_recovered, list)
                        list_del(&reg->hash_list);
        }

        spin_unlock(&rh->region_lock);
        write_unlock_irq(&rh->hash_lock);

        /*
         * All the regions on the recovered and clean lists have
         * now been pulled out of the system, so no need to do
         * any more locking.
         */
        list_for_each_entry_safe(reg, next, &recovered, list) {
                rh->log->type->clear_region(rh->log, reg->key);
                complete_resync_work(reg, 1);
                mempool_free(reg, rh->region_pool);
        }

        list_for_each_entry_safe(reg, next, &failed_recovered, list) {
                complete_resync_work(reg, errors_handled ? 0 : 1);
                mempool_free(reg, rh->region_pool);
        }

        list_for_each_entry_safe(reg, next, &clean, list) {
                rh->log->type->clear_region(rh->log, reg->key);
                mempool_free(reg, rh->region_pool);
        }

        rh->log->type->flush(rh->log);
}
EXPORT_SYMBOL_GPL(dm_rh_update_states);

static void rh_inc(struct dm_region_hash *rh, region_t region)
{
        struct dm_region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, region);

        spin_lock_irq(&rh->region_lock);
        atomic_inc(&reg->pending);

        if (reg->state == DM_RH_CLEAN) {
                reg->state = DM_RH_DIRTY;
                list_del_init(&reg->list);      /* take off the clean list */
                spin_unlock_irq(&rh->region_lock);

                rh->log->type->mark_region(rh->log, reg->key);
        } else
                spin_unlock_irq(&rh->region_lock);

        read_unlock(&rh->hash_lock);
}

void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
{
        struct bio *bio;

        for (bio = bios->head; bio; bio = bio->bi_next)
                rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
EXPORT_SYMBOL_GPL(dm_rh_inc_pending);

void dm_rh_dec(struct dm_region_hash *rh, region_t region)
{
        unsigned long flags;
        struct dm_region *reg;
        int should_wake = 0;

        read_lock(&rh->hash_lock);
        reg = __rh_lookup(rh, region);
        read_unlock(&rh->hash_lock);

        spin_lock_irqsave(&rh->region_lock, flags);
        if (atomic_dec_and_test(&reg->pending)) {
                /*
                 * There is no pending I/O for this region.
                 * We can move the region to the corresponding list for the
                 * next action.  At this point, the region is not yet
                 * connected to any list.
                 *
                 * If the state is DM_RH_NOSYNC, the region must be kept off
                 * the clean list.
                 * The hash entry for DM_RH_NOSYNC will remain in memory
                 * until the region is recovered or the map is reloaded.
                 */

                /* do nothing for DM_RH_NOSYNC */
                if (reg->state == DM_RH_RECOVERING) {
                        list_add_tail(&reg->list, &rh->quiesced_regions);
                } else if (reg->state == DM_RH_DIRTY) {
                        reg->state = DM_RH_CLEAN;
                        list_add(&reg->list, &rh->clean_regions);
                }
                should_wake = 1;
        }
        spin_unlock_irqrestore(&rh->region_lock, flags);

        if (should_wake)
                rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_dec);

/*
 * Starts quiescing a region in preparation for recovery.
 */
static int __rh_recovery_prepare(struct dm_region_hash *rh)
{
        int r;
        region_t region;
        struct dm_region *reg;

        /*
         * Ask the dirty log what's next.
         */
        r = rh->log->type->get_resync_work(rh->log, &region);
        if (r <= 0)
                return r;

        /*
         * Get this region, and start it quiescing by setting the
         * recovering flag.
         */
        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, region);
        read_unlock(&rh->hash_lock);

        spin_lock_irq(&rh->region_lock);
        reg->state = DM_RH_RECOVERING;

        /* Already quiesced ? */
        if (atomic_read(&reg->pending))
                list_del_init(&reg->list);
        else
                list_move(&reg->list, &rh->quiesced_regions);

        spin_unlock_irq(&rh->region_lock);

        return 1;
}
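/*
 * Illustrative sketch (not part of this interface): recovery is driven by
 * the caller in three stages.  dm_rh_recovery_prepare() below asks the
 * dirty log for work and quiesces regions; the worker then pulls quiesced
 * regions with dm_rh_recovery_start() and hands them to kcopyd; the kcopyd
 * completion callback reports back with dm_rh_recovery_end().  The
 * recover() helper and its kcopyd setup shown here are hypothetical
 * caller-side code.
 *
 *      static void do_recovery(struct mirror_set *ms)
 *      {
 *              struct dm_region *reg;
 *
 *              dm_rh_recovery_prepare(ms->rh);
 *              while ((reg = dm_rh_recovery_start(ms->rh)))
 *                      recover(ms, reg);       // issues dm_kcopyd_copy();
 *                                              // its completion calls
 *                                              // dm_rh_recovery_end()
 *      }
 */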
void dm_rh_recovery_prepare(struct dm_region_hash *rh)
{
        /* Extra reference to avoid race with dm_rh_stop_recovery */
        atomic_inc(&rh->recovery_in_flight);

        while (!down_trylock(&rh->recovery_count)) {
                atomic_inc(&rh->recovery_in_flight);
                if (__rh_recovery_prepare(rh) <= 0) {
                        atomic_dec(&rh->recovery_in_flight);
                        up(&rh->recovery_count);
                        break;
                }
        }

        /* Drop the extra reference */
        if (atomic_dec_and_test(&rh->recovery_in_flight))
                rh->wakeup_all_recovery_waiters(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);

/*
 * Returns any quiesced regions.
 */
struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
{
        struct dm_region *reg = NULL;

        spin_lock_irq(&rh->region_lock);
        if (!list_empty(&rh->quiesced_regions)) {
                reg = list_entry(rh->quiesced_regions.next,
                                 struct dm_region, list);
                list_del_init(&reg->list);      /* remove from the quiesced list */
        }
        spin_unlock_irq(&rh->region_lock);

        return reg;
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_start);

void dm_rh_recovery_end(struct dm_region *reg, int success)
{
        struct dm_region_hash *rh = reg->rh;

        spin_lock_irq(&rh->region_lock);
        if (success)
                list_add(&reg->list, &reg->rh->recovered_regions);
        else {
                reg->state = DM_RH_NOSYNC;
                list_add(&reg->list, &reg->rh->failed_recovered_regions);
        }
        spin_unlock_irq(&rh->region_lock);

        rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_end);

/* Return recovery in flight count. */
int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
{
        return atomic_read(&rh->recovery_in_flight);
}
EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);

int dm_rh_flush(struct dm_region_hash *rh)
{
        return rh->log->type->flush(rh->log);
}
EXPORT_SYMBOL_GPL(dm_rh_flush);

void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
{
        struct dm_region *reg;

        read_lock(&rh->hash_lock);
        reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
        bio_list_add(&reg->delayed_bios, bio);
        read_unlock(&rh->hash_lock);
}
EXPORT_SYMBOL_GPL(dm_rh_delay);

void dm_rh_stop_recovery(struct dm_region_hash *rh)
{
        int i;

        /* wait for any recovering regions */
        for (i = 0; i < rh->max_recovery; i++)
                down(&rh->recovery_count);
}
EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);

void dm_rh_start_recovery(struct dm_region_hash *rh)
{
        int i;

        for (i = 0; i < rh->max_recovery; i++)
                up(&rh->recovery_count);

        rh->wakeup_workers(rh->context);
}
EXPORT_SYMBOL_GPL(dm_rh_start_recovery);

MODULE_DESCRIPTION(DM_NAME " region hash");
MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");