1 /* 2 * dm-snapshot.c 3 * 4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited. 5 * 6 * This file is released under the GPL. 7 */ 8 9 #include <linux/blkdev.h> 10 #include <linux/device-mapper.h> 11 #include <linux/delay.h> 12 #include <linux/fs.h> 13 #include <linux/init.h> 14 #include <linux/kdev_t.h> 15 #include <linux/list.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/slab.h> 19 #include <linux/vmalloc.h> 20 #include <linux/log2.h> 21 #include <linux/dm-kcopyd.h> 22 23 #include "dm-exception-store.h" 24 25 #define DM_MSG_PREFIX "snapshots" 26 27 static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; 28 29 #define dm_target_is_snapshot_merge(ti) \ 30 ((ti)->type->name == dm_snapshot_merge_target_name) 31 32 /* 33 * The percentage increment we will wake up users at 34 */ 35 #define WAKE_UP_PERCENT 5 36 37 /* 38 * kcopyd priority of snapshot operations 39 */ 40 #define SNAPSHOT_COPY_PRIORITY 2 41 42 /* 43 * Reserve 1MB for each snapshot initially (with minimum of 1 page). 44 */ 45 #define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) 46 47 /* 48 * The size of the mempool used to track chunks in use. 49 */ 50 #define MIN_IOS 256 51 52 #define DM_TRACKED_CHUNK_HASH_SIZE 16 53 #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 54 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 55 56 struct dm_exception_table { 57 uint32_t hash_mask; 58 unsigned hash_shift; 59 struct list_head *table; 60 }; 61 62 struct dm_snapshot { 63 struct rw_semaphore lock; 64 65 struct dm_dev *origin; 66 struct dm_dev *cow; 67 68 struct dm_target *ti; 69 70 /* List of snapshots per Origin */ 71 struct list_head list; 72 73 /* 74 * You can't use a snapshot if this is 0 (e.g. if full). 75 * A snapshot-merge target never clears this. 
76 */ 77 int valid; 78 79 /* Origin writes don't trigger exceptions until this is set */ 80 int active; 81 82 atomic_t pending_exceptions_count; 83 84 mempool_t *pending_pool; 85 86 struct dm_exception_table pending; 87 struct dm_exception_table complete; 88 89 /* 90 * pe_lock protects all pending_exception operations and access 91 * as well as the snapshot_bios list. 92 */ 93 spinlock_t pe_lock; 94 95 /* Chunks with outstanding reads */ 96 spinlock_t tracked_chunk_lock; 97 mempool_t *tracked_chunk_pool; 98 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 99 100 /* The on disk metadata handler */ 101 struct dm_exception_store *store; 102 103 struct dm_kcopyd_client *kcopyd_client; 104 105 /* Wait for events based on state_bits */ 106 unsigned long state_bits; 107 108 /* Range of chunks currently being merged. */ 109 chunk_t first_merging_chunk; 110 int num_merging_chunks; 111 112 /* 113 * The merge operation failed if this flag is set. 114 * Failure modes are handled as follows: 115 * - I/O error reading the header 116 * => don't load the target; abort. 117 * - Header does not have "valid" flag set 118 * => use the origin; forget about the snapshot. 119 * - I/O error when reading exceptions 120 * => don't load the target; abort. 121 * (We can't use the intermediate origin state.) 122 * - I/O error while merging 123 * => stop merging; set merge_failed; process I/O normally. 124 */ 125 int merge_failed; 126 127 /* 128 * Incoming bios that overlap with chunks being merged must wait 129 * for them to be committed. 130 */ 131 struct bio_list bios_queued_during_merge; 132 }; 133 134 /* 135 * state_bits: 136 * RUNNING_MERGE - Merge operation is in progress. 137 * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; 138 * cleared afterwards. 
139 */ 140 #define RUNNING_MERGE 0 141 #define SHUTDOWN_MERGE 1 142 143 struct dm_dev *dm_snap_origin(struct dm_snapshot *s) 144 { 145 return s->origin; 146 } 147 EXPORT_SYMBOL(dm_snap_origin); 148 149 struct dm_dev *dm_snap_cow(struct dm_snapshot *s) 150 { 151 return s->cow; 152 } 153 EXPORT_SYMBOL(dm_snap_cow); 154 155 static sector_t chunk_to_sector(struct dm_exception_store *store, 156 chunk_t chunk) 157 { 158 return chunk << store->chunk_shift; 159 } 160 161 static int bdev_equal(struct block_device *lhs, struct block_device *rhs) 162 { 163 /* 164 * There is only ever one instance of a particular block 165 * device so we can compare pointers safely. 166 */ 167 return lhs == rhs; 168 } 169 170 struct dm_snap_pending_exception { 171 struct dm_exception e; 172 173 /* 174 * Origin buffers waiting for this to complete are held 175 * in a bio list 176 */ 177 struct bio_list origin_bios; 178 struct bio_list snapshot_bios; 179 180 /* Pointer back to snapshot context */ 181 struct dm_snapshot *snap; 182 183 /* 184 * 1 indicates the exception has already been sent to 185 * kcopyd. 
186 */ 187 int started; 188 }; 189 190 /* 191 * Hash table mapping origin volumes to lists of snapshots and 192 * a lock to protect it 193 */ 194 static struct kmem_cache *exception_cache; 195 static struct kmem_cache *pending_cache; 196 197 struct dm_snap_tracked_chunk { 198 struct hlist_node node; 199 chunk_t chunk; 200 }; 201 202 static struct kmem_cache *tracked_chunk_cache; 203 204 static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s, 205 chunk_t chunk) 206 { 207 struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool, 208 GFP_NOIO); 209 unsigned long flags; 210 211 c->chunk = chunk; 212 213 spin_lock_irqsave(&s->tracked_chunk_lock, flags); 214 hlist_add_head(&c->node, 215 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); 216 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); 217 218 return c; 219 } 220 221 static void stop_tracking_chunk(struct dm_snapshot *s, 222 struct dm_snap_tracked_chunk *c) 223 { 224 unsigned long flags; 225 226 spin_lock_irqsave(&s->tracked_chunk_lock, flags); 227 hlist_del(&c->node); 228 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); 229 230 mempool_free(c, s->tracked_chunk_pool); 231 } 232 233 static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) 234 { 235 struct dm_snap_tracked_chunk *c; 236 struct hlist_node *hn; 237 int found = 0; 238 239 spin_lock_irq(&s->tracked_chunk_lock); 240 241 hlist_for_each_entry(c, hn, 242 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { 243 if (c->chunk == chunk) { 244 found = 1; 245 break; 246 } 247 } 248 249 spin_unlock_irq(&s->tracked_chunk_lock); 250 251 return found; 252 } 253 254 /* 255 * This conflicting I/O is extremely improbable in the caller, 256 * so msleep(1) is sufficient and there is no need for a wait queue. 
257 */ 258 static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) 259 { 260 while (__chunk_is_tracked(s, chunk)) 261 msleep(1); 262 } 263 264 /* 265 * One of these per registered origin, held in the snapshot_origins hash 266 */ 267 struct origin { 268 /* The origin device */ 269 struct block_device *bdev; 270 271 struct list_head hash_list; 272 273 /* List of snapshots for this origin */ 274 struct list_head snapshots; 275 }; 276 277 /* 278 * Size of the hash table for origin volumes. If we make this 279 * the size of the minors list then it should be nearly perfect 280 */ 281 #define ORIGIN_HASH_SIZE 256 282 #define ORIGIN_MASK 0xFF 283 static struct list_head *_origins; 284 static struct rw_semaphore _origins_lock; 285 286 static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); 287 static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); 288 static uint64_t _pending_exceptions_done_count; 289 290 static int init_origin_hash(void) 291 { 292 int i; 293 294 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), 295 GFP_KERNEL); 296 if (!_origins) { 297 DMERR("unable to allocate memory"); 298 return -ENOMEM; 299 } 300 301 for (i = 0; i < ORIGIN_HASH_SIZE; i++) 302 INIT_LIST_HEAD(_origins + i); 303 init_rwsem(&_origins_lock); 304 305 return 0; 306 } 307 308 static void exit_origin_hash(void) 309 { 310 kfree(_origins); 311 } 312 313 static unsigned origin_hash(struct block_device *bdev) 314 { 315 return bdev->bd_dev & ORIGIN_MASK; 316 } 317 318 static struct origin *__lookup_origin(struct block_device *origin) 319 { 320 struct list_head *ol; 321 struct origin *o; 322 323 ol = &_origins[origin_hash(origin)]; 324 list_for_each_entry (o, ol, hash_list) 325 if (bdev_equal(o->bdev, origin)) 326 return o; 327 328 return NULL; 329 } 330 331 static void __insert_origin(struct origin *o) 332 { 333 struct list_head *sl = &_origins[origin_hash(o->bdev)]; 334 list_add_tail(&o->hash_list, sl); 335 } 336 337 /* 338 * _origins_lock must be held when 
calling this function. 339 * Returns number of snapshots registered using the supplied cow device, plus: 340 * snap_src - a snapshot suitable for use as a source of exception handover 341 * snap_dest - a snapshot capable of receiving exception handover. 342 * snap_merge - an existing snapshot-merge target linked to the same origin. 343 * There can be at most one snapshot-merge target. The parameter is optional. 344 * 345 * Possible return values and states of snap_src and snap_dest. 346 * 0: NULL, NULL - first new snapshot 347 * 1: snap_src, NULL - normal snapshot 348 * 2: snap_src, snap_dest - waiting for handover 349 * 2: snap_src, NULL - handed over, waiting for old to be deleted 350 * 1: NULL, snap_dest - source got destroyed without handover 351 */ 352 static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, 353 struct dm_snapshot **snap_src, 354 struct dm_snapshot **snap_dest, 355 struct dm_snapshot **snap_merge) 356 { 357 struct dm_snapshot *s; 358 struct origin *o; 359 int count = 0; 360 int active; 361 362 o = __lookup_origin(snap->origin->bdev); 363 if (!o) 364 goto out; 365 366 list_for_each_entry(s, &o->snapshots, list) { 367 if (dm_target_is_snapshot_merge(s->ti) && snap_merge) 368 *snap_merge = s; 369 if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) 370 continue; 371 372 down_read(&s->lock); 373 active = s->active; 374 up_read(&s->lock); 375 376 if (active) { 377 if (snap_src) 378 *snap_src = s; 379 } else if (snap_dest) 380 *snap_dest = s; 381 382 count++; 383 } 384 385 out: 386 return count; 387 } 388 389 /* 390 * On success, returns 1 if this snapshot is a handover destination, 391 * otherwise returns 0. 392 */ 393 static int __validate_exception_handover(struct dm_snapshot *snap) 394 { 395 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 396 struct dm_snapshot *snap_merge = NULL; 397 398 /* Does snapshot need exceptions handed over to it? 
*/ 399 if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, 400 &snap_merge) == 2) || 401 snap_dest) { 402 snap->ti->error = "Snapshot cow pairing for exception " 403 "table handover failed"; 404 return -EINVAL; 405 } 406 407 /* 408 * If no snap_src was found, snap cannot become a handover 409 * destination. 410 */ 411 if (!snap_src) 412 return 0; 413 414 /* 415 * Non-snapshot-merge handover? 416 */ 417 if (!dm_target_is_snapshot_merge(snap->ti)) 418 return 1; 419 420 /* 421 * Do not allow more than one merging snapshot. 422 */ 423 if (snap_merge) { 424 snap->ti->error = "A snapshot is already merging."; 425 return -EINVAL; 426 } 427 428 if (!snap_src->store->type->prepare_merge || 429 !snap_src->store->type->commit_merge) { 430 snap->ti->error = "Snapshot exception store does not " 431 "support snapshot-merge."; 432 return -EINVAL; 433 } 434 435 return 1; 436 } 437 438 static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) 439 { 440 struct dm_snapshot *l; 441 442 /* Sort the list according to chunk size, largest-first smallest-last */ 443 list_for_each_entry(l, &o->snapshots, list) 444 if (l->store->chunk_size < s->store->chunk_size) 445 break; 446 list_add_tail(&s->list, &l->list); 447 } 448 449 /* 450 * Make a note of the snapshot and its origin so we can look it 451 * up when the origin has a write on it. 452 * 453 * Also validate snapshot exception store handovers. 454 * On success, returns 1 if this registration is a handover destination, 455 * otherwise returns 0. 
456 */ 457 static int register_snapshot(struct dm_snapshot *snap) 458 { 459 struct origin *o, *new_o = NULL; 460 struct block_device *bdev = snap->origin->bdev; 461 int r = 0; 462 463 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); 464 if (!new_o) 465 return -ENOMEM; 466 467 down_write(&_origins_lock); 468 469 r = __validate_exception_handover(snap); 470 if (r < 0) { 471 kfree(new_o); 472 goto out; 473 } 474 475 o = __lookup_origin(bdev); 476 if (o) 477 kfree(new_o); 478 else { 479 /* New origin */ 480 o = new_o; 481 482 /* Initialise the struct */ 483 INIT_LIST_HEAD(&o->snapshots); 484 o->bdev = bdev; 485 486 __insert_origin(o); 487 } 488 489 __insert_snapshot(o, snap); 490 491 out: 492 up_write(&_origins_lock); 493 494 return r; 495 } 496 497 /* 498 * Move snapshot to correct place in list according to chunk size. 499 */ 500 static void reregister_snapshot(struct dm_snapshot *s) 501 { 502 struct block_device *bdev = s->origin->bdev; 503 504 down_write(&_origins_lock); 505 506 list_del(&s->list); 507 __insert_snapshot(__lookup_origin(bdev), s); 508 509 up_write(&_origins_lock); 510 } 511 512 static void unregister_snapshot(struct dm_snapshot *s) 513 { 514 struct origin *o; 515 516 down_write(&_origins_lock); 517 o = __lookup_origin(s->origin->bdev); 518 519 list_del(&s->list); 520 if (o && list_empty(&o->snapshots)) { 521 list_del(&o->hash_list); 522 kfree(o); 523 } 524 525 up_write(&_origins_lock); 526 } 527 528 /* 529 * Implementation of the exception hash tables. 530 * The lowest hash_shift bits of the chunk number are ignored, allowing 531 * some consecutive chunks to be grouped together. 
532 */ 533 static int dm_exception_table_init(struct dm_exception_table *et, 534 uint32_t size, unsigned hash_shift) 535 { 536 unsigned int i; 537 538 et->hash_shift = hash_shift; 539 et->hash_mask = size - 1; 540 et->table = dm_vcalloc(size, sizeof(struct list_head)); 541 if (!et->table) 542 return -ENOMEM; 543 544 for (i = 0; i < size; i++) 545 INIT_LIST_HEAD(et->table + i); 546 547 return 0; 548 } 549 550 static void dm_exception_table_exit(struct dm_exception_table *et, 551 struct kmem_cache *mem) 552 { 553 struct list_head *slot; 554 struct dm_exception *ex, *next; 555 int i, size; 556 557 size = et->hash_mask + 1; 558 for (i = 0; i < size; i++) { 559 slot = et->table + i; 560 561 list_for_each_entry_safe (ex, next, slot, hash_list) 562 kmem_cache_free(mem, ex); 563 } 564 565 vfree(et->table); 566 } 567 568 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) 569 { 570 return (chunk >> et->hash_shift) & et->hash_mask; 571 } 572 573 static void dm_remove_exception(struct dm_exception *e) 574 { 575 list_del(&e->hash_list); 576 } 577 578 /* 579 * Return the exception data for a sector, or NULL if not 580 * remapped. 
581 */ 582 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, 583 chunk_t chunk) 584 { 585 struct list_head *slot; 586 struct dm_exception *e; 587 588 slot = &et->table[exception_hash(et, chunk)]; 589 list_for_each_entry (e, slot, hash_list) 590 if (chunk >= e->old_chunk && 591 chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) 592 return e; 593 594 return NULL; 595 } 596 597 static struct dm_exception *alloc_completed_exception(void) 598 { 599 struct dm_exception *e; 600 601 e = kmem_cache_alloc(exception_cache, GFP_NOIO); 602 if (!e) 603 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); 604 605 return e; 606 } 607 608 static void free_completed_exception(struct dm_exception *e) 609 { 610 kmem_cache_free(exception_cache, e); 611 } 612 613 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s) 614 { 615 struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool, 616 GFP_NOIO); 617 618 atomic_inc(&s->pending_exceptions_count); 619 pe->snap = s; 620 621 return pe; 622 } 623 624 static void free_pending_exception(struct dm_snap_pending_exception *pe) 625 { 626 struct dm_snapshot *s = pe->snap; 627 628 mempool_free(pe, s->pending_pool); 629 smp_mb__before_atomic_dec(); 630 atomic_dec(&s->pending_exceptions_count); 631 } 632 633 static void dm_insert_exception(struct dm_exception_table *eh, 634 struct dm_exception *new_e) 635 { 636 struct list_head *l; 637 struct dm_exception *e = NULL; 638 639 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 640 641 /* Add immediately if this table doesn't support consecutive chunks */ 642 if (!eh->hash_shift) 643 goto out; 644 645 /* List is ordered by old_chunk */ 646 list_for_each_entry_reverse(e, l, hash_list) { 647 /* Insert after an existing chunk? 
*/ 648 if (new_e->old_chunk == (e->old_chunk + 649 dm_consecutive_chunk_count(e) + 1) && 650 new_e->new_chunk == (dm_chunk_number(e->new_chunk) + 651 dm_consecutive_chunk_count(e) + 1)) { 652 dm_consecutive_chunk_count_inc(e); 653 free_completed_exception(new_e); 654 return; 655 } 656 657 /* Insert before an existing chunk? */ 658 if (new_e->old_chunk == (e->old_chunk - 1) && 659 new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { 660 dm_consecutive_chunk_count_inc(e); 661 e->old_chunk--; 662 e->new_chunk--; 663 free_completed_exception(new_e); 664 return; 665 } 666 667 if (new_e->old_chunk > e->old_chunk) 668 break; 669 } 670 671 out: 672 list_add(&new_e->hash_list, e ? &e->hash_list : l); 673 } 674 675 /* 676 * Callback used by the exception stores to load exceptions when 677 * initialising. 678 */ 679 static int dm_add_exception(void *context, chunk_t old, chunk_t new) 680 { 681 struct dm_snapshot *s = context; 682 struct dm_exception *e; 683 684 e = alloc_completed_exception(); 685 if (!e) 686 return -ENOMEM; 687 688 e->old_chunk = old; 689 690 /* Consecutive_count is implicitly initialised to zero */ 691 e->new_chunk = new; 692 693 dm_insert_exception(&s->complete, e); 694 695 return 0; 696 } 697 698 /* 699 * Return a minimum chunk size of all snapshots that have the specified origin. 700 * Return zero if the origin has no snapshots. 701 */ 702 static sector_t __minimum_chunk_size(struct origin *o) 703 { 704 struct dm_snapshot *snap; 705 unsigned chunk_size = 0; 706 707 if (o) 708 list_for_each_entry(snap, &o->snapshots, list) 709 chunk_size = min_not_zero(chunk_size, 710 snap->store->chunk_size); 711 712 return chunk_size; 713 } 714 715 /* 716 * Hard coded magic. 717 */ 718 static int calc_max_buckets(void) 719 { 720 /* use a fixed size of 2MB */ 721 unsigned long mem = 2 * 1024 * 1024; 722 mem /= sizeof(struct list_head); 723 724 return mem; 725 } 726 727 /* 728 * Allocate room for a suitable hash table. 
729 */ 730 static int init_hash_tables(struct dm_snapshot *s) 731 { 732 sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; 733 734 /* 735 * Calculate based on the size of the original volume or 736 * the COW volume... 737 */ 738 cow_dev_size = get_dev_size(s->cow->bdev); 739 origin_dev_size = get_dev_size(s->origin->bdev); 740 max_buckets = calc_max_buckets(); 741 742 hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift; 743 hash_size = min(hash_size, max_buckets); 744 745 if (hash_size < 64) 746 hash_size = 64; 747 hash_size = rounddown_pow_of_two(hash_size); 748 if (dm_exception_table_init(&s->complete, hash_size, 749 DM_CHUNK_CONSECUTIVE_BITS)) 750 return -ENOMEM; 751 752 /* 753 * Allocate hash table for in-flight exceptions 754 * Make this smaller than the real hash table 755 */ 756 hash_size >>= 3; 757 if (hash_size < 64) 758 hash_size = 64; 759 760 if (dm_exception_table_init(&s->pending, hash_size, 0)) { 761 dm_exception_table_exit(&s->complete, exception_cache); 762 return -ENOMEM; 763 } 764 765 return 0; 766 } 767 768 static void merge_shutdown(struct dm_snapshot *s) 769 { 770 clear_bit_unlock(RUNNING_MERGE, &s->state_bits); 771 smp_mb__after_clear_bit(); 772 wake_up_bit(&s->state_bits, RUNNING_MERGE); 773 } 774 775 static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) 776 { 777 s->first_merging_chunk = 0; 778 s->num_merging_chunks = 0; 779 780 return bio_list_get(&s->bios_queued_during_merge); 781 } 782 783 /* 784 * Remove one chunk from the index of completed exceptions. 
785 */ 786 static int __remove_single_exception_chunk(struct dm_snapshot *s, 787 chunk_t old_chunk) 788 { 789 struct dm_exception *e; 790 791 e = dm_lookup_exception(&s->complete, old_chunk); 792 if (!e) { 793 DMERR("Corruption detected: exception for block %llu is " 794 "on disk but not in memory", 795 (unsigned long long)old_chunk); 796 return -EINVAL; 797 } 798 799 /* 800 * If this is the only chunk using this exception, remove exception. 801 */ 802 if (!dm_consecutive_chunk_count(e)) { 803 dm_remove_exception(e); 804 free_completed_exception(e); 805 return 0; 806 } 807 808 /* 809 * The chunk may be either at the beginning or the end of a 810 * group of consecutive chunks - never in the middle. We are 811 * removing chunks in the opposite order to that in which they 812 * were added, so this should always be true. 813 * Decrement the consecutive chunk counter and adjust the 814 * starting point if necessary. 815 */ 816 if (old_chunk == e->old_chunk) { 817 e->old_chunk++; 818 e->new_chunk++; 819 } else if (old_chunk != e->old_chunk + 820 dm_consecutive_chunk_count(e)) { 821 DMERR("Attempt to merge block %llu from the " 822 "middle of a chunk range [%llu - %llu]", 823 (unsigned long long)old_chunk, 824 (unsigned long long)e->old_chunk, 825 (unsigned long long) 826 e->old_chunk + dm_consecutive_chunk_count(e)); 827 return -EINVAL; 828 } 829 830 dm_consecutive_chunk_count_dec(e); 831 832 return 0; 833 } 834 835 static void flush_bios(struct bio *bio); 836 837 static int remove_single_exception_chunk(struct dm_snapshot *s) 838 { 839 struct bio *b = NULL; 840 int r; 841 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; 842 843 down_write(&s->lock); 844 845 /* 846 * Process chunks (and associated exceptions) in reverse order 847 * so that dm_consecutive_chunk_count_dec() accounting works. 
848 */ 849 do { 850 r = __remove_single_exception_chunk(s, old_chunk); 851 if (r) 852 goto out; 853 } while (old_chunk-- > s->first_merging_chunk); 854 855 b = __release_queued_bios_after_merge(s); 856 857 out: 858 up_write(&s->lock); 859 if (b) 860 flush_bios(b); 861 862 return r; 863 } 864 865 static int origin_write_extent(struct dm_snapshot *merging_snap, 866 sector_t sector, unsigned chunk_size); 867 868 static void merge_callback(int read_err, unsigned long write_err, 869 void *context); 870 871 static uint64_t read_pending_exceptions_done_count(void) 872 { 873 uint64_t pending_exceptions_done; 874 875 spin_lock(&_pending_exceptions_done_spinlock); 876 pending_exceptions_done = _pending_exceptions_done_count; 877 spin_unlock(&_pending_exceptions_done_spinlock); 878 879 return pending_exceptions_done; 880 } 881 882 static void increment_pending_exceptions_done_count(void) 883 { 884 spin_lock(&_pending_exceptions_done_spinlock); 885 _pending_exceptions_done_count++; 886 spin_unlock(&_pending_exceptions_done_spinlock); 887 888 wake_up_all(&_pending_exceptions_done); 889 } 890 891 static void snapshot_merge_next_chunks(struct dm_snapshot *s) 892 { 893 int i, linear_chunks; 894 chunk_t old_chunk, new_chunk; 895 struct dm_io_region src, dest; 896 sector_t io_size; 897 uint64_t previous_count; 898 899 BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); 900 if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) 901 goto shut; 902 903 /* 904 * valid flag never changes during merge, so no lock required. 
905 */ 906 if (!s->valid) { 907 DMERR("Snapshot is invalid: can't merge"); 908 goto shut; 909 } 910 911 linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, 912 &new_chunk); 913 if (linear_chunks <= 0) { 914 if (linear_chunks < 0) { 915 DMERR("Read error in exception store: " 916 "shutting down merge"); 917 down_write(&s->lock); 918 s->merge_failed = 1; 919 up_write(&s->lock); 920 } 921 goto shut; 922 } 923 924 /* Adjust old_chunk and new_chunk to reflect start of linear region */ 925 old_chunk = old_chunk + 1 - linear_chunks; 926 new_chunk = new_chunk + 1 - linear_chunks; 927 928 /* 929 * Use one (potentially large) I/O to copy all 'linear_chunks' 930 * from the exception store to the origin 931 */ 932 io_size = linear_chunks * s->store->chunk_size; 933 934 dest.bdev = s->origin->bdev; 935 dest.sector = chunk_to_sector(s->store, old_chunk); 936 dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); 937 938 src.bdev = s->cow->bdev; 939 src.sector = chunk_to_sector(s->store, new_chunk); 940 src.count = dest.count; 941 942 /* 943 * Reallocate any exceptions needed in other snapshots then 944 * wait for the pending exceptions to complete. 945 * Each time any pending exception (globally on the system) 946 * completes we are woken and repeat the process to find out 947 * if we can proceed. While this may not seem a particularly 948 * efficient algorithm, it is not expected to have any 949 * significant impact on performance. 950 */ 951 previous_count = read_pending_exceptions_done_count(); 952 while (origin_write_extent(s, dest.sector, io_size)) { 953 wait_event(_pending_exceptions_done, 954 (read_pending_exceptions_done_count() != 955 previous_count)); 956 /* Retry after the wait, until all exceptions are done. 
*/ 957 previous_count = read_pending_exceptions_done_count(); 958 } 959 960 down_write(&s->lock); 961 s->first_merging_chunk = old_chunk; 962 s->num_merging_chunks = linear_chunks; 963 up_write(&s->lock); 964 965 /* Wait until writes to all 'linear_chunks' drain */ 966 for (i = 0; i < linear_chunks; i++) 967 __check_for_conflicting_io(s, old_chunk + i); 968 969 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); 970 return; 971 972 shut: 973 merge_shutdown(s); 974 } 975 976 static void error_bios(struct bio *bio); 977 978 static void merge_callback(int read_err, unsigned long write_err, void *context) 979 { 980 struct dm_snapshot *s = context; 981 struct bio *b = NULL; 982 983 if (read_err || write_err) { 984 if (read_err) 985 DMERR("Read error: shutting down merge."); 986 else 987 DMERR("Write error: shutting down merge."); 988 goto shut; 989 } 990 991 if (s->store->type->commit_merge(s->store, 992 s->num_merging_chunks) < 0) { 993 DMERR("Write error in exception store: shutting down merge"); 994 goto shut; 995 } 996 997 if (remove_single_exception_chunk(s) < 0) 998 goto shut; 999 1000 snapshot_merge_next_chunks(s); 1001 1002 return; 1003 1004 shut: 1005 down_write(&s->lock); 1006 s->merge_failed = 1; 1007 b = __release_queued_bios_after_merge(s); 1008 up_write(&s->lock); 1009 error_bios(b); 1010 1011 merge_shutdown(s); 1012 } 1013 1014 static void start_merge(struct dm_snapshot *s) 1015 { 1016 if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) 1017 snapshot_merge_next_chunks(s); 1018 } 1019 1020 static int wait_schedule(void *ptr) 1021 { 1022 schedule(); 1023 1024 return 0; 1025 } 1026 1027 /* 1028 * Stop the merging process and wait until it finishes. 
1029 */ 1030 static void stop_merge(struct dm_snapshot *s) 1031 { 1032 set_bit(SHUTDOWN_MERGE, &s->state_bits); 1033 wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, 1034 TASK_UNINTERRUPTIBLE); 1035 clear_bit(SHUTDOWN_MERGE, &s->state_bits); 1036 } 1037 1038 /* 1039 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> 1040 */ 1041 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) 1042 { 1043 struct dm_snapshot *s; 1044 int i; 1045 int r = -EINVAL; 1046 char *origin_path, *cow_path; 1047 unsigned args_used, num_flush_requests = 1; 1048 fmode_t origin_mode = FMODE_READ; 1049 1050 if (argc != 4) { 1051 ti->error = "requires exactly 4 arguments"; 1052 r = -EINVAL; 1053 goto bad; 1054 } 1055 1056 if (dm_target_is_snapshot_merge(ti)) { 1057 num_flush_requests = 2; 1058 origin_mode = FMODE_WRITE; 1059 } 1060 1061 s = kmalloc(sizeof(*s), GFP_KERNEL); 1062 if (!s) { 1063 ti->error = "Cannot allocate snapshot context private " 1064 "structure"; 1065 r = -ENOMEM; 1066 goto bad; 1067 } 1068 1069 origin_path = argv[0]; 1070 argv++; 1071 argc--; 1072 1073 r = dm_get_device(ti, origin_path, origin_mode, &s->origin); 1074 if (r) { 1075 ti->error = "Cannot get origin device"; 1076 goto bad_origin; 1077 } 1078 1079 cow_path = argv[0]; 1080 argv++; 1081 argc--; 1082 1083 r = dm_get_device(ti, cow_path, FMODE_READ | FMODE_WRITE, &s->cow); 1084 if (r) { 1085 ti->error = "Cannot get COW device"; 1086 goto bad_cow; 1087 } 1088 1089 r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); 1090 if (r) { 1091 ti->error = "Couldn't create exception store"; 1092 r = -EINVAL; 1093 goto bad_store; 1094 } 1095 1096 argv += args_used; 1097 argc -= args_used; 1098 1099 s->ti = ti; 1100 s->valid = 1; 1101 s->active = 0; 1102 atomic_set(&s->pending_exceptions_count, 0); 1103 init_rwsem(&s->lock); 1104 INIT_LIST_HEAD(&s->list); 1105 spin_lock_init(&s->pe_lock); 1106 s->state_bits = 0; 1107 s->merge_failed = 0; 1108 
s->first_merging_chunk = 0; 1109 s->num_merging_chunks = 0; 1110 bio_list_init(&s->bios_queued_during_merge); 1111 1112 /* Allocate hash table for COW data */ 1113 if (init_hash_tables(s)) { 1114 ti->error = "Unable to allocate hash table space"; 1115 r = -ENOMEM; 1116 goto bad_hash_tables; 1117 } 1118 1119 r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); 1120 if (r) { 1121 ti->error = "Could not create kcopyd client"; 1122 goto bad_kcopyd; 1123 } 1124 1125 s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); 1126 if (!s->pending_pool) { 1127 ti->error = "Could not allocate mempool for pending exceptions"; 1128 goto bad_pending_pool; 1129 } 1130 1131 s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS, 1132 tracked_chunk_cache); 1133 if (!s->tracked_chunk_pool) { 1134 ti->error = "Could not allocate tracked_chunk mempool for " 1135 "tracking reads"; 1136 goto bad_tracked_chunk_pool; 1137 } 1138 1139 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) 1140 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); 1141 1142 spin_lock_init(&s->tracked_chunk_lock); 1143 1144 ti->private = s; 1145 ti->num_flush_requests = num_flush_requests; 1146 1147 /* Add snapshot to the list of snapshots for this origin */ 1148 /* Exceptions aren't triggered till snapshot_resume() is called */ 1149 r = register_snapshot(s); 1150 if (r == -ENOMEM) { 1151 ti->error = "Snapshot origin struct allocation failed"; 1152 goto bad_load_and_register; 1153 } else if (r < 0) { 1154 /* invalid handover, register_snapshot has set ti->error */ 1155 goto bad_load_and_register; 1156 } 1157 1158 /* 1159 * Metadata must only be loaded into one table at once, so skip this 1160 * if metadata will be handed over during resume. 1161 * Chunk size will be set during the handover - set it to zero to 1162 * ensure it's ignored. 
1163 */ 1164 if (r > 0) { 1165 s->store->chunk_size = 0; 1166 return 0; 1167 } 1168 1169 r = s->store->type->read_metadata(s->store, dm_add_exception, 1170 (void *)s); 1171 if (r < 0) { 1172 ti->error = "Failed to read snapshot metadata"; 1173 goto bad_read_metadata; 1174 } else if (r > 0) { 1175 s->valid = 0; 1176 DMWARN("Snapshot is marked invalid."); 1177 } 1178 1179 if (!s->store->chunk_size) { 1180 ti->error = "Chunk size not set"; 1181 goto bad_read_metadata; 1182 } 1183 ti->split_io = s->store->chunk_size; 1184 1185 return 0; 1186 1187 bad_read_metadata: 1188 unregister_snapshot(s); 1189 1190 bad_load_and_register: 1191 mempool_destroy(s->tracked_chunk_pool); 1192 1193 bad_tracked_chunk_pool: 1194 mempool_destroy(s->pending_pool); 1195 1196 bad_pending_pool: 1197 dm_kcopyd_client_destroy(s->kcopyd_client); 1198 1199 bad_kcopyd: 1200 dm_exception_table_exit(&s->pending, pending_cache); 1201 dm_exception_table_exit(&s->complete, exception_cache); 1202 1203 bad_hash_tables: 1204 dm_exception_store_destroy(s->store); 1205 1206 bad_store: 1207 dm_put_device(ti, s->cow); 1208 1209 bad_cow: 1210 dm_put_device(ti, s->origin); 1211 1212 bad_origin: 1213 kfree(s); 1214 1215 bad: 1216 return r; 1217 } 1218 1219 static void __free_exceptions(struct dm_snapshot *s) 1220 { 1221 dm_kcopyd_client_destroy(s->kcopyd_client); 1222 s->kcopyd_client = NULL; 1223 1224 dm_exception_table_exit(&s->pending, pending_cache); 1225 dm_exception_table_exit(&s->complete, exception_cache); 1226 } 1227 1228 static void __handover_exceptions(struct dm_snapshot *snap_src, 1229 struct dm_snapshot *snap_dest) 1230 { 1231 union { 1232 struct dm_exception_table table_swap; 1233 struct dm_exception_store *store_swap; 1234 } u; 1235 1236 /* 1237 * Swap all snapshot context information between the two instances. 
1238 */ 1239 u.table_swap = snap_dest->complete; 1240 snap_dest->complete = snap_src->complete; 1241 snap_src->complete = u.table_swap; 1242 1243 u.store_swap = snap_dest->store; 1244 snap_dest->store = snap_src->store; 1245 snap_src->store = u.store_swap; 1246 1247 snap_dest->store->snap = snap_dest; 1248 snap_src->store->snap = snap_src; 1249 1250 snap_dest->ti->split_io = snap_dest->store->chunk_size; 1251 snap_dest->valid = snap_src->valid; 1252 1253 /* 1254 * Set source invalid to ensure it receives no further I/O. 1255 */ 1256 snap_src->valid = 0; 1257 } 1258 1259 static void snapshot_dtr(struct dm_target *ti) 1260 { 1261 #ifdef CONFIG_DM_DEBUG 1262 int i; 1263 #endif 1264 struct dm_snapshot *s = ti->private; 1265 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1266 1267 down_read(&_origins_lock); 1268 /* Check whether exception handover must be cancelled */ 1269 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1270 if (snap_src && snap_dest && (s == snap_src)) { 1271 down_write(&snap_dest->lock); 1272 snap_dest->valid = 0; 1273 up_write(&snap_dest->lock); 1274 DMERR("Cancelling snapshot handover."); 1275 } 1276 up_read(&_origins_lock); 1277 1278 if (dm_target_is_snapshot_merge(ti)) 1279 stop_merge(s); 1280 1281 /* Prevent further origin writes from using this snapshot. */ 1282 /* After this returns there can be no new kcopyd jobs. */ 1283 unregister_snapshot(s); 1284 1285 while (atomic_read(&s->pending_exceptions_count)) 1286 msleep(1); 1287 /* 1288 * Ensure instructions in mempool_destroy aren't reordered 1289 * before atomic_read. 
1290 */ 1291 smp_mb(); 1292 1293 #ifdef CONFIG_DM_DEBUG 1294 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) 1295 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); 1296 #endif 1297 1298 mempool_destroy(s->tracked_chunk_pool); 1299 1300 __free_exceptions(s); 1301 1302 mempool_destroy(s->pending_pool); 1303 1304 dm_exception_store_destroy(s->store); 1305 1306 dm_put_device(ti, s->cow); 1307 1308 dm_put_device(ti, s->origin); 1309 1310 kfree(s); 1311 } 1312 1313 /* 1314 * Flush a list of buffers. 1315 */ 1316 static void flush_bios(struct bio *bio) 1317 { 1318 struct bio *n; 1319 1320 while (bio) { 1321 n = bio->bi_next; 1322 bio->bi_next = NULL; 1323 generic_make_request(bio); 1324 bio = n; 1325 } 1326 } 1327 1328 static int do_origin(struct dm_dev *origin, struct bio *bio); 1329 1330 /* 1331 * Flush a list of buffers. 1332 */ 1333 static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) 1334 { 1335 struct bio *n; 1336 int r; 1337 1338 while (bio) { 1339 n = bio->bi_next; 1340 bio->bi_next = NULL; 1341 r = do_origin(s->origin, bio); 1342 if (r == DM_MAPIO_REMAPPED) 1343 generic_make_request(bio); 1344 bio = n; 1345 } 1346 } 1347 1348 /* 1349 * Error a list of buffers. 
1350 */ 1351 static void error_bios(struct bio *bio) 1352 { 1353 struct bio *n; 1354 1355 while (bio) { 1356 n = bio->bi_next; 1357 bio->bi_next = NULL; 1358 bio_io_error(bio); 1359 bio = n; 1360 } 1361 } 1362 1363 static void __invalidate_snapshot(struct dm_snapshot *s, int err) 1364 { 1365 if (!s->valid) 1366 return; 1367 1368 if (err == -EIO) 1369 DMERR("Invalidating snapshot: Error reading/writing."); 1370 else if (err == -ENOMEM) 1371 DMERR("Invalidating snapshot: Unable to allocate exception."); 1372 1373 if (s->store->type->drop_snapshot) 1374 s->store->type->drop_snapshot(s->store); 1375 1376 s->valid = 0; 1377 1378 dm_table_event(s->ti->table); 1379 } 1380 1381 static void pending_complete(struct dm_snap_pending_exception *pe, int success) 1382 { 1383 struct dm_exception *e; 1384 struct dm_snapshot *s = pe->snap; 1385 struct bio *origin_bios = NULL; 1386 struct bio *snapshot_bios = NULL; 1387 int error = 0; 1388 1389 if (!success) { 1390 /* Read/write error - snapshot is unusable */ 1391 down_write(&s->lock); 1392 __invalidate_snapshot(s, -EIO); 1393 error = 1; 1394 goto out; 1395 } 1396 1397 e = alloc_completed_exception(); 1398 if (!e) { 1399 down_write(&s->lock); 1400 __invalidate_snapshot(s, -ENOMEM); 1401 error = 1; 1402 goto out; 1403 } 1404 *e = pe->e; 1405 1406 down_write(&s->lock); 1407 if (!s->valid) { 1408 free_completed_exception(e); 1409 error = 1; 1410 goto out; 1411 } 1412 1413 /* Check for conflicting reads */ 1414 __check_for_conflicting_io(s, pe->e.old_chunk); 1415 1416 /* 1417 * Add a proper exception, and remove the 1418 * in-flight exception from the list. 
1419 */ 1420 dm_insert_exception(&s->complete, e); 1421 1422 out: 1423 dm_remove_exception(&pe->e); 1424 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1425 origin_bios = bio_list_get(&pe->origin_bios); 1426 free_pending_exception(pe); 1427 1428 increment_pending_exceptions_done_count(); 1429 1430 up_write(&s->lock); 1431 1432 /* Submit any pending write bios */ 1433 if (error) 1434 error_bios(snapshot_bios); 1435 else 1436 flush_bios(snapshot_bios); 1437 1438 retry_origin_bios(s, origin_bios); 1439 } 1440 1441 static void commit_callback(void *context, int success) 1442 { 1443 struct dm_snap_pending_exception *pe = context; 1444 1445 pending_complete(pe, success); 1446 } 1447 1448 /* 1449 * Called when the copy I/O has finished. kcopyd actually runs 1450 * this code so don't block. 1451 */ 1452 static void copy_callback(int read_err, unsigned long write_err, void *context) 1453 { 1454 struct dm_snap_pending_exception *pe = context; 1455 struct dm_snapshot *s = pe->snap; 1456 1457 if (read_err || write_err) 1458 pending_complete(pe, 0); 1459 1460 else 1461 /* Update the metadata if we are persistent */ 1462 s->store->type->commit_exception(s->store, &pe->e, 1463 commit_callback, pe); 1464 } 1465 1466 /* 1467 * Dispatches the copy operation to kcopyd. 
1468 */ 1469 static void start_copy(struct dm_snap_pending_exception *pe) 1470 { 1471 struct dm_snapshot *s = pe->snap; 1472 struct dm_io_region src, dest; 1473 struct block_device *bdev = s->origin->bdev; 1474 sector_t dev_size; 1475 1476 dev_size = get_dev_size(bdev); 1477 1478 src.bdev = bdev; 1479 src.sector = chunk_to_sector(s->store, pe->e.old_chunk); 1480 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); 1481 1482 dest.bdev = s->cow->bdev; 1483 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); 1484 dest.count = src.count; 1485 1486 /* Hand over to kcopyd */ 1487 dm_kcopyd_copy(s->kcopyd_client, 1488 &src, 1, &dest, 0, copy_callback, pe); 1489 } 1490 1491 static struct dm_snap_pending_exception * 1492 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) 1493 { 1494 struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); 1495 1496 if (!e) 1497 return NULL; 1498 1499 return container_of(e, struct dm_snap_pending_exception, e); 1500 } 1501 1502 /* 1503 * Looks to see if this snapshot already has a pending exception 1504 * for this chunk, otherwise it allocates a new one and inserts 1505 * it into the pending table. 1506 * 1507 * NOTE: a write lock must be held on snap->lock before calling 1508 * this. 
*/
static struct dm_snap_pending_exception *
__find_pending_exception(struct dm_snapshot *s,
			 struct dm_snap_pending_exception *pe, chunk_t chunk)
{
	struct dm_snap_pending_exception *pe2;

	/*
	 * An exception for this chunk is already pending: drop the
	 * caller's spare one and use the existing entry.
	 */
	pe2 = __lookup_pending_exception(s, chunk);
	if (pe2) {
		free_pending_exception(pe);
		return pe2;
	}

	/* Initialise the caller-supplied pending exception. */
	pe->e.old_chunk = chunk;
	bio_list_init(&pe->origin_bios);
	bio_list_init(&pe->snapshot_bios);
	pe->started = 0;

	/* A non-zero return from the store means failure: free pe, return NULL. */
	if (s->store->type->prepare_exception(s->store, &pe->e)) {
		free_pending_exception(pe);
		return NULL;
	}

	dm_insert_exception(&s->pending, &pe->e);

	return pe;
}

/*
 * Redirect a bio to the COW device using exception e.
 *
 * The target chunk is e's new chunk plus the distance of 'chunk' from
 * e->old_chunk; the bio's offset within its chunk (bi_sector masked
 * with chunk_mask) is preserved.
 */
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
			    struct bio *bio, chunk_t chunk)
{
	bio->bi_bdev = s->cow->bdev;
	bio->bi_sector = chunk_to_sector(s->store,
					 dm_chunk_number(e->new_chunk) +
					 (chunk - e->old_chunk)) +
			 (bio->bi_sector &
			  s->store->chunk_mask);
}

/*
 * Map function of the snapshot target.
 *
 * Returns DM_MAPIO_REMAPPED when the bio has been pointed at the device
 * that should service it, DM_MAPIO_SUBMITTED when a write has been
 * queued on a pending exception, or -EIO when the snapshot is invalid.
 */
static int snapshot_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	struct dm_exception *e;
	struct dm_snapshot *s = ti->private;
	int r = DM_MAPIO_REMAPPED;
	chunk_t chunk;
	struct dm_snap_pending_exception *pe = NULL;

	/* Flush requests go to the COW device. */
	if (bio->bi_rw & REQ_FLUSH) {
		bio->bi_bdev = s->cow->bdev;
		return DM_MAPIO_REMAPPED;
	}

	chunk = sector_to_chunk(s->store, bio->bi_sector);

	/* Full snapshots are not usable */
	/* To get here the table must be live so s->active is always set. */
	if (!s->valid)
		return -EIO;

	/* FIXME: should only take write lock if we need
	 * to copy an exception */
	down_write(&s->lock);

	/* Re-check validity now that the lock is held. */
	if (!s->valid) {
		r = -EIO;
		goto out_unlock;
	}

	/* If the block is already remapped - use that, else remap it */
	e = dm_lookup_exception(&s->complete, chunk);
	if (e) {
		remap_exception(s, e, bio, chunk);
		goto out_unlock;
	}

	/*
	 * Write to snapshot - higher level takes care of RW/RO
	 * flags so we should only get this if we are
	 * writeable.
	 */
	if (bio_rw(bio) == WRITE) {
		pe = __lookup_pending_exception(s, chunk);
		if (!pe) {
			/*
			 * The lock is dropped around the allocation, so
			 * everything checked above is checked again after
			 * it has been retaken.
			 */
			up_write(&s->lock);
			pe = alloc_pending_exception(s);
			down_write(&s->lock);

			if (!s->valid) {
				free_pending_exception(pe);
				r = -EIO;
				goto out_unlock;
			}

			e = dm_lookup_exception(&s->complete, chunk);
			if (e) {
				free_pending_exception(pe);
				remap_exception(s, e, bio, chunk);
				goto out_unlock;
			}

			pe = __find_pending_exception(s, pe, chunk);
			if (!pe) {
				__invalidate_snapshot(s, -ENOMEM);
				r = -EIO;
				goto out_unlock;
			}
		}

		/* Queue the remapped bio on the pending exception. */
		remap_exception(s, &pe->e, bio, chunk);
		bio_list_add(&pe->snapshot_bios, bio);

		r = DM_MAPIO_SUBMITTED;

		if (!pe->started) {
			/* this is protected by snap->lock */
			pe->started = 1;
			/* The copy is started after the lock is released. */
			up_write(&s->lock);
			start_copy(pe);
			goto out;
		}
	} else {
		/* Reads of unremapped chunks go to the origin and are tracked. */
		bio->bi_bdev = s->origin->bdev;
		map_context->ptr = track_chunk(s, chunk);
	}

out_unlock:
	up_write(&s->lock);
out:
	return r;
}

/*
 * A snapshot-merge target behaves like a combination of a snapshot
 * target and a snapshot-origin target.  It only generates new
 * exceptions in other snapshots and not in the one that is being
 * merged.
1646 * 1647 * For each chunk, if there is an existing exception, it is used to 1648 * redirect I/O to the cow device. Otherwise I/O is sent to the origin, 1649 * which in turn might generate exceptions in other snapshots. 1650 * If merging is currently taking place on the chunk in question, the 1651 * I/O is deferred by adding it to s->bios_queued_during_merge. 1652 */ 1653 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio, 1654 union map_info *map_context) 1655 { 1656 struct dm_exception *e; 1657 struct dm_snapshot *s = ti->private; 1658 int r = DM_MAPIO_REMAPPED; 1659 chunk_t chunk; 1660 1661 if (bio->bi_rw & REQ_FLUSH) { 1662 if (!map_context->target_request_nr) 1663 bio->bi_bdev = s->origin->bdev; 1664 else 1665 bio->bi_bdev = s->cow->bdev; 1666 map_context->ptr = NULL; 1667 return DM_MAPIO_REMAPPED; 1668 } 1669 1670 chunk = sector_to_chunk(s->store, bio->bi_sector); 1671 1672 down_write(&s->lock); 1673 1674 /* Full merging snapshots are redirected to the origin */ 1675 if (!s->valid) 1676 goto redirect_to_origin; 1677 1678 /* If the block is already remapped - use that */ 1679 e = dm_lookup_exception(&s->complete, chunk); 1680 if (e) { 1681 /* Queue writes overlapping with chunks being merged */ 1682 if (bio_rw(bio) == WRITE && 1683 chunk >= s->first_merging_chunk && 1684 chunk < (s->first_merging_chunk + 1685 s->num_merging_chunks)) { 1686 bio->bi_bdev = s->origin->bdev; 1687 bio_list_add(&s->bios_queued_during_merge, bio); 1688 r = DM_MAPIO_SUBMITTED; 1689 goto out_unlock; 1690 } 1691 1692 remap_exception(s, e, bio, chunk); 1693 1694 if (bio_rw(bio) == WRITE) 1695 map_context->ptr = track_chunk(s, chunk); 1696 goto out_unlock; 1697 } 1698 1699 redirect_to_origin: 1700 bio->bi_bdev = s->origin->bdev; 1701 1702 if (bio_rw(bio) == WRITE) { 1703 up_write(&s->lock); 1704 return do_origin(s->origin, bio); 1705 } 1706 1707 out_unlock: 1708 up_write(&s->lock); 1709 1710 return r; 1711 } 1712 1713 static int snapshot_end_io(struct dm_target *ti, 
struct bio *bio, 1714 int error, union map_info *map_context) 1715 { 1716 struct dm_snapshot *s = ti->private; 1717 struct dm_snap_tracked_chunk *c = map_context->ptr; 1718 1719 if (c) 1720 stop_tracking_chunk(s, c); 1721 1722 return 0; 1723 } 1724 1725 static void snapshot_merge_presuspend(struct dm_target *ti) 1726 { 1727 struct dm_snapshot *s = ti->private; 1728 1729 stop_merge(s); 1730 } 1731 1732 static int snapshot_preresume(struct dm_target *ti) 1733 { 1734 int r = 0; 1735 struct dm_snapshot *s = ti->private; 1736 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1737 1738 down_read(&_origins_lock); 1739 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1740 if (snap_src && snap_dest) { 1741 down_read(&snap_src->lock); 1742 if (s == snap_src) { 1743 DMERR("Unable to resume snapshot source until " 1744 "handover completes."); 1745 r = -EINVAL; 1746 } else if (!dm_suspended(snap_src->ti)) { 1747 DMERR("Unable to perform snapshot handover until " 1748 "source is suspended."); 1749 r = -EINVAL; 1750 } 1751 up_read(&snap_src->lock); 1752 } 1753 up_read(&_origins_lock); 1754 1755 return r; 1756 } 1757 1758 static void snapshot_resume(struct dm_target *ti) 1759 { 1760 struct dm_snapshot *s = ti->private; 1761 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1762 1763 down_read(&_origins_lock); 1764 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1765 if (snap_src && snap_dest) { 1766 down_write(&snap_src->lock); 1767 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); 1768 __handover_exceptions(snap_src, snap_dest); 1769 up_write(&snap_dest->lock); 1770 up_write(&snap_src->lock); 1771 } 1772 up_read(&_origins_lock); 1773 1774 /* Now we have correct chunk size, reregister */ 1775 reregister_snapshot(s); 1776 1777 down_write(&s->lock); 1778 s->active = 1; 1779 up_write(&s->lock); 1780 } 1781 1782 static sector_t get_origin_minimum_chunksize(struct block_device *bdev) 1783 { 1784 sector_t min_chunksize; 1785 
1786 down_read(&_origins_lock); 1787 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); 1788 up_read(&_origins_lock); 1789 1790 return min_chunksize; 1791 } 1792 1793 static void snapshot_merge_resume(struct dm_target *ti) 1794 { 1795 struct dm_snapshot *s = ti->private; 1796 1797 /* 1798 * Handover exceptions from existing snapshot. 1799 */ 1800 snapshot_resume(ti); 1801 1802 /* 1803 * snapshot-merge acts as an origin, so set ti->split_io 1804 */ 1805 ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); 1806 1807 start_merge(s); 1808 } 1809 1810 static int snapshot_status(struct dm_target *ti, status_type_t type, 1811 char *result, unsigned int maxlen) 1812 { 1813 unsigned sz = 0; 1814 struct dm_snapshot *snap = ti->private; 1815 1816 switch (type) { 1817 case STATUSTYPE_INFO: 1818 1819 down_write(&snap->lock); 1820 1821 if (!snap->valid) 1822 DMEMIT("Invalid"); 1823 else if (snap->merge_failed) 1824 DMEMIT("Merge failed"); 1825 else { 1826 if (snap->store->type->usage) { 1827 sector_t total_sectors, sectors_allocated, 1828 metadata_sectors; 1829 snap->store->type->usage(snap->store, 1830 &total_sectors, 1831 §ors_allocated, 1832 &metadata_sectors); 1833 DMEMIT("%llu/%llu %llu", 1834 (unsigned long long)sectors_allocated, 1835 (unsigned long long)total_sectors, 1836 (unsigned long long)metadata_sectors); 1837 } 1838 else 1839 DMEMIT("Unknown"); 1840 } 1841 1842 up_write(&snap->lock); 1843 1844 break; 1845 1846 case STATUSTYPE_TABLE: 1847 /* 1848 * kdevname returns a static pointer so we need 1849 * to make private copies if the output is to 1850 * make sense. 
1851 */ 1852 DMEMIT("%s %s", snap->origin->name, snap->cow->name); 1853 snap->store->type->status(snap->store, type, result + sz, 1854 maxlen - sz); 1855 break; 1856 } 1857 1858 return 0; 1859 } 1860 1861 static int snapshot_iterate_devices(struct dm_target *ti, 1862 iterate_devices_callout_fn fn, void *data) 1863 { 1864 struct dm_snapshot *snap = ti->private; 1865 int r; 1866 1867 r = fn(ti, snap->origin, 0, ti->len, data); 1868 1869 if (!r) 1870 r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); 1871 1872 return r; 1873 } 1874 1875 1876 /*----------------------------------------------------------------- 1877 * Origin methods 1878 *---------------------------------------------------------------*/ 1879 1880 /* 1881 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any 1882 * supplied bio was ignored. The caller may submit it immediately. 1883 * (No remapping actually occurs as the origin is always a direct linear 1884 * map.) 1885 * 1886 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned 1887 * and any supplied bio is added to a list to be submitted once all 1888 * the necessary exceptions exist. 
*/
static int __origin_write(struct list_head *snapshots, sector_t sector,
			  struct bio *bio)
{
	int r = DM_MAPIO_REMAPPED;
	struct dm_snapshot *snap;
	struct dm_exception *e;
	struct dm_snap_pending_exception *pe;
	struct dm_snap_pending_exception *pe_to_start_now = NULL;
	struct dm_snap_pending_exception *pe_to_start_last = NULL;
	chunk_t chunk;

	/* Do all the snapshots on this origin */
	list_for_each_entry (snap, snapshots, list) {
		/*
		 * Don't make new exceptions in a merging snapshot
		 * because it has effectively been deleted
		 */
		if (dm_target_is_snapshot_merge(snap->ti))
			continue;

		down_write(&snap->lock);

		/* Only deal with valid and active snapshots */
		if (!snap->valid || !snap->active)
			goto next_snapshot;

		/* Nothing to do if writing beyond end of snapshot */
		if (sector >= dm_table_get_size(snap->ti->table))
			goto next_snapshot;

		/*
		 * Remember, different snapshots can have
		 * different chunk sizes.
		 */
		chunk = sector_to_chunk(snap->store, sector);

		/*
		 * Check exception table to see if block
		 * is already remapped in this snapshot
		 * and trigger an exception if not.
		 */
		e = dm_lookup_exception(&snap->complete, chunk);
		if (e)
			goto next_snapshot;

		pe = __lookup_pending_exception(snap, chunk);
		if (!pe) {
			/*
			 * The lock is dropped around the allocation, so
			 * validity and the completed table are checked
			 * again once it has been retaken.
			 */
			up_write(&snap->lock);
			pe = alloc_pending_exception(snap);
			down_write(&snap->lock);

			if (!snap->valid) {
				free_pending_exception(pe);
				goto next_snapshot;
			}

			e = dm_lookup_exception(&snap->complete, chunk);
			if (e) {
				free_pending_exception(pe);
				goto next_snapshot;
			}

			pe = __find_pending_exception(snap, pe, chunk);
			if (!pe) {
				__invalidate_snapshot(snap, -ENOMEM);
				goto next_snapshot;
			}
		}

		/* An exception is (now) pending for this chunk. */
		r = DM_MAPIO_SUBMITTED;

		/*
		 * If an origin bio was supplied, queue it to wait for the
		 * completion of this exception, and start this one last,
		 * at the end of the function.
		 */
		if (bio) {
			bio_list_add(&pe->origin_bios, bio);
			/* The bio is queued on one pending exception only. */
			bio = NULL;

			if (!pe->started) {
				pe->started = 1;
				pe_to_start_last = pe;
			}
		}

		if (!pe->started) {
			pe->started = 1;
			pe_to_start_now = pe;
		}

next_snapshot:
		up_write(&snap->lock);

		/* Copies are started only after snap->lock is released. */
		if (pe_to_start_now) {
			start_copy(pe_to_start_now);
			pe_to_start_now = NULL;
		}
	}

	/*
	 * Submit the exception against which the bio is queued last,
	 * to give the other exceptions a head start.
	 */
	if (pe_to_start_last)
		start_copy(pe_to_start_last);

	return r;
}

/*
 * Called on a write from the origin driver.
 *
 * Looks up the origin for the device under _origins_lock and runs
 * __origin_write() on its snapshots.  Returns DM_MAPIO_REMAPPED when no
 * origin is registered for the device.
 */
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
	struct origin *o;
	int r = DM_MAPIO_REMAPPED;

	down_read(&_origins_lock);
	o = __lookup_origin(origin->bdev);
	if (o)
		r = __origin_write(&o->snapshots, bio->bi_sector, bio);
	up_read(&_origins_lock);

	return r;
}

/*
 * Trigger exceptions in all non-merging snapshots.
2019 * 2020 * The chunk size of the merging snapshot may be larger than the chunk 2021 * size of some other snapshot so we may need to reallocate multiple 2022 * chunks in other snapshots. 2023 * 2024 * We scan all the overlapping exceptions in the other snapshots. 2025 * Returns 1 if anything was reallocated and must be waited for, 2026 * otherwise returns 0. 2027 * 2028 * size must be a multiple of merging_snap's chunk_size. 2029 */ 2030 static int origin_write_extent(struct dm_snapshot *merging_snap, 2031 sector_t sector, unsigned size) 2032 { 2033 int must_wait = 0; 2034 sector_t n; 2035 struct origin *o; 2036 2037 /* 2038 * The origin's __minimum_chunk_size() got stored in split_io 2039 * by snapshot_merge_resume(). 2040 */ 2041 down_read(&_origins_lock); 2042 o = __lookup_origin(merging_snap->origin->bdev); 2043 for (n = 0; n < size; n += merging_snap->ti->split_io) 2044 if (__origin_write(&o->snapshots, sector + n, NULL) == 2045 DM_MAPIO_SUBMITTED) 2046 must_wait = 1; 2047 up_read(&_origins_lock); 2048 2049 return must_wait; 2050 } 2051 2052 /* 2053 * Origin: maps a linear range of a device, with hooks for snapshotting. 2054 */ 2055 2056 /* 2057 * Construct an origin mapping: <dev_path> 2058 * The context for an origin is merely a 'struct dm_dev *' 2059 * pointing to the real device. 
2060 */ 2061 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2062 { 2063 int r; 2064 struct dm_dev *dev; 2065 2066 if (argc != 1) { 2067 ti->error = "origin: incorrect number of arguments"; 2068 return -EINVAL; 2069 } 2070 2071 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); 2072 if (r) { 2073 ti->error = "Cannot get target device"; 2074 return r; 2075 } 2076 2077 ti->private = dev; 2078 ti->num_flush_requests = 1; 2079 2080 return 0; 2081 } 2082 2083 static void origin_dtr(struct dm_target *ti) 2084 { 2085 struct dm_dev *dev = ti->private; 2086 dm_put_device(ti, dev); 2087 } 2088 2089 static int origin_map(struct dm_target *ti, struct bio *bio, 2090 union map_info *map_context) 2091 { 2092 struct dm_dev *dev = ti->private; 2093 bio->bi_bdev = dev->bdev; 2094 2095 if (bio->bi_rw & REQ_FLUSH) 2096 return DM_MAPIO_REMAPPED; 2097 2098 /* Only tell snapshots if this is a write */ 2099 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; 2100 } 2101 2102 /* 2103 * Set the target "split_io" field to the minimum of all the snapshots' 2104 * chunk sizes. 
2105 */ 2106 static void origin_resume(struct dm_target *ti) 2107 { 2108 struct dm_dev *dev = ti->private; 2109 2110 ti->split_io = get_origin_minimum_chunksize(dev->bdev); 2111 } 2112 2113 static int origin_status(struct dm_target *ti, status_type_t type, char *result, 2114 unsigned int maxlen) 2115 { 2116 struct dm_dev *dev = ti->private; 2117 2118 switch (type) { 2119 case STATUSTYPE_INFO: 2120 result[0] = '\0'; 2121 break; 2122 2123 case STATUSTYPE_TABLE: 2124 snprintf(result, maxlen, "%s", dev->name); 2125 break; 2126 } 2127 2128 return 0; 2129 } 2130 2131 static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 2132 struct bio_vec *biovec, int max_size) 2133 { 2134 struct dm_dev *dev = ti->private; 2135 struct request_queue *q = bdev_get_queue(dev->bdev); 2136 2137 if (!q->merge_bvec_fn) 2138 return max_size; 2139 2140 bvm->bi_bdev = dev->bdev; 2141 bvm->bi_sector = bvm->bi_sector; 2142 2143 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2144 } 2145 2146 static int origin_iterate_devices(struct dm_target *ti, 2147 iterate_devices_callout_fn fn, void *data) 2148 { 2149 struct dm_dev *dev = ti->private; 2150 2151 return fn(ti, dev, 0, ti->len, data); 2152 } 2153 2154 static struct target_type origin_target = { 2155 .name = "snapshot-origin", 2156 .version = {1, 7, 1}, 2157 .module = THIS_MODULE, 2158 .ctr = origin_ctr, 2159 .dtr = origin_dtr, 2160 .map = origin_map, 2161 .resume = origin_resume, 2162 .status = origin_status, 2163 .merge = origin_merge, 2164 .iterate_devices = origin_iterate_devices, 2165 }; 2166 2167 static struct target_type snapshot_target = { 2168 .name = "snapshot", 2169 .version = {1, 10, 0}, 2170 .module = THIS_MODULE, 2171 .ctr = snapshot_ctr, 2172 .dtr = snapshot_dtr, 2173 .map = snapshot_map, 2174 .end_io = snapshot_end_io, 2175 .preresume = snapshot_preresume, 2176 .resume = snapshot_resume, 2177 .status = snapshot_status, 2178 .iterate_devices = snapshot_iterate_devices, 2179 }; 2180 2181 static struct 
target_type merge_target = { 2182 .name = dm_snapshot_merge_target_name, 2183 .version = {1, 1, 0}, 2184 .module = THIS_MODULE, 2185 .ctr = snapshot_ctr, 2186 .dtr = snapshot_dtr, 2187 .map = snapshot_merge_map, 2188 .end_io = snapshot_end_io, 2189 .presuspend = snapshot_merge_presuspend, 2190 .preresume = snapshot_preresume, 2191 .resume = snapshot_merge_resume, 2192 .status = snapshot_status, 2193 .iterate_devices = snapshot_iterate_devices, 2194 }; 2195 2196 static int __init dm_snapshot_init(void) 2197 { 2198 int r; 2199 2200 r = dm_exception_store_init(); 2201 if (r) { 2202 DMERR("Failed to initialize exception stores"); 2203 return r; 2204 } 2205 2206 r = dm_register_target(&snapshot_target); 2207 if (r < 0) { 2208 DMERR("snapshot target register failed %d", r); 2209 goto bad_register_snapshot_target; 2210 } 2211 2212 r = dm_register_target(&origin_target); 2213 if (r < 0) { 2214 DMERR("Origin target register failed %d", r); 2215 goto bad_register_origin_target; 2216 } 2217 2218 r = dm_register_target(&merge_target); 2219 if (r < 0) { 2220 DMERR("Merge target register failed %d", r); 2221 goto bad_register_merge_target; 2222 } 2223 2224 r = init_origin_hash(); 2225 if (r) { 2226 DMERR("init_origin_hash failed."); 2227 goto bad_origin_hash; 2228 } 2229 2230 exception_cache = KMEM_CACHE(dm_exception, 0); 2231 if (!exception_cache) { 2232 DMERR("Couldn't create exception cache."); 2233 r = -ENOMEM; 2234 goto bad_exception_cache; 2235 } 2236 2237 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); 2238 if (!pending_cache) { 2239 DMERR("Couldn't create pending cache."); 2240 r = -ENOMEM; 2241 goto bad_pending_cache; 2242 } 2243 2244 tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); 2245 if (!tracked_chunk_cache) { 2246 DMERR("Couldn't create cache to track chunks in use."); 2247 r = -ENOMEM; 2248 goto bad_tracked_chunk_cache; 2249 } 2250 2251 return 0; 2252 2253 bad_tracked_chunk_cache: 2254 kmem_cache_destroy(pending_cache); 2255 
bad_pending_cache: 2256 kmem_cache_destroy(exception_cache); 2257 bad_exception_cache: 2258 exit_origin_hash(); 2259 bad_origin_hash: 2260 dm_unregister_target(&merge_target); 2261 bad_register_merge_target: 2262 dm_unregister_target(&origin_target); 2263 bad_register_origin_target: 2264 dm_unregister_target(&snapshot_target); 2265 bad_register_snapshot_target: 2266 dm_exception_store_exit(); 2267 2268 return r; 2269 } 2270 2271 static void __exit dm_snapshot_exit(void) 2272 { 2273 dm_unregister_target(&snapshot_target); 2274 dm_unregister_target(&origin_target); 2275 dm_unregister_target(&merge_target); 2276 2277 exit_origin_hash(); 2278 kmem_cache_destroy(pending_cache); 2279 kmem_cache_destroy(exception_cache); 2280 kmem_cache_destroy(tracked_chunk_cache); 2281 2282 dm_exception_store_exit(); 2283 } 2284 2285 /* Module hooks */ 2286 module_init(dm_snapshot_init); 2287 module_exit(dm_snapshot_exit); 2288 2289 MODULE_DESCRIPTION(DM_NAME " snapshot target"); 2290 MODULE_AUTHOR("Joe Thornber"); 2291 MODULE_LICENSE("GPL"); 2292