1 /* 2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include <linux/blkdev.h> 8 #include <linux/device-mapper.h> 9 #include <linux/delay.h> 10 #include <linux/fs.h> 11 #include <linux/init.h> 12 #include <linux/kdev_t.h> 13 #include <linux/list.h> 14 #include <linux/list_bl.h> 15 #include <linux/mempool.h> 16 #include <linux/module.h> 17 #include <linux/slab.h> 18 #include <linux/vmalloc.h> 19 #include <linux/log2.h> 20 #include <linux/dm-kcopyd.h> 21 22 #include "dm.h" 23 24 #include "dm-exception-store.h" 25 26 #define DM_MSG_PREFIX "snapshots" 27 28 static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; 29 30 #define dm_target_is_snapshot_merge(ti) \ 31 ((ti)->type->name == dm_snapshot_merge_target_name) 32 33 /* 34 * The size of the mempool used to track chunks in use. 35 */ 36 #define MIN_IOS 256 37 38 #define DM_TRACKED_CHUNK_HASH_SIZE 16 39 #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ 40 (DM_TRACKED_CHUNK_HASH_SIZE - 1)) 41 42 struct dm_exception_table { 43 uint32_t hash_mask; 44 unsigned hash_shift; 45 struct hlist_bl_head *table; 46 }; 47 48 struct dm_snapshot { 49 struct rw_semaphore lock; 50 51 struct dm_dev *origin; 52 struct dm_dev *cow; 53 54 struct dm_target *ti; 55 56 /* List of snapshots per Origin */ 57 struct list_head list; 58 59 /* 60 * You can't use a snapshot if this is 0 (e.g. if full). 61 * A snapshot-merge target never clears this. 62 */ 63 int valid; 64 65 /* 66 * The snapshot overflowed because of a write to the snapshot device. 67 * We don't have to invalidate the snapshot in this case, but we need 68 * to prevent further writes. 
69 */ 70 int snapshot_overflowed; 71 72 /* Origin writes don't trigger exceptions until this is set */ 73 int active; 74 75 atomic_t pending_exceptions_count; 76 77 spinlock_t pe_allocation_lock; 78 79 /* Protected by "pe_allocation_lock" */ 80 sector_t exception_start_sequence; 81 82 /* Protected by kcopyd single-threaded callback */ 83 sector_t exception_complete_sequence; 84 85 /* 86 * A list of pending exceptions that completed out of order. 87 * Protected by kcopyd single-threaded callback. 88 */ 89 struct rb_root out_of_order_tree; 90 91 mempool_t pending_pool; 92 93 struct dm_exception_table pending; 94 struct dm_exception_table complete; 95 96 /* 97 * pe_lock protects all pending_exception operations and access 98 * as well as the snapshot_bios list. 99 */ 100 spinlock_t pe_lock; 101 102 /* Chunks with outstanding reads */ 103 spinlock_t tracked_chunk_lock; 104 struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; 105 106 /* The on disk metadata handler */ 107 struct dm_exception_store *store; 108 109 unsigned in_progress; 110 struct wait_queue_head in_progress_wait; 111 112 struct dm_kcopyd_client *kcopyd_client; 113 114 /* Wait for events based on state_bits */ 115 unsigned long state_bits; 116 117 /* Range of chunks currently being merged. */ 118 chunk_t first_merging_chunk; 119 int num_merging_chunks; 120 121 /* 122 * The merge operation failed if this flag is set. 123 * Failure modes are handled as follows: 124 * - I/O error reading the header 125 * => don't load the target; abort. 126 * - Header does not have "valid" flag set 127 * => use the origin; forget about the snapshot. 128 * - I/O error when reading exceptions 129 * => don't load the target; abort. 130 * (We can't use the intermediate origin state.) 131 * - I/O error while merging 132 * => stop merging; set merge_failed; process I/O normally. 
133 */ 134 bool merge_failed:1; 135 136 bool discard_zeroes_cow:1; 137 bool discard_passdown_origin:1; 138 139 /* 140 * Incoming bios that overlap with chunks being merged must wait 141 * for them to be committed. 142 */ 143 struct bio_list bios_queued_during_merge; 144 145 /* 146 * Flush data after merge. 147 */ 148 struct bio flush_bio; 149 }; 150 151 /* 152 * state_bits: 153 * RUNNING_MERGE - Merge operation is in progress. 154 * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; 155 * cleared afterwards. 156 */ 157 #define RUNNING_MERGE 0 158 #define SHUTDOWN_MERGE 1 159 160 /* 161 * Maximum number of chunks being copied on write. 162 * 163 * The value was decided experimentally as a trade-off between memory 164 * consumption, stalling the kernel's workqueues and maintaining a high enough 165 * throughput. 166 */ 167 #define DEFAULT_COW_THRESHOLD 2048 168 169 static unsigned cow_threshold = DEFAULT_COW_THRESHOLD; 170 module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644); 171 MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); 172 173 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, 174 "A percentage of time allocated for copy on write"); 175 176 struct dm_dev *dm_snap_origin(struct dm_snapshot *s) 177 { 178 return s->origin; 179 } 180 EXPORT_SYMBOL(dm_snap_origin); 181 182 struct dm_dev *dm_snap_cow(struct dm_snapshot *s) 183 { 184 return s->cow; 185 } 186 EXPORT_SYMBOL(dm_snap_cow); 187 188 static sector_t chunk_to_sector(struct dm_exception_store *store, 189 chunk_t chunk) 190 { 191 return chunk << store->chunk_shift; 192 } 193 194 static int bdev_equal(struct block_device *lhs, struct block_device *rhs) 195 { 196 /* 197 * There is only ever one instance of a particular block 198 * device so we can compare pointers safely. 
199 */ 200 return lhs == rhs; 201 } 202 203 struct dm_snap_pending_exception { 204 struct dm_exception e; 205 206 /* 207 * Origin buffers waiting for this to complete are held 208 * in a bio list 209 */ 210 struct bio_list origin_bios; 211 struct bio_list snapshot_bios; 212 213 /* Pointer back to snapshot context */ 214 struct dm_snapshot *snap; 215 216 /* 217 * 1 indicates the exception has already been sent to 218 * kcopyd. 219 */ 220 int started; 221 222 /* There was copying error. */ 223 int copy_error; 224 225 /* A sequence number, it is used for in-order completion. */ 226 sector_t exception_sequence; 227 228 struct rb_node out_of_order_node; 229 230 /* 231 * For writing a complete chunk, bypassing the copy. 232 */ 233 struct bio *full_bio; 234 bio_end_io_t *full_bio_end_io; 235 }; 236 237 /* 238 * Hash table mapping origin volumes to lists of snapshots and 239 * a lock to protect it 240 */ 241 static struct kmem_cache *exception_cache; 242 static struct kmem_cache *pending_cache; 243 244 struct dm_snap_tracked_chunk { 245 struct hlist_node node; 246 chunk_t chunk; 247 }; 248 249 static void init_tracked_chunk(struct bio *bio) 250 { 251 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); 252 INIT_HLIST_NODE(&c->node); 253 } 254 255 static bool is_bio_tracked(struct bio *bio) 256 { 257 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); 258 return !hlist_unhashed(&c->node); 259 } 260 261 static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk) 262 { 263 struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); 264 265 c->chunk = chunk; 266 267 spin_lock_irq(&s->tracked_chunk_lock); 268 hlist_add_head(&c->node, 269 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); 270 spin_unlock_irq(&s->tracked_chunk_lock); 271 } 272 273 static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio) 274 { 275 struct 
dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); 276 unsigned long flags; 277 278 spin_lock_irqsave(&s->tracked_chunk_lock, flags); 279 hlist_del(&c->node); 280 spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); 281 } 282 283 static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) 284 { 285 struct dm_snap_tracked_chunk *c; 286 int found = 0; 287 288 spin_lock_irq(&s->tracked_chunk_lock); 289 290 hlist_for_each_entry(c, 291 &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { 292 if (c->chunk == chunk) { 293 found = 1; 294 break; 295 } 296 } 297 298 spin_unlock_irq(&s->tracked_chunk_lock); 299 300 return found; 301 } 302 303 /* 304 * This conflicting I/O is extremely improbable in the caller, 305 * so msleep(1) is sufficient and there is no need for a wait queue. 306 */ 307 static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) 308 { 309 while (__chunk_is_tracked(s, chunk)) 310 msleep(1); 311 } 312 313 /* 314 * One of these per registered origin, held in the snapshot_origins hash 315 */ 316 struct origin { 317 /* The origin device */ 318 struct block_device *bdev; 319 320 struct list_head hash_list; 321 322 /* List of snapshots for this origin */ 323 struct list_head snapshots; 324 }; 325 326 /* 327 * This structure is allocated for each origin target 328 */ 329 struct dm_origin { 330 struct dm_dev *dev; 331 struct dm_target *ti; 332 unsigned split_boundary; 333 struct list_head hash_list; 334 }; 335 336 /* 337 * Size of the hash table for origin volumes. 
If we make this 338 * the size of the minors list then it should be nearly perfect 339 */ 340 #define ORIGIN_HASH_SIZE 256 341 #define ORIGIN_MASK 0xFF 342 static struct list_head *_origins; 343 static struct list_head *_dm_origins; 344 static struct rw_semaphore _origins_lock; 345 346 static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); 347 static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); 348 static uint64_t _pending_exceptions_done_count; 349 350 static int init_origin_hash(void) 351 { 352 int i; 353 354 _origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head), 355 GFP_KERNEL); 356 if (!_origins) { 357 DMERR("unable to allocate memory for _origins"); 358 return -ENOMEM; 359 } 360 for (i = 0; i < ORIGIN_HASH_SIZE; i++) 361 INIT_LIST_HEAD(_origins + i); 362 363 _dm_origins = kmalloc_array(ORIGIN_HASH_SIZE, 364 sizeof(struct list_head), 365 GFP_KERNEL); 366 if (!_dm_origins) { 367 DMERR("unable to allocate memory for _dm_origins"); 368 kfree(_origins); 369 return -ENOMEM; 370 } 371 for (i = 0; i < ORIGIN_HASH_SIZE; i++) 372 INIT_LIST_HEAD(_dm_origins + i); 373 374 init_rwsem(&_origins_lock); 375 376 return 0; 377 } 378 379 static void exit_origin_hash(void) 380 { 381 kfree(_origins); 382 kfree(_dm_origins); 383 } 384 385 static unsigned origin_hash(struct block_device *bdev) 386 { 387 return bdev->bd_dev & ORIGIN_MASK; 388 } 389 390 static struct origin *__lookup_origin(struct block_device *origin) 391 { 392 struct list_head *ol; 393 struct origin *o; 394 395 ol = &_origins[origin_hash(origin)]; 396 list_for_each_entry (o, ol, hash_list) 397 if (bdev_equal(o->bdev, origin)) 398 return o; 399 400 return NULL; 401 } 402 403 static void __insert_origin(struct origin *o) 404 { 405 struct list_head *sl = &_origins[origin_hash(o->bdev)]; 406 list_add_tail(&o->hash_list, sl); 407 } 408 409 static struct dm_origin *__lookup_dm_origin(struct block_device *origin) 410 { 411 struct list_head *ol; 412 struct dm_origin *o; 413 414 ol = 
&_dm_origins[origin_hash(origin)]; 415 list_for_each_entry (o, ol, hash_list) 416 if (bdev_equal(o->dev->bdev, origin)) 417 return o; 418 419 return NULL; 420 } 421 422 static void __insert_dm_origin(struct dm_origin *o) 423 { 424 struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)]; 425 list_add_tail(&o->hash_list, sl); 426 } 427 428 static void __remove_dm_origin(struct dm_origin *o) 429 { 430 list_del(&o->hash_list); 431 } 432 433 /* 434 * _origins_lock must be held when calling this function. 435 * Returns number of snapshots registered using the supplied cow device, plus: 436 * snap_src - a snapshot suitable for use as a source of exception handover 437 * snap_dest - a snapshot capable of receiving exception handover. 438 * snap_merge - an existing snapshot-merge target linked to the same origin. 439 * There can be at most one snapshot-merge target. The parameter is optional. 440 * 441 * Possible return values and states of snap_src and snap_dest. 442 * 0: NULL, NULL - first new snapshot 443 * 1: snap_src, NULL - normal snapshot 444 * 2: snap_src, snap_dest - waiting for handover 445 * 2: snap_src, NULL - handed over, waiting for old to be deleted 446 * 1: NULL, snap_dest - source got destroyed without handover 447 */ 448 static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, 449 struct dm_snapshot **snap_src, 450 struct dm_snapshot **snap_dest, 451 struct dm_snapshot **snap_merge) 452 { 453 struct dm_snapshot *s; 454 struct origin *o; 455 int count = 0; 456 int active; 457 458 o = __lookup_origin(snap->origin->bdev); 459 if (!o) 460 goto out; 461 462 list_for_each_entry(s, &o->snapshots, list) { 463 if (dm_target_is_snapshot_merge(s->ti) && snap_merge) 464 *snap_merge = s; 465 if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) 466 continue; 467 468 down_read(&s->lock); 469 active = s->active; 470 up_read(&s->lock); 471 472 if (active) { 473 if (snap_src) 474 *snap_src = s; 475 } else if (snap_dest) 476 *snap_dest = s; 477 478 count++; 479 } 
480 481 out: 482 return count; 483 } 484 485 /* 486 * On success, returns 1 if this snapshot is a handover destination, 487 * otherwise returns 0. 488 */ 489 static int __validate_exception_handover(struct dm_snapshot *snap) 490 { 491 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 492 struct dm_snapshot *snap_merge = NULL; 493 494 /* Does snapshot need exceptions handed over to it? */ 495 if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, 496 &snap_merge) == 2) || 497 snap_dest) { 498 snap->ti->error = "Snapshot cow pairing for exception " 499 "table handover failed"; 500 return -EINVAL; 501 } 502 503 /* 504 * If no snap_src was found, snap cannot become a handover 505 * destination. 506 */ 507 if (!snap_src) 508 return 0; 509 510 /* 511 * Non-snapshot-merge handover? 512 */ 513 if (!dm_target_is_snapshot_merge(snap->ti)) 514 return 1; 515 516 /* 517 * Do not allow more than one merging snapshot. 518 */ 519 if (snap_merge) { 520 snap->ti->error = "A snapshot is already merging."; 521 return -EINVAL; 522 } 523 524 if (!snap_src->store->type->prepare_merge || 525 !snap_src->store->type->commit_merge) { 526 snap->ti->error = "Snapshot exception store does not " 527 "support snapshot-merge."; 528 return -EINVAL; 529 } 530 531 return 1; 532 } 533 534 static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) 535 { 536 struct dm_snapshot *l; 537 538 /* Sort the list according to chunk size, largest-first smallest-last */ 539 list_for_each_entry(l, &o->snapshots, list) 540 if (l->store->chunk_size < s->store->chunk_size) 541 break; 542 list_add_tail(&s->list, &l->list); 543 } 544 545 /* 546 * Make a note of the snapshot and its origin so we can look it 547 * up when the origin has a write on it. 548 * 549 * Also validate snapshot exception store handovers. 550 * On success, returns 1 if this registration is a handover destination, 551 * otherwise returns 0. 
552 */ 553 static int register_snapshot(struct dm_snapshot *snap) 554 { 555 struct origin *o, *new_o = NULL; 556 struct block_device *bdev = snap->origin->bdev; 557 int r = 0; 558 559 new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); 560 if (!new_o) 561 return -ENOMEM; 562 563 down_write(&_origins_lock); 564 565 r = __validate_exception_handover(snap); 566 if (r < 0) { 567 kfree(new_o); 568 goto out; 569 } 570 571 o = __lookup_origin(bdev); 572 if (o) 573 kfree(new_o); 574 else { 575 /* New origin */ 576 o = new_o; 577 578 /* Initialise the struct */ 579 INIT_LIST_HEAD(&o->snapshots); 580 o->bdev = bdev; 581 582 __insert_origin(o); 583 } 584 585 __insert_snapshot(o, snap); 586 587 out: 588 up_write(&_origins_lock); 589 590 return r; 591 } 592 593 /* 594 * Move snapshot to correct place in list according to chunk size. 595 */ 596 static void reregister_snapshot(struct dm_snapshot *s) 597 { 598 struct block_device *bdev = s->origin->bdev; 599 600 down_write(&_origins_lock); 601 602 list_del(&s->list); 603 __insert_snapshot(__lookup_origin(bdev), s); 604 605 up_write(&_origins_lock); 606 } 607 608 static void unregister_snapshot(struct dm_snapshot *s) 609 { 610 struct origin *o; 611 612 down_write(&_origins_lock); 613 o = __lookup_origin(s->origin->bdev); 614 615 list_del(&s->list); 616 if (o && list_empty(&o->snapshots)) { 617 list_del(&o->hash_list); 618 kfree(o); 619 } 620 621 up_write(&_origins_lock); 622 } 623 624 /* 625 * Implementation of the exception hash tables. 626 * The lowest hash_shift bits of the chunk number are ignored, allowing 627 * some consecutive chunks to be grouped together. 628 */ 629 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); 630 631 /* Lock to protect access to the completed and pending exception hash tables. 
*/ 632 struct dm_exception_table_lock { 633 struct hlist_bl_head *complete_slot; 634 struct hlist_bl_head *pending_slot; 635 }; 636 637 static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, 638 struct dm_exception_table_lock *lock) 639 { 640 struct dm_exception_table *complete = &s->complete; 641 struct dm_exception_table *pending = &s->pending; 642 643 lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; 644 lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; 645 } 646 647 static void dm_exception_table_lock(struct dm_exception_table_lock *lock) 648 { 649 hlist_bl_lock(lock->complete_slot); 650 hlist_bl_lock(lock->pending_slot); 651 } 652 653 static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) 654 { 655 hlist_bl_unlock(lock->pending_slot); 656 hlist_bl_unlock(lock->complete_slot); 657 } 658 659 static int dm_exception_table_init(struct dm_exception_table *et, 660 uint32_t size, unsigned hash_shift) 661 { 662 unsigned int i; 663 664 et->hash_shift = hash_shift; 665 et->hash_mask = size - 1; 666 et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head), 667 GFP_KERNEL); 668 if (!et->table) 669 return -ENOMEM; 670 671 for (i = 0; i < size; i++) 672 INIT_HLIST_BL_HEAD(et->table + i); 673 674 return 0; 675 } 676 677 static void dm_exception_table_exit(struct dm_exception_table *et, 678 struct kmem_cache *mem) 679 { 680 struct hlist_bl_head *slot; 681 struct dm_exception *ex; 682 struct hlist_bl_node *pos, *n; 683 int i, size; 684 685 size = et->hash_mask + 1; 686 for (i = 0; i < size; i++) { 687 slot = et->table + i; 688 689 hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) 690 kmem_cache_free(mem, ex); 691 } 692 693 kvfree(et->table); 694 } 695 696 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) 697 { 698 return (chunk >> et->hash_shift) & et->hash_mask; 699 } 700 701 static void dm_remove_exception(struct dm_exception *e) 702 { 703 
hlist_bl_del(&e->hash_list); 704 } 705 706 /* 707 * Return the exception data for a sector, or NULL if not 708 * remapped. 709 */ 710 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, 711 chunk_t chunk) 712 { 713 struct hlist_bl_head *slot; 714 struct hlist_bl_node *pos; 715 struct dm_exception *e; 716 717 slot = &et->table[exception_hash(et, chunk)]; 718 hlist_bl_for_each_entry(e, pos, slot, hash_list) 719 if (chunk >= e->old_chunk && 720 chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) 721 return e; 722 723 return NULL; 724 } 725 726 static struct dm_exception *alloc_completed_exception(gfp_t gfp) 727 { 728 struct dm_exception *e; 729 730 e = kmem_cache_alloc(exception_cache, gfp); 731 if (!e && gfp == GFP_NOIO) 732 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); 733 734 return e; 735 } 736 737 static void free_completed_exception(struct dm_exception *e) 738 { 739 kmem_cache_free(exception_cache, e); 740 } 741 742 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s) 743 { 744 struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool, 745 GFP_NOIO); 746 747 atomic_inc(&s->pending_exceptions_count); 748 pe->snap = s; 749 750 return pe; 751 } 752 753 static void free_pending_exception(struct dm_snap_pending_exception *pe) 754 { 755 struct dm_snapshot *s = pe->snap; 756 757 mempool_free(pe, &s->pending_pool); 758 smp_mb__before_atomic(); 759 atomic_dec(&s->pending_exceptions_count); 760 } 761 762 static void dm_insert_exception(struct dm_exception_table *eh, 763 struct dm_exception *new_e) 764 { 765 struct hlist_bl_head *l; 766 struct hlist_bl_node *pos; 767 struct dm_exception *e = NULL; 768 769 l = &eh->table[exception_hash(eh, new_e->old_chunk)]; 770 771 /* Add immediately if this table doesn't support consecutive chunks */ 772 if (!eh->hash_shift) 773 goto out; 774 775 /* List is ordered by old_chunk */ 776 hlist_bl_for_each_entry(e, pos, l, hash_list) { 777 /* Insert after an 
existing chunk? */ 778 if (new_e->old_chunk == (e->old_chunk + 779 dm_consecutive_chunk_count(e) + 1) && 780 new_e->new_chunk == (dm_chunk_number(e->new_chunk) + 781 dm_consecutive_chunk_count(e) + 1)) { 782 dm_consecutive_chunk_count_inc(e); 783 free_completed_exception(new_e); 784 return; 785 } 786 787 /* Insert before an existing chunk? */ 788 if (new_e->old_chunk == (e->old_chunk - 1) && 789 new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) { 790 dm_consecutive_chunk_count_inc(e); 791 e->old_chunk--; 792 e->new_chunk--; 793 free_completed_exception(new_e); 794 return; 795 } 796 797 if (new_e->old_chunk < e->old_chunk) 798 break; 799 } 800 801 out: 802 if (!e) { 803 /* 804 * Either the table doesn't support consecutive chunks or slot 805 * l is empty. 806 */ 807 hlist_bl_add_head(&new_e->hash_list, l); 808 } else if (new_e->old_chunk < e->old_chunk) { 809 /* Add before an existing exception */ 810 hlist_bl_add_before(&new_e->hash_list, &e->hash_list); 811 } else { 812 /* Add to l's tail: e is the last exception in this slot */ 813 hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); 814 } 815 } 816 817 /* 818 * Callback used by the exception stores to load exceptions when 819 * initialising. 820 */ 821 static int dm_add_exception(void *context, chunk_t old, chunk_t new) 822 { 823 struct dm_exception_table_lock lock; 824 struct dm_snapshot *s = context; 825 struct dm_exception *e; 826 827 e = alloc_completed_exception(GFP_KERNEL); 828 if (!e) 829 return -ENOMEM; 830 831 e->old_chunk = old; 832 833 /* Consecutive_count is implicitly initialised to zero */ 834 e->new_chunk = new; 835 836 /* 837 * Although there is no need to lock access to the exception tables 838 * here, if we don't then hlist_bl_add_head(), called by 839 * dm_insert_exception(), will complain about accessing the 840 * corresponding list without locking it first. 
841 */ 842 dm_exception_table_lock_init(s, old, &lock); 843 844 dm_exception_table_lock(&lock); 845 dm_insert_exception(&s->complete, e); 846 dm_exception_table_unlock(&lock); 847 848 return 0; 849 } 850 851 /* 852 * Return a minimum chunk size of all snapshots that have the specified origin. 853 * Return zero if the origin has no snapshots. 854 */ 855 static uint32_t __minimum_chunk_size(struct origin *o) 856 { 857 struct dm_snapshot *snap; 858 unsigned chunk_size = rounddown_pow_of_two(UINT_MAX); 859 860 if (o) 861 list_for_each_entry(snap, &o->snapshots, list) 862 chunk_size = min_not_zero(chunk_size, 863 snap->store->chunk_size); 864 865 return (uint32_t) chunk_size; 866 } 867 868 /* 869 * Hard coded magic. 870 */ 871 static int calc_max_buckets(void) 872 { 873 /* use a fixed size of 2MB */ 874 unsigned long mem = 2 * 1024 * 1024; 875 mem /= sizeof(struct hlist_bl_head); 876 877 return mem; 878 } 879 880 /* 881 * Allocate room for a suitable hash table. 882 */ 883 static int init_hash_tables(struct dm_snapshot *s) 884 { 885 sector_t hash_size, cow_dev_size, max_buckets; 886 887 /* 888 * Calculate based on the size of the original volume or 889 * the COW volume... 
890 */ 891 cow_dev_size = get_dev_size(s->cow->bdev); 892 max_buckets = calc_max_buckets(); 893 894 hash_size = cow_dev_size >> s->store->chunk_shift; 895 hash_size = min(hash_size, max_buckets); 896 897 if (hash_size < 64) 898 hash_size = 64; 899 hash_size = rounddown_pow_of_two(hash_size); 900 if (dm_exception_table_init(&s->complete, hash_size, 901 DM_CHUNK_CONSECUTIVE_BITS)) 902 return -ENOMEM; 903 904 /* 905 * Allocate hash table for in-flight exceptions 906 * Make this smaller than the real hash table 907 */ 908 hash_size >>= 3; 909 if (hash_size < 64) 910 hash_size = 64; 911 912 if (dm_exception_table_init(&s->pending, hash_size, 0)) { 913 dm_exception_table_exit(&s->complete, exception_cache); 914 return -ENOMEM; 915 } 916 917 return 0; 918 } 919 920 static void merge_shutdown(struct dm_snapshot *s) 921 { 922 clear_bit_unlock(RUNNING_MERGE, &s->state_bits); 923 smp_mb__after_atomic(); 924 wake_up_bit(&s->state_bits, RUNNING_MERGE); 925 } 926 927 static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) 928 { 929 s->first_merging_chunk = 0; 930 s->num_merging_chunks = 0; 931 932 return bio_list_get(&s->bios_queued_during_merge); 933 } 934 935 /* 936 * Remove one chunk from the index of completed exceptions. 937 */ 938 static int __remove_single_exception_chunk(struct dm_snapshot *s, 939 chunk_t old_chunk) 940 { 941 struct dm_exception *e; 942 943 e = dm_lookup_exception(&s->complete, old_chunk); 944 if (!e) { 945 DMERR("Corruption detected: exception for block %llu is " 946 "on disk but not in memory", 947 (unsigned long long)old_chunk); 948 return -EINVAL; 949 } 950 951 /* 952 * If this is the only chunk using this exception, remove exception. 953 */ 954 if (!dm_consecutive_chunk_count(e)) { 955 dm_remove_exception(e); 956 free_completed_exception(e); 957 return 0; 958 } 959 960 /* 961 * The chunk may be either at the beginning or the end of a 962 * group of consecutive chunks - never in the middle. 
We are 963 * removing chunks in the opposite order to that in which they 964 * were added, so this should always be true. 965 * Decrement the consecutive chunk counter and adjust the 966 * starting point if necessary. 967 */ 968 if (old_chunk == e->old_chunk) { 969 e->old_chunk++; 970 e->new_chunk++; 971 } else if (old_chunk != e->old_chunk + 972 dm_consecutive_chunk_count(e)) { 973 DMERR("Attempt to merge block %llu from the " 974 "middle of a chunk range [%llu - %llu]", 975 (unsigned long long)old_chunk, 976 (unsigned long long)e->old_chunk, 977 (unsigned long long) 978 e->old_chunk + dm_consecutive_chunk_count(e)); 979 return -EINVAL; 980 } 981 982 dm_consecutive_chunk_count_dec(e); 983 984 return 0; 985 } 986 987 static void flush_bios(struct bio *bio); 988 989 static int remove_single_exception_chunk(struct dm_snapshot *s) 990 { 991 struct bio *b = NULL; 992 int r; 993 chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; 994 995 down_write(&s->lock); 996 997 /* 998 * Process chunks (and associated exceptions) in reverse order 999 * so that dm_consecutive_chunk_count_dec() accounting works. 
1000 */ 1001 do { 1002 r = __remove_single_exception_chunk(s, old_chunk); 1003 if (r) 1004 goto out; 1005 } while (old_chunk-- > s->first_merging_chunk); 1006 1007 b = __release_queued_bios_after_merge(s); 1008 1009 out: 1010 up_write(&s->lock); 1011 if (b) 1012 flush_bios(b); 1013 1014 return r; 1015 } 1016 1017 static int origin_write_extent(struct dm_snapshot *merging_snap, 1018 sector_t sector, unsigned chunk_size); 1019 1020 static void merge_callback(int read_err, unsigned long write_err, 1021 void *context); 1022 1023 static uint64_t read_pending_exceptions_done_count(void) 1024 { 1025 uint64_t pending_exceptions_done; 1026 1027 spin_lock(&_pending_exceptions_done_spinlock); 1028 pending_exceptions_done = _pending_exceptions_done_count; 1029 spin_unlock(&_pending_exceptions_done_spinlock); 1030 1031 return pending_exceptions_done; 1032 } 1033 1034 static void increment_pending_exceptions_done_count(void) 1035 { 1036 spin_lock(&_pending_exceptions_done_spinlock); 1037 _pending_exceptions_done_count++; 1038 spin_unlock(&_pending_exceptions_done_spinlock); 1039 1040 wake_up_all(&_pending_exceptions_done); 1041 } 1042 1043 static void snapshot_merge_next_chunks(struct dm_snapshot *s) 1044 { 1045 int i, linear_chunks; 1046 chunk_t old_chunk, new_chunk; 1047 struct dm_io_region src, dest; 1048 sector_t io_size; 1049 uint64_t previous_count; 1050 1051 BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); 1052 if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) 1053 goto shut; 1054 1055 /* 1056 * valid flag never changes during merge, so no lock required. 
1057 */ 1058 if (!s->valid) { 1059 DMERR("Snapshot is invalid: can't merge"); 1060 goto shut; 1061 } 1062 1063 linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, 1064 &new_chunk); 1065 if (linear_chunks <= 0) { 1066 if (linear_chunks < 0) { 1067 DMERR("Read error in exception store: " 1068 "shutting down merge"); 1069 down_write(&s->lock); 1070 s->merge_failed = true; 1071 up_write(&s->lock); 1072 } 1073 goto shut; 1074 } 1075 1076 /* Adjust old_chunk and new_chunk to reflect start of linear region */ 1077 old_chunk = old_chunk + 1 - linear_chunks; 1078 new_chunk = new_chunk + 1 - linear_chunks; 1079 1080 /* 1081 * Use one (potentially large) I/O to copy all 'linear_chunks' 1082 * from the exception store to the origin 1083 */ 1084 io_size = linear_chunks * s->store->chunk_size; 1085 1086 dest.bdev = s->origin->bdev; 1087 dest.sector = chunk_to_sector(s->store, old_chunk); 1088 dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); 1089 1090 src.bdev = s->cow->bdev; 1091 src.sector = chunk_to_sector(s->store, new_chunk); 1092 src.count = dest.count; 1093 1094 /* 1095 * Reallocate any exceptions needed in other snapshots then 1096 * wait for the pending exceptions to complete. 1097 * Each time any pending exception (globally on the system) 1098 * completes we are woken and repeat the process to find out 1099 * if we can proceed. While this may not seem a particularly 1100 * efficient algorithm, it is not expected to have any 1101 * significant impact on performance. 1102 */ 1103 previous_count = read_pending_exceptions_done_count(); 1104 while (origin_write_extent(s, dest.sector, io_size)) { 1105 wait_event(_pending_exceptions_done, 1106 (read_pending_exceptions_done_count() != 1107 previous_count)); 1108 /* Retry after the wait, until all exceptions are done. 
*/ 1109 previous_count = read_pending_exceptions_done_count(); 1110 } 1111 1112 down_write(&s->lock); 1113 s->first_merging_chunk = old_chunk; 1114 s->num_merging_chunks = linear_chunks; 1115 up_write(&s->lock); 1116 1117 /* Wait until writes to all 'linear_chunks' drain */ 1118 for (i = 0; i < linear_chunks; i++) 1119 __check_for_conflicting_io(s, old_chunk + i); 1120 1121 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); 1122 return; 1123 1124 shut: 1125 merge_shutdown(s); 1126 } 1127 1128 static void error_bios(struct bio *bio); 1129 1130 static int flush_data(struct dm_snapshot *s) 1131 { 1132 struct bio *flush_bio = &s->flush_bio; 1133 1134 bio_reset(flush_bio); 1135 bio_set_dev(flush_bio, s->origin->bdev); 1136 flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 1137 1138 return submit_bio_wait(flush_bio); 1139 } 1140 1141 static void merge_callback(int read_err, unsigned long write_err, void *context) 1142 { 1143 struct dm_snapshot *s = context; 1144 struct bio *b = NULL; 1145 1146 if (read_err || write_err) { 1147 if (read_err) 1148 DMERR("Read error: shutting down merge."); 1149 else 1150 DMERR("Write error: shutting down merge."); 1151 goto shut; 1152 } 1153 1154 if (flush_data(s) < 0) { 1155 DMERR("Flush after merge failed: shutting down merge"); 1156 goto shut; 1157 } 1158 1159 if (s->store->type->commit_merge(s->store, 1160 s->num_merging_chunks) < 0) { 1161 DMERR("Write error in exception store: shutting down merge"); 1162 goto shut; 1163 } 1164 1165 if (remove_single_exception_chunk(s) < 0) 1166 goto shut; 1167 1168 snapshot_merge_next_chunks(s); 1169 1170 return; 1171 1172 shut: 1173 down_write(&s->lock); 1174 s->merge_failed = true; 1175 b = __release_queued_bios_after_merge(s); 1176 up_write(&s->lock); 1177 error_bios(b); 1178 1179 merge_shutdown(s); 1180 } 1181 1182 static void start_merge(struct dm_snapshot *s) 1183 { 1184 if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) 1185 snapshot_merge_next_chunks(s); 1186 } 1187 
1188 /* 1189 * Stop the merging process and wait until it finishes. 1190 */ 1191 static void stop_merge(struct dm_snapshot *s) 1192 { 1193 set_bit(SHUTDOWN_MERGE, &s->state_bits); 1194 wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE); 1195 clear_bit(SHUTDOWN_MERGE, &s->state_bits); 1196 } 1197 1198 static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s, 1199 struct dm_target *ti) 1200 { 1201 int r; 1202 unsigned argc; 1203 const char *arg_name; 1204 1205 static const struct dm_arg _args[] = { 1206 {0, 2, "Invalid number of feature arguments"}, 1207 }; 1208 1209 /* 1210 * No feature arguments supplied. 1211 */ 1212 if (!as->argc) 1213 return 0; 1214 1215 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1216 if (r) 1217 return -EINVAL; 1218 1219 while (argc && !r) { 1220 arg_name = dm_shift_arg(as); 1221 argc--; 1222 1223 if (!strcasecmp(arg_name, "discard_zeroes_cow")) 1224 s->discard_zeroes_cow = true; 1225 1226 else if (!strcasecmp(arg_name, "discard_passdown_origin")) 1227 s->discard_passdown_origin = true; 1228 1229 else { 1230 ti->error = "Unrecognised feature requested"; 1231 r = -EINVAL; 1232 break; 1233 } 1234 } 1235 1236 if (!s->discard_zeroes_cow && s->discard_passdown_origin) { 1237 /* 1238 * TODO: really these are disjoint.. but ti->num_discard_bios 1239 * and dm_bio_get_target_bio_nr() require rigid constraints. 
1240 */ 1241 ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow"; 1242 r = -EINVAL; 1243 } 1244 1245 return r; 1246 } 1247 1248 /* 1249 * Construct a snapshot mapping: 1250 * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*] 1251 */ 1252 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) 1253 { 1254 struct dm_snapshot *s; 1255 struct dm_arg_set as; 1256 int i; 1257 int r = -EINVAL; 1258 char *origin_path, *cow_path; 1259 dev_t origin_dev, cow_dev; 1260 unsigned args_used, num_flush_bios = 1; 1261 fmode_t origin_mode = FMODE_READ; 1262 1263 if (argc < 4) { 1264 ti->error = "requires 4 or more arguments"; 1265 r = -EINVAL; 1266 goto bad; 1267 } 1268 1269 if (dm_target_is_snapshot_merge(ti)) { 1270 num_flush_bios = 2; 1271 origin_mode = FMODE_WRITE; 1272 } 1273 1274 s = kzalloc(sizeof(*s), GFP_KERNEL); 1275 if (!s) { 1276 ti->error = "Cannot allocate private snapshot structure"; 1277 r = -ENOMEM; 1278 goto bad; 1279 } 1280 1281 as.argc = argc; 1282 as.argv = argv; 1283 dm_consume_args(&as, 4); 1284 r = parse_snapshot_features(&as, s, ti); 1285 if (r) 1286 goto bad_features; 1287 1288 origin_path = argv[0]; 1289 argv++; 1290 argc--; 1291 1292 r = dm_get_device(ti, origin_path, origin_mode, &s->origin); 1293 if (r) { 1294 ti->error = "Cannot get origin device"; 1295 goto bad_origin; 1296 } 1297 origin_dev = s->origin->bdev->bd_dev; 1298 1299 cow_path = argv[0]; 1300 argv++; 1301 argc--; 1302 1303 cow_dev = dm_get_dev_t(cow_path); 1304 if (cow_dev && cow_dev == origin_dev) { 1305 ti->error = "COW device cannot be the same as origin device"; 1306 r = -EINVAL; 1307 goto bad_cow; 1308 } 1309 1310 r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); 1311 if (r) { 1312 ti->error = "Cannot get COW device"; 1313 goto bad_cow; 1314 } 1315 1316 r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); 1317 if (r) { 1318 ti->error = "Couldn't create exception store"; 1319 r 
= -EINVAL; 1320 goto bad_store; 1321 } 1322 1323 argv += args_used; 1324 argc -= args_used; 1325 1326 s->ti = ti; 1327 s->valid = 1; 1328 s->snapshot_overflowed = 0; 1329 s->active = 0; 1330 atomic_set(&s->pending_exceptions_count, 0); 1331 spin_lock_init(&s->pe_allocation_lock); 1332 s->exception_start_sequence = 0; 1333 s->exception_complete_sequence = 0; 1334 s->out_of_order_tree = RB_ROOT; 1335 init_rwsem(&s->lock); 1336 INIT_LIST_HEAD(&s->list); 1337 spin_lock_init(&s->pe_lock); 1338 s->state_bits = 0; 1339 s->merge_failed = false; 1340 s->first_merging_chunk = 0; 1341 s->num_merging_chunks = 0; 1342 bio_list_init(&s->bios_queued_during_merge); 1343 bio_init(&s->flush_bio, NULL, 0); 1344 1345 /* Allocate hash table for COW data */ 1346 if (init_hash_tables(s)) { 1347 ti->error = "Unable to allocate hash table space"; 1348 r = -ENOMEM; 1349 goto bad_hash_tables; 1350 } 1351 1352 init_waitqueue_head(&s->in_progress_wait); 1353 1354 s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); 1355 if (IS_ERR(s->kcopyd_client)) { 1356 r = PTR_ERR(s->kcopyd_client); 1357 ti->error = "Could not create kcopyd client"; 1358 goto bad_kcopyd; 1359 } 1360 1361 r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache); 1362 if (r) { 1363 ti->error = "Could not allocate mempool for pending exceptions"; 1364 goto bad_pending_pool; 1365 } 1366 1367 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) 1368 INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]); 1369 1370 spin_lock_init(&s->tracked_chunk_lock); 1371 1372 ti->private = s; 1373 ti->num_flush_bios = num_flush_bios; 1374 if (s->discard_zeroes_cow) 1375 ti->num_discard_bios = (s->discard_passdown_origin ? 
2 : 1); 1376 ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk); 1377 1378 /* Add snapshot to the list of snapshots for this origin */ 1379 /* Exceptions aren't triggered till snapshot_resume() is called */ 1380 r = register_snapshot(s); 1381 if (r == -ENOMEM) { 1382 ti->error = "Snapshot origin struct allocation failed"; 1383 goto bad_load_and_register; 1384 } else if (r < 0) { 1385 /* invalid handover, register_snapshot has set ti->error */ 1386 goto bad_load_and_register; 1387 } 1388 1389 /* 1390 * Metadata must only be loaded into one table at once, so skip this 1391 * if metadata will be handed over during resume. 1392 * Chunk size will be set during the handover - set it to zero to 1393 * ensure it's ignored. 1394 */ 1395 if (r > 0) { 1396 s->store->chunk_size = 0; 1397 return 0; 1398 } 1399 1400 r = s->store->type->read_metadata(s->store, dm_add_exception, 1401 (void *)s); 1402 if (r < 0) { 1403 ti->error = "Failed to read snapshot metadata"; 1404 goto bad_read_metadata; 1405 } else if (r > 0) { 1406 s->valid = 0; 1407 DMWARN("Snapshot is marked invalid."); 1408 } 1409 1410 if (!s->store->chunk_size) { 1411 ti->error = "Chunk size not set"; 1412 r = -EINVAL; 1413 goto bad_read_metadata; 1414 } 1415 1416 r = dm_set_target_max_io_len(ti, s->store->chunk_size); 1417 if (r) 1418 goto bad_read_metadata; 1419 1420 return 0; 1421 1422 bad_read_metadata: 1423 unregister_snapshot(s); 1424 bad_load_and_register: 1425 mempool_exit(&s->pending_pool); 1426 bad_pending_pool: 1427 dm_kcopyd_client_destroy(s->kcopyd_client); 1428 bad_kcopyd: 1429 dm_exception_table_exit(&s->pending, pending_cache); 1430 dm_exception_table_exit(&s->complete, exception_cache); 1431 bad_hash_tables: 1432 dm_exception_store_destroy(s->store); 1433 bad_store: 1434 dm_put_device(ti, s->cow); 1435 bad_cow: 1436 dm_put_device(ti, s->origin); 1437 bad_origin: 1438 bad_features: 1439 kfree(s); 1440 bad: 1441 return r; 1442 } 1443 1444 static void __free_exceptions(struct dm_snapshot *s) 1445 
{ 1446 dm_kcopyd_client_destroy(s->kcopyd_client); 1447 s->kcopyd_client = NULL; 1448 1449 dm_exception_table_exit(&s->pending, pending_cache); 1450 dm_exception_table_exit(&s->complete, exception_cache); 1451 } 1452 1453 static void __handover_exceptions(struct dm_snapshot *snap_src, 1454 struct dm_snapshot *snap_dest) 1455 { 1456 union { 1457 struct dm_exception_table table_swap; 1458 struct dm_exception_store *store_swap; 1459 } u; 1460 1461 /* 1462 * Swap all snapshot context information between the two instances. 1463 */ 1464 u.table_swap = snap_dest->complete; 1465 snap_dest->complete = snap_src->complete; 1466 snap_src->complete = u.table_swap; 1467 1468 u.store_swap = snap_dest->store; 1469 snap_dest->store = snap_src->store; 1470 snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow; 1471 snap_src->store = u.store_swap; 1472 1473 snap_dest->store->snap = snap_dest; 1474 snap_src->store->snap = snap_src; 1475 1476 snap_dest->ti->max_io_len = snap_dest->store->chunk_size; 1477 snap_dest->valid = snap_src->valid; 1478 snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed; 1479 1480 /* 1481 * Set source invalid to ensure it receives no further I/O. 
1482 */ 1483 snap_src->valid = 0; 1484 } 1485 1486 static void snapshot_dtr(struct dm_target *ti) 1487 { 1488 #ifdef CONFIG_DM_DEBUG 1489 int i; 1490 #endif 1491 struct dm_snapshot *s = ti->private; 1492 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 1493 1494 down_read(&_origins_lock); 1495 /* Check whether exception handover must be cancelled */ 1496 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 1497 if (snap_src && snap_dest && (s == snap_src)) { 1498 down_write(&snap_dest->lock); 1499 snap_dest->valid = 0; 1500 up_write(&snap_dest->lock); 1501 DMERR("Cancelling snapshot handover."); 1502 } 1503 up_read(&_origins_lock); 1504 1505 if (dm_target_is_snapshot_merge(ti)) 1506 stop_merge(s); 1507 1508 /* Prevent further origin writes from using this snapshot. */ 1509 /* After this returns there can be no new kcopyd jobs. */ 1510 unregister_snapshot(s); 1511 1512 while (atomic_read(&s->pending_exceptions_count)) 1513 msleep(1); 1514 /* 1515 * Ensure instructions in mempool_exit aren't reordered 1516 * before atomic_read. 
1517 */ 1518 smp_mb(); 1519 1520 #ifdef CONFIG_DM_DEBUG 1521 for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) 1522 BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); 1523 #endif 1524 1525 __free_exceptions(s); 1526 1527 mempool_exit(&s->pending_pool); 1528 1529 dm_exception_store_destroy(s->store); 1530 1531 bio_uninit(&s->flush_bio); 1532 1533 dm_put_device(ti, s->cow); 1534 1535 dm_put_device(ti, s->origin); 1536 1537 WARN_ON(s->in_progress); 1538 1539 kfree(s); 1540 } 1541 1542 static void account_start_copy(struct dm_snapshot *s) 1543 { 1544 spin_lock(&s->in_progress_wait.lock); 1545 s->in_progress++; 1546 spin_unlock(&s->in_progress_wait.lock); 1547 } 1548 1549 static void account_end_copy(struct dm_snapshot *s) 1550 { 1551 spin_lock(&s->in_progress_wait.lock); 1552 BUG_ON(!s->in_progress); 1553 s->in_progress--; 1554 if (likely(s->in_progress <= cow_threshold) && 1555 unlikely(waitqueue_active(&s->in_progress_wait))) 1556 wake_up_locked(&s->in_progress_wait); 1557 spin_unlock(&s->in_progress_wait.lock); 1558 } 1559 1560 static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins) 1561 { 1562 if (unlikely(s->in_progress > cow_threshold)) { 1563 spin_lock(&s->in_progress_wait.lock); 1564 if (likely(s->in_progress > cow_threshold)) { 1565 /* 1566 * NOTE: this throttle doesn't account for whether 1567 * the caller is servicing an IO that will trigger a COW 1568 * so excess throttling may result for chunks not required 1569 * to be COW'd. But if cow_threshold was reached, extra 1570 * throttling is unlikely to negatively impact performance. 
1571 */ 1572 DECLARE_WAITQUEUE(wait, current); 1573 __add_wait_queue(&s->in_progress_wait, &wait); 1574 __set_current_state(TASK_UNINTERRUPTIBLE); 1575 spin_unlock(&s->in_progress_wait.lock); 1576 if (unlock_origins) 1577 up_read(&_origins_lock); 1578 io_schedule(); 1579 remove_wait_queue(&s->in_progress_wait, &wait); 1580 return false; 1581 } 1582 spin_unlock(&s->in_progress_wait.lock); 1583 } 1584 return true; 1585 } 1586 1587 /* 1588 * Flush a list of buffers. 1589 */ 1590 static void flush_bios(struct bio *bio) 1591 { 1592 struct bio *n; 1593 1594 while (bio) { 1595 n = bio->bi_next; 1596 bio->bi_next = NULL; 1597 submit_bio_noacct(bio); 1598 bio = n; 1599 } 1600 } 1601 1602 static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit); 1603 1604 /* 1605 * Flush a list of buffers. 1606 */ 1607 static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) 1608 { 1609 struct bio *n; 1610 int r; 1611 1612 while (bio) { 1613 n = bio->bi_next; 1614 bio->bi_next = NULL; 1615 r = do_origin(s->origin, bio, false); 1616 if (r == DM_MAPIO_REMAPPED) 1617 submit_bio_noacct(bio); 1618 bio = n; 1619 } 1620 } 1621 1622 /* 1623 * Error a list of buffers. 
1624 */ 1625 static void error_bios(struct bio *bio) 1626 { 1627 struct bio *n; 1628 1629 while (bio) { 1630 n = bio->bi_next; 1631 bio->bi_next = NULL; 1632 bio_io_error(bio); 1633 bio = n; 1634 } 1635 } 1636 1637 static void __invalidate_snapshot(struct dm_snapshot *s, int err) 1638 { 1639 if (!s->valid) 1640 return; 1641 1642 if (err == -EIO) 1643 DMERR("Invalidating snapshot: Error reading/writing."); 1644 else if (err == -ENOMEM) 1645 DMERR("Invalidating snapshot: Unable to allocate exception."); 1646 1647 if (s->store->type->drop_snapshot) 1648 s->store->type->drop_snapshot(s->store); 1649 1650 s->valid = 0; 1651 1652 dm_table_event(s->ti->table); 1653 } 1654 1655 static void invalidate_snapshot(struct dm_snapshot *s, int err) 1656 { 1657 down_write(&s->lock); 1658 __invalidate_snapshot(s, err); 1659 up_write(&s->lock); 1660 } 1661 1662 static void pending_complete(void *context, int success) 1663 { 1664 struct dm_snap_pending_exception *pe = context; 1665 struct dm_exception *e; 1666 struct dm_snapshot *s = pe->snap; 1667 struct bio *origin_bios = NULL; 1668 struct bio *snapshot_bios = NULL; 1669 struct bio *full_bio = NULL; 1670 struct dm_exception_table_lock lock; 1671 int error = 0; 1672 1673 dm_exception_table_lock_init(s, pe->e.old_chunk, &lock); 1674 1675 if (!success) { 1676 /* Read/write error - snapshot is unusable */ 1677 invalidate_snapshot(s, -EIO); 1678 error = 1; 1679 1680 dm_exception_table_lock(&lock); 1681 goto out; 1682 } 1683 1684 e = alloc_completed_exception(GFP_NOIO); 1685 if (!e) { 1686 invalidate_snapshot(s, -ENOMEM); 1687 error = 1; 1688 1689 dm_exception_table_lock(&lock); 1690 goto out; 1691 } 1692 *e = pe->e; 1693 1694 down_read(&s->lock); 1695 dm_exception_table_lock(&lock); 1696 if (!s->valid) { 1697 up_read(&s->lock); 1698 free_completed_exception(e); 1699 error = 1; 1700 1701 goto out; 1702 } 1703 1704 /* 1705 * Add a proper exception. 
After inserting the completed exception all 1706 * subsequent snapshot reads to this chunk will be redirected to the 1707 * COW device. This ensures that we do not starve. Moreover, as long 1708 * as the pending exception exists, neither origin writes nor snapshot 1709 * merging can overwrite the chunk in origin. 1710 */ 1711 dm_insert_exception(&s->complete, e); 1712 up_read(&s->lock); 1713 1714 /* Wait for conflicting reads to drain */ 1715 if (__chunk_is_tracked(s, pe->e.old_chunk)) { 1716 dm_exception_table_unlock(&lock); 1717 __check_for_conflicting_io(s, pe->e.old_chunk); 1718 dm_exception_table_lock(&lock); 1719 } 1720 1721 out: 1722 /* Remove the in-flight exception from the list */ 1723 dm_remove_exception(&pe->e); 1724 1725 dm_exception_table_unlock(&lock); 1726 1727 snapshot_bios = bio_list_get(&pe->snapshot_bios); 1728 origin_bios = bio_list_get(&pe->origin_bios); 1729 full_bio = pe->full_bio; 1730 if (full_bio) 1731 full_bio->bi_end_io = pe->full_bio_end_io; 1732 increment_pending_exceptions_done_count(); 1733 1734 /* Submit any pending write bios */ 1735 if (error) { 1736 if (full_bio) 1737 bio_io_error(full_bio); 1738 error_bios(snapshot_bios); 1739 } else { 1740 if (full_bio) 1741 bio_endio(full_bio); 1742 flush_bios(snapshot_bios); 1743 } 1744 1745 retry_origin_bios(s, origin_bios); 1746 1747 free_pending_exception(pe); 1748 } 1749 1750 static void complete_exception(struct dm_snap_pending_exception *pe) 1751 { 1752 struct dm_snapshot *s = pe->snap; 1753 1754 /* Update the metadata if we are persistent */ 1755 s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error, 1756 pending_complete, pe); 1757 } 1758 1759 /* 1760 * Called when the copy I/O has finished. kcopyd actually runs 1761 * this code so don't block. 
1762 */ 1763 static void copy_callback(int read_err, unsigned long write_err, void *context) 1764 { 1765 struct dm_snap_pending_exception *pe = context; 1766 struct dm_snapshot *s = pe->snap; 1767 1768 pe->copy_error = read_err || write_err; 1769 1770 if (pe->exception_sequence == s->exception_complete_sequence) { 1771 struct rb_node *next; 1772 1773 s->exception_complete_sequence++; 1774 complete_exception(pe); 1775 1776 next = rb_first(&s->out_of_order_tree); 1777 while (next) { 1778 pe = rb_entry(next, struct dm_snap_pending_exception, 1779 out_of_order_node); 1780 if (pe->exception_sequence != s->exception_complete_sequence) 1781 break; 1782 next = rb_next(next); 1783 s->exception_complete_sequence++; 1784 rb_erase(&pe->out_of_order_node, &s->out_of_order_tree); 1785 complete_exception(pe); 1786 cond_resched(); 1787 } 1788 } else { 1789 struct rb_node *parent = NULL; 1790 struct rb_node **p = &s->out_of_order_tree.rb_node; 1791 struct dm_snap_pending_exception *pe2; 1792 1793 while (*p) { 1794 pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node); 1795 parent = *p; 1796 1797 BUG_ON(pe->exception_sequence == pe2->exception_sequence); 1798 if (pe->exception_sequence < pe2->exception_sequence) 1799 p = &((*p)->rb_left); 1800 else 1801 p = &((*p)->rb_right); 1802 } 1803 1804 rb_link_node(&pe->out_of_order_node, parent, p); 1805 rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); 1806 } 1807 account_end_copy(s); 1808 } 1809 1810 /* 1811 * Dispatches the copy operation to kcopyd. 
1812 */ 1813 static void start_copy(struct dm_snap_pending_exception *pe) 1814 { 1815 struct dm_snapshot *s = pe->snap; 1816 struct dm_io_region src, dest; 1817 struct block_device *bdev = s->origin->bdev; 1818 sector_t dev_size; 1819 1820 dev_size = get_dev_size(bdev); 1821 1822 src.bdev = bdev; 1823 src.sector = chunk_to_sector(s->store, pe->e.old_chunk); 1824 src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); 1825 1826 dest.bdev = s->cow->bdev; 1827 dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); 1828 dest.count = src.count; 1829 1830 /* Hand over to kcopyd */ 1831 account_start_copy(s); 1832 dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); 1833 } 1834 1835 static void full_bio_end_io(struct bio *bio) 1836 { 1837 void *callback_data = bio->bi_private; 1838 1839 dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0); 1840 } 1841 1842 static void start_full_bio(struct dm_snap_pending_exception *pe, 1843 struct bio *bio) 1844 { 1845 struct dm_snapshot *s = pe->snap; 1846 void *callback_data; 1847 1848 pe->full_bio = bio; 1849 pe->full_bio_end_io = bio->bi_end_io; 1850 1851 account_start_copy(s); 1852 callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, 1853 copy_callback, pe); 1854 1855 bio->bi_end_io = full_bio_end_io; 1856 bio->bi_private = callback_data; 1857 1858 submit_bio_noacct(bio); 1859 } 1860 1861 static struct dm_snap_pending_exception * 1862 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) 1863 { 1864 struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); 1865 1866 if (!e) 1867 return NULL; 1868 1869 return container_of(e, struct dm_snap_pending_exception, e); 1870 } 1871 1872 /* 1873 * Inserts a pending exception into the pending table. 1874 * 1875 * NOTE: a write lock must be held on the chunk's pending exception table slot 1876 * before calling this. 
1877 */ 1878 static struct dm_snap_pending_exception * 1879 __insert_pending_exception(struct dm_snapshot *s, 1880 struct dm_snap_pending_exception *pe, chunk_t chunk) 1881 { 1882 pe->e.old_chunk = chunk; 1883 bio_list_init(&pe->origin_bios); 1884 bio_list_init(&pe->snapshot_bios); 1885 pe->started = 0; 1886 pe->full_bio = NULL; 1887 1888 spin_lock(&s->pe_allocation_lock); 1889 if (s->store->type->prepare_exception(s->store, &pe->e)) { 1890 spin_unlock(&s->pe_allocation_lock); 1891 free_pending_exception(pe); 1892 return NULL; 1893 } 1894 1895 pe->exception_sequence = s->exception_start_sequence++; 1896 spin_unlock(&s->pe_allocation_lock); 1897 1898 dm_insert_exception(&s->pending, &pe->e); 1899 1900 return pe; 1901 } 1902 1903 /* 1904 * Looks to see if this snapshot already has a pending exception 1905 * for this chunk, otherwise it allocates a new one and inserts 1906 * it into the pending table. 1907 * 1908 * NOTE: a write lock must be held on the chunk's pending exception table slot 1909 * before calling this. 
1910 */ 1911 static struct dm_snap_pending_exception * 1912 __find_pending_exception(struct dm_snapshot *s, 1913 struct dm_snap_pending_exception *pe, chunk_t chunk) 1914 { 1915 struct dm_snap_pending_exception *pe2; 1916 1917 pe2 = __lookup_pending_exception(s, chunk); 1918 if (pe2) { 1919 free_pending_exception(pe); 1920 return pe2; 1921 } 1922 1923 return __insert_pending_exception(s, pe, chunk); 1924 } 1925 1926 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, 1927 struct bio *bio, chunk_t chunk) 1928 { 1929 bio_set_dev(bio, s->cow->bdev); 1930 bio->bi_iter.bi_sector = 1931 chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + 1932 (chunk - e->old_chunk)) + 1933 (bio->bi_iter.bi_sector & s->store->chunk_mask); 1934 } 1935 1936 static void zero_callback(int read_err, unsigned long write_err, void *context) 1937 { 1938 struct bio *bio = context; 1939 struct dm_snapshot *s = bio->bi_private; 1940 1941 account_end_copy(s); 1942 bio->bi_status = write_err ? BLK_STS_IOERR : 0; 1943 bio_endio(bio); 1944 } 1945 1946 static void zero_exception(struct dm_snapshot *s, struct dm_exception *e, 1947 struct bio *bio, chunk_t chunk) 1948 { 1949 struct dm_io_region dest; 1950 1951 dest.bdev = s->cow->bdev; 1952 dest.sector = bio->bi_iter.bi_sector; 1953 dest.count = s->store->chunk_size; 1954 1955 account_start_copy(s); 1956 WARN_ON_ONCE(bio->bi_private); 1957 bio->bi_private = s; 1958 dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio); 1959 } 1960 1961 static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio) 1962 { 1963 return bio->bi_iter.bi_size == 1964 (s->store->chunk_size << SECTOR_SHIFT); 1965 } 1966 1967 static int snapshot_map(struct dm_target *ti, struct bio *bio) 1968 { 1969 struct dm_exception *e; 1970 struct dm_snapshot *s = ti->private; 1971 int r = DM_MAPIO_REMAPPED; 1972 chunk_t chunk; 1973 struct dm_snap_pending_exception *pe = NULL; 1974 struct dm_exception_table_lock lock; 1975 1976 
init_tracked_chunk(bio); 1977 1978 if (bio->bi_opf & REQ_PREFLUSH) { 1979 bio_set_dev(bio, s->cow->bdev); 1980 return DM_MAPIO_REMAPPED; 1981 } 1982 1983 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); 1984 dm_exception_table_lock_init(s, chunk, &lock); 1985 1986 /* Full snapshots are not usable */ 1987 /* To get here the table must be live so s->active is always set. */ 1988 if (!s->valid) 1989 return DM_MAPIO_KILL; 1990 1991 if (bio_data_dir(bio) == WRITE) { 1992 while (unlikely(!wait_for_in_progress(s, false))) 1993 ; /* wait_for_in_progress() has slept */ 1994 } 1995 1996 down_read(&s->lock); 1997 dm_exception_table_lock(&lock); 1998 1999 if (!s->valid || (unlikely(s->snapshot_overflowed) && 2000 bio_data_dir(bio) == WRITE)) { 2001 r = DM_MAPIO_KILL; 2002 goto out_unlock; 2003 } 2004 2005 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 2006 if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) { 2007 /* 2008 * passdown discard to origin (without triggering 2009 * snapshot exceptions via do_origin; doing so would 2010 * defeat the goal of freeing space in origin that is 2011 * implied by the "discard_passdown_origin" feature) 2012 */ 2013 bio_set_dev(bio, s->origin->bdev); 2014 track_chunk(s, bio, chunk); 2015 goto out_unlock; 2016 } 2017 /* discard to snapshot (target_bio_nr == 0) zeroes exceptions */ 2018 } 2019 2020 /* If the block is already remapped - use that, else remap it */ 2021 e = dm_lookup_exception(&s->complete, chunk); 2022 if (e) { 2023 remap_exception(s, e, bio, chunk); 2024 if (unlikely(bio_op(bio) == REQ_OP_DISCARD) && 2025 io_overlaps_chunk(s, bio)) { 2026 dm_exception_table_unlock(&lock); 2027 up_read(&s->lock); 2028 zero_exception(s, e, bio, chunk); 2029 r = DM_MAPIO_SUBMITTED; /* discard is not issued */ 2030 goto out; 2031 } 2032 goto out_unlock; 2033 } 2034 2035 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 2036 /* 2037 * If no exception exists, complete discard immediately 2038 * otherwise it'll trigger copy-out. 
2039 */ 2040 bio_endio(bio); 2041 r = DM_MAPIO_SUBMITTED; 2042 goto out_unlock; 2043 } 2044 2045 /* 2046 * Write to snapshot - higher level takes care of RW/RO 2047 * flags so we should only get this if we are 2048 * writeable. 2049 */ 2050 if (bio_data_dir(bio) == WRITE) { 2051 pe = __lookup_pending_exception(s, chunk); 2052 if (!pe) { 2053 dm_exception_table_unlock(&lock); 2054 pe = alloc_pending_exception(s); 2055 dm_exception_table_lock(&lock); 2056 2057 e = dm_lookup_exception(&s->complete, chunk); 2058 if (e) { 2059 free_pending_exception(pe); 2060 remap_exception(s, e, bio, chunk); 2061 goto out_unlock; 2062 } 2063 2064 pe = __find_pending_exception(s, pe, chunk); 2065 if (!pe) { 2066 dm_exception_table_unlock(&lock); 2067 up_read(&s->lock); 2068 2069 down_write(&s->lock); 2070 2071 if (s->store->userspace_supports_overflow) { 2072 if (s->valid && !s->snapshot_overflowed) { 2073 s->snapshot_overflowed = 1; 2074 DMERR("Snapshot overflowed: Unable to allocate exception."); 2075 } 2076 } else 2077 __invalidate_snapshot(s, -ENOMEM); 2078 up_write(&s->lock); 2079 2080 r = DM_MAPIO_KILL; 2081 goto out; 2082 } 2083 } 2084 2085 remap_exception(s, &pe->e, bio, chunk); 2086 2087 r = DM_MAPIO_SUBMITTED; 2088 2089 if (!pe->started && io_overlaps_chunk(s, bio)) { 2090 pe->started = 1; 2091 2092 dm_exception_table_unlock(&lock); 2093 up_read(&s->lock); 2094 2095 start_full_bio(pe, bio); 2096 goto out; 2097 } 2098 2099 bio_list_add(&pe->snapshot_bios, bio); 2100 2101 if (!pe->started) { 2102 /* this is protected by the exception table lock */ 2103 pe->started = 1; 2104 2105 dm_exception_table_unlock(&lock); 2106 up_read(&s->lock); 2107 2108 start_copy(pe); 2109 goto out; 2110 } 2111 } else { 2112 bio_set_dev(bio, s->origin->bdev); 2113 track_chunk(s, bio, chunk); 2114 } 2115 2116 out_unlock: 2117 dm_exception_table_unlock(&lock); 2118 up_read(&s->lock); 2119 out: 2120 return r; 2121 } 2122 2123 /* 2124 * A snapshot-merge target behaves like a combination of a snapshot 2125 
* target and a snapshot-origin target. It only generates new 2126 * exceptions in other snapshots and not in the one that is being 2127 * merged. 2128 * 2129 * For each chunk, if there is an existing exception, it is used to 2130 * redirect I/O to the cow device. Otherwise I/O is sent to the origin, 2131 * which in turn might generate exceptions in other snapshots. 2132 * If merging is currently taking place on the chunk in question, the 2133 * I/O is deferred by adding it to s->bios_queued_during_merge. 2134 */ 2135 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) 2136 { 2137 struct dm_exception *e; 2138 struct dm_snapshot *s = ti->private; 2139 int r = DM_MAPIO_REMAPPED; 2140 chunk_t chunk; 2141 2142 init_tracked_chunk(bio); 2143 2144 if (bio->bi_opf & REQ_PREFLUSH) { 2145 if (!dm_bio_get_target_bio_nr(bio)) 2146 bio_set_dev(bio, s->origin->bdev); 2147 else 2148 bio_set_dev(bio, s->cow->bdev); 2149 return DM_MAPIO_REMAPPED; 2150 } 2151 2152 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 2153 /* Once merging, discards no longer effect change */ 2154 bio_endio(bio); 2155 return DM_MAPIO_SUBMITTED; 2156 } 2157 2158 chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); 2159 2160 down_write(&s->lock); 2161 2162 /* Full merging snapshots are redirected to the origin */ 2163 if (!s->valid) 2164 goto redirect_to_origin; 2165 2166 /* If the block is already remapped - use that */ 2167 e = dm_lookup_exception(&s->complete, chunk); 2168 if (e) { 2169 /* Queue writes overlapping with chunks being merged */ 2170 if (bio_data_dir(bio) == WRITE && 2171 chunk >= s->first_merging_chunk && 2172 chunk < (s->first_merging_chunk + 2173 s->num_merging_chunks)) { 2174 bio_set_dev(bio, s->origin->bdev); 2175 bio_list_add(&s->bios_queued_during_merge, bio); 2176 r = DM_MAPIO_SUBMITTED; 2177 goto out_unlock; 2178 } 2179 2180 remap_exception(s, e, bio, chunk); 2181 2182 if (bio_data_dir(bio) == WRITE) 2183 track_chunk(s, bio, chunk); 2184 goto out_unlock; 2185 } 
2186 2187 redirect_to_origin: 2188 bio_set_dev(bio, s->origin->bdev); 2189 2190 if (bio_data_dir(bio) == WRITE) { 2191 up_write(&s->lock); 2192 return do_origin(s->origin, bio, false); 2193 } 2194 2195 out_unlock: 2196 up_write(&s->lock); 2197 2198 return r; 2199 } 2200 2201 static int snapshot_end_io(struct dm_target *ti, struct bio *bio, 2202 blk_status_t *error) 2203 { 2204 struct dm_snapshot *s = ti->private; 2205 2206 if (is_bio_tracked(bio)) 2207 stop_tracking_chunk(s, bio); 2208 2209 return DM_ENDIO_DONE; 2210 } 2211 2212 static void snapshot_merge_presuspend(struct dm_target *ti) 2213 { 2214 struct dm_snapshot *s = ti->private; 2215 2216 stop_merge(s); 2217 } 2218 2219 static int snapshot_preresume(struct dm_target *ti) 2220 { 2221 int r = 0; 2222 struct dm_snapshot *s = ti->private; 2223 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 2224 2225 down_read(&_origins_lock); 2226 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 2227 if (snap_src && snap_dest) { 2228 down_read(&snap_src->lock); 2229 if (s == snap_src) { 2230 DMERR("Unable to resume snapshot source until " 2231 "handover completes."); 2232 r = -EINVAL; 2233 } else if (!dm_suspended(snap_src->ti)) { 2234 DMERR("Unable to perform snapshot handover until " 2235 "source is suspended."); 2236 r = -EINVAL; 2237 } 2238 up_read(&snap_src->lock); 2239 } 2240 up_read(&_origins_lock); 2241 2242 return r; 2243 } 2244 2245 static void snapshot_resume(struct dm_target *ti) 2246 { 2247 struct dm_snapshot *s = ti->private; 2248 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL; 2249 struct dm_origin *o; 2250 struct mapped_device *origin_md = NULL; 2251 bool must_restart_merging = false; 2252 2253 down_read(&_origins_lock); 2254 2255 o = __lookup_dm_origin(s->origin->bdev); 2256 if (o) 2257 origin_md = dm_table_get_md(o->ti->table); 2258 if (!origin_md) { 2259 (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging); 2260 if (snap_merging) 2261 
origin_md = dm_table_get_md(snap_merging->ti->table); 2262 } 2263 if (origin_md == dm_table_get_md(ti->table)) 2264 origin_md = NULL; 2265 if (origin_md) { 2266 if (dm_hold(origin_md)) 2267 origin_md = NULL; 2268 } 2269 2270 up_read(&_origins_lock); 2271 2272 if (origin_md) { 2273 dm_internal_suspend_fast(origin_md); 2274 if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) { 2275 must_restart_merging = true; 2276 stop_merge(snap_merging); 2277 } 2278 } 2279 2280 down_read(&_origins_lock); 2281 2282 (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); 2283 if (snap_src && snap_dest) { 2284 down_write(&snap_src->lock); 2285 down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); 2286 __handover_exceptions(snap_src, snap_dest); 2287 up_write(&snap_dest->lock); 2288 up_write(&snap_src->lock); 2289 } 2290 2291 up_read(&_origins_lock); 2292 2293 if (origin_md) { 2294 if (must_restart_merging) 2295 start_merge(snap_merging); 2296 dm_internal_resume_fast(origin_md); 2297 dm_put(origin_md); 2298 } 2299 2300 /* Now we have correct chunk size, reregister */ 2301 reregister_snapshot(s); 2302 2303 down_write(&s->lock); 2304 s->active = 1; 2305 up_write(&s->lock); 2306 } 2307 2308 static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) 2309 { 2310 uint32_t min_chunksize; 2311 2312 down_read(&_origins_lock); 2313 min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); 2314 up_read(&_origins_lock); 2315 2316 return min_chunksize; 2317 } 2318 2319 static void snapshot_merge_resume(struct dm_target *ti) 2320 { 2321 struct dm_snapshot *s = ti->private; 2322 2323 /* 2324 * Handover exceptions from existing snapshot. 
2325 */ 2326 snapshot_resume(ti); 2327 2328 /* 2329 * snapshot-merge acts as an origin, so set ti->max_io_len 2330 */ 2331 ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev); 2332 2333 start_merge(s); 2334 } 2335 2336 static void snapshot_status(struct dm_target *ti, status_type_t type, 2337 unsigned status_flags, char *result, unsigned maxlen) 2338 { 2339 unsigned sz = 0; 2340 struct dm_snapshot *snap = ti->private; 2341 unsigned num_features; 2342 2343 switch (type) { 2344 case STATUSTYPE_INFO: 2345 2346 down_write(&snap->lock); 2347 2348 if (!snap->valid) 2349 DMEMIT("Invalid"); 2350 else if (snap->merge_failed) 2351 DMEMIT("Merge failed"); 2352 else if (snap->snapshot_overflowed) 2353 DMEMIT("Overflow"); 2354 else { 2355 if (snap->store->type->usage) { 2356 sector_t total_sectors, sectors_allocated, 2357 metadata_sectors; 2358 snap->store->type->usage(snap->store, 2359 &total_sectors, 2360 §ors_allocated, 2361 &metadata_sectors); 2362 DMEMIT("%llu/%llu %llu", 2363 (unsigned long long)sectors_allocated, 2364 (unsigned long long)total_sectors, 2365 (unsigned long long)metadata_sectors); 2366 } 2367 else 2368 DMEMIT("Unknown"); 2369 } 2370 2371 up_write(&snap->lock); 2372 2373 break; 2374 2375 case STATUSTYPE_TABLE: 2376 /* 2377 * kdevname returns a static pointer so we need 2378 * to make private copies if the output is to 2379 * make sense. 
2380 */ 2381 DMEMIT("%s %s", snap->origin->name, snap->cow->name); 2382 sz += snap->store->type->status(snap->store, type, result + sz, 2383 maxlen - sz); 2384 num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin; 2385 if (num_features) { 2386 DMEMIT(" %u", num_features); 2387 if (snap->discard_zeroes_cow) 2388 DMEMIT(" discard_zeroes_cow"); 2389 if (snap->discard_passdown_origin) 2390 DMEMIT(" discard_passdown_origin"); 2391 } 2392 break; 2393 } 2394 } 2395 2396 static int snapshot_iterate_devices(struct dm_target *ti, 2397 iterate_devices_callout_fn fn, void *data) 2398 { 2399 struct dm_snapshot *snap = ti->private; 2400 int r; 2401 2402 r = fn(ti, snap->origin, 0, ti->len, data); 2403 2404 if (!r) 2405 r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); 2406 2407 return r; 2408 } 2409 2410 static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits) 2411 { 2412 struct dm_snapshot *snap = ti->private; 2413 2414 if (snap->discard_zeroes_cow) { 2415 struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; 2416 2417 down_read(&_origins_lock); 2418 2419 (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL); 2420 if (snap_src && snap_dest) 2421 snap = snap_src; 2422 2423 /* All discards are split on chunk_size boundary */ 2424 limits->discard_granularity = snap->store->chunk_size; 2425 limits->max_discard_sectors = snap->store->chunk_size; 2426 2427 up_read(&_origins_lock); 2428 } 2429 } 2430 2431 /*----------------------------------------------------------------- 2432 * Origin methods 2433 *---------------------------------------------------------------*/ 2434 2435 /* 2436 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any 2437 * supplied bio was ignored. The caller may submit it immediately. 2438 * (No remapping actually occurs as the origin is always a direct linear 2439 * map.) 
*
 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
 * and any supplied bio is added to a list to be submitted once all
 * the necessary exceptions exist.
 *
 * @snapshots: list of snapshots (linked through ->list) to process.
 * @sector:    origin sector being written.
 * @bio:       the origin write, or NULL when called only to trigger
 *             exceptions (see origin_write_extent()).
 */
static int __origin_write(struct list_head *snapshots, sector_t sector,
			  struct bio *bio)
{
	int r = DM_MAPIO_REMAPPED;
	struct dm_snapshot *snap;
	struct dm_exception *e;
	struct dm_snap_pending_exception *pe, *pe2;
	struct dm_snap_pending_exception *pe_to_start_now = NULL;
	struct dm_snap_pending_exception *pe_to_start_last = NULL;
	struct dm_exception_table_lock lock;
	chunk_t chunk;

	/* Do all the snapshots on this origin */
	list_for_each_entry (snap, snapshots, list) {
		/*
		 * Don't make new exceptions in a merging snapshot
		 * because it has effectively been deleted
		 */
		if (dm_target_is_snapshot_merge(snap->ti))
			continue;

		/* Nothing to do if writing beyond end of snapshot */
		if (sector >= dm_table_get_size(snap->ti->table))
			continue;

		/*
		 * Remember, different snapshots can have
		 * different chunk sizes.
		 */
		chunk = sector_to_chunk(snap->store, sector);
		dm_exception_table_lock_init(snap, chunk, &lock);

		/* Lock order: snap->lock (read) first, then the table lock. */
		down_read(&snap->lock);
		dm_exception_table_lock(&lock);

		/* Only deal with valid and active snapshots */
		if (!snap->valid || !snap->active)
			goto next_snapshot;

		pe = __lookup_pending_exception(snap, chunk);
		if (!pe) {
			/*
			 * Check exception table to see if block is already
			 * remapped in this snapshot and trigger an exception
			 * if not.
			 */
			e = dm_lookup_exception(&snap->complete, chunk);
			if (e)
				goto next_snapshot;

			/*
			 * The table lock is dropped around the allocation
			 * (presumably because it may sleep -- confirm against
			 * alloc_pending_exception()), so both lookups are
			 * repeated once the lock is retaken.
			 */
			dm_exception_table_unlock(&lock);
			pe = alloc_pending_exception(snap);
			dm_exception_table_lock(&lock);

			pe2 = __lookup_pending_exception(snap, chunk);

			if (!pe2) {
				e = dm_lookup_exception(&snap->complete, chunk);
				if (e) {
					/* Chunk completed meanwhile: discard ours. */
					free_pending_exception(pe);
					goto next_snapshot;
				}

				pe = __insert_pending_exception(snap, pe, chunk);
				if (!pe) {
					/*
					 * Release both locks here because the
					 * 'continue' below skips the unlock at
					 * next_snapshot.
					 */
					dm_exception_table_unlock(&lock);
					up_read(&snap->lock);

					invalidate_snapshot(snap, -ENOMEM);
					continue;
				}
			} else {
				/* Someone else inserted one meanwhile: use theirs. */
				free_pending_exception(pe);
				pe = pe2;
			}
		}

		r = DM_MAPIO_SUBMITTED;

		/*
		 * If an origin bio was supplied, queue it to wait for the
		 * completion of this exception, and start this one last,
		 * at the end of the function.
		 */
		if (bio) {
			bio_list_add(&pe->origin_bios, bio);
			/* The bio is queued on one pending exception only. */
			bio = NULL;

			if (!pe->started) {
				pe->started = 1;
				pe_to_start_last = pe;
			}
		}

		if (!pe->started) {
			pe->started = 1;
			pe_to_start_now = pe;
		}

next_snapshot:
		dm_exception_table_unlock(&lock);
		up_read(&snap->lock);

		/* start_copy() is only called after both locks are released. */
		if (pe_to_start_now) {
			start_copy(pe_to_start_now);
			pe_to_start_now = NULL;
		}
	}

	/*
	 * Submit the exception against which the bio is queued last,
	 * to give the other exceptions a head start.
	 */
	if (pe_to_start_last)
		start_copy(pe_to_start_last);

	return r;
}

/*
 * Called on a write from the origin driver.
2566 */ 2567 static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit) 2568 { 2569 struct origin *o; 2570 int r = DM_MAPIO_REMAPPED; 2571 2572 again: 2573 down_read(&_origins_lock); 2574 o = __lookup_origin(origin->bdev); 2575 if (o) { 2576 if (limit) { 2577 struct dm_snapshot *s; 2578 list_for_each_entry(s, &o->snapshots, list) 2579 if (unlikely(!wait_for_in_progress(s, true))) 2580 goto again; 2581 } 2582 2583 r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); 2584 } 2585 up_read(&_origins_lock); 2586 2587 return r; 2588 } 2589 2590 /* 2591 * Trigger exceptions in all non-merging snapshots. 2592 * 2593 * The chunk size of the merging snapshot may be larger than the chunk 2594 * size of some other snapshot so we may need to reallocate multiple 2595 * chunks in other snapshots. 2596 * 2597 * We scan all the overlapping exceptions in the other snapshots. 2598 * Returns 1 if anything was reallocated and must be waited for, 2599 * otherwise returns 0. 2600 * 2601 * size must be a multiple of merging_snap's chunk_size. 2602 */ 2603 static int origin_write_extent(struct dm_snapshot *merging_snap, 2604 sector_t sector, unsigned size) 2605 { 2606 int must_wait = 0; 2607 sector_t n; 2608 struct origin *o; 2609 2610 /* 2611 * The origin's __minimum_chunk_size() got stored in max_io_len 2612 * by snapshot_merge_resume(). 2613 */ 2614 down_read(&_origins_lock); 2615 o = __lookup_origin(merging_snap->origin->bdev); 2616 for (n = 0; n < size; n += merging_snap->ti->max_io_len) 2617 if (__origin_write(&o->snapshots, sector + n, NULL) == 2618 DM_MAPIO_SUBMITTED) 2619 must_wait = 1; 2620 up_read(&_origins_lock); 2621 2622 return must_wait; 2623 } 2624 2625 /* 2626 * Origin: maps a linear range of a device, with hooks for snapshotting. 2627 */ 2628 2629 /* 2630 * Construct an origin mapping: <dev_path> 2631 * The context for an origin is merely a 'struct dm_dev *' 2632 * pointing to the real device. 
2633 */ 2634 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2635 { 2636 int r; 2637 struct dm_origin *o; 2638 2639 if (argc != 1) { 2640 ti->error = "origin: incorrect number of arguments"; 2641 return -EINVAL; 2642 } 2643 2644 o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL); 2645 if (!o) { 2646 ti->error = "Cannot allocate private origin structure"; 2647 r = -ENOMEM; 2648 goto bad_alloc; 2649 } 2650 2651 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev); 2652 if (r) { 2653 ti->error = "Cannot get target device"; 2654 goto bad_open; 2655 } 2656 2657 o->ti = ti; 2658 ti->private = o; 2659 ti->num_flush_bios = 1; 2660 2661 return 0; 2662 2663 bad_open: 2664 kfree(o); 2665 bad_alloc: 2666 return r; 2667 } 2668 2669 static void origin_dtr(struct dm_target *ti) 2670 { 2671 struct dm_origin *o = ti->private; 2672 2673 dm_put_device(ti, o->dev); 2674 kfree(o); 2675 } 2676 2677 static int origin_map(struct dm_target *ti, struct bio *bio) 2678 { 2679 struct dm_origin *o = ti->private; 2680 unsigned available_sectors; 2681 2682 bio_set_dev(bio, o->dev->bdev); 2683 2684 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) 2685 return DM_MAPIO_REMAPPED; 2686 2687 if (bio_data_dir(bio) != WRITE) 2688 return DM_MAPIO_REMAPPED; 2689 2690 available_sectors = o->split_boundary - 2691 ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1)); 2692 2693 if (bio_sectors(bio) > available_sectors) 2694 dm_accept_partial_bio(bio, available_sectors); 2695 2696 /* Only tell snapshots if this is a write */ 2697 return do_origin(o->dev, bio, true); 2698 } 2699 2700 /* 2701 * Set the target "max_io_len" field to the minimum of all the snapshots' 2702 * chunk sizes. 
2703 */ 2704 static void origin_resume(struct dm_target *ti) 2705 { 2706 struct dm_origin *o = ti->private; 2707 2708 o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev); 2709 2710 down_write(&_origins_lock); 2711 __insert_dm_origin(o); 2712 up_write(&_origins_lock); 2713 } 2714 2715 static void origin_postsuspend(struct dm_target *ti) 2716 { 2717 struct dm_origin *o = ti->private; 2718 2719 down_write(&_origins_lock); 2720 __remove_dm_origin(o); 2721 up_write(&_origins_lock); 2722 } 2723 2724 static void origin_status(struct dm_target *ti, status_type_t type, 2725 unsigned status_flags, char *result, unsigned maxlen) 2726 { 2727 struct dm_origin *o = ti->private; 2728 2729 switch (type) { 2730 case STATUSTYPE_INFO: 2731 result[0] = '\0'; 2732 break; 2733 2734 case STATUSTYPE_TABLE: 2735 snprintf(result, maxlen, "%s", o->dev->name); 2736 break; 2737 } 2738 } 2739 2740 static int origin_iterate_devices(struct dm_target *ti, 2741 iterate_devices_callout_fn fn, void *data) 2742 { 2743 struct dm_origin *o = ti->private; 2744 2745 return fn(ti, o->dev, 0, ti->len, data); 2746 } 2747 2748 static struct target_type origin_target = { 2749 .name = "snapshot-origin", 2750 .version = {1, 9, 0}, 2751 .module = THIS_MODULE, 2752 .ctr = origin_ctr, 2753 .dtr = origin_dtr, 2754 .map = origin_map, 2755 .resume = origin_resume, 2756 .postsuspend = origin_postsuspend, 2757 .status = origin_status, 2758 .iterate_devices = origin_iterate_devices, 2759 }; 2760 2761 static struct target_type snapshot_target = { 2762 .name = "snapshot", 2763 .version = {1, 16, 0}, 2764 .module = THIS_MODULE, 2765 .ctr = snapshot_ctr, 2766 .dtr = snapshot_dtr, 2767 .map = snapshot_map, 2768 .end_io = snapshot_end_io, 2769 .preresume = snapshot_preresume, 2770 .resume = snapshot_resume, 2771 .status = snapshot_status, 2772 .iterate_devices = snapshot_iterate_devices, 2773 .io_hints = snapshot_io_hints, 2774 }; 2775 2776 static struct target_type merge_target = { 2777 .name = 
dm_snapshot_merge_target_name, 2778 .version = {1, 5, 0}, 2779 .module = THIS_MODULE, 2780 .ctr = snapshot_ctr, 2781 .dtr = snapshot_dtr, 2782 .map = snapshot_merge_map, 2783 .end_io = snapshot_end_io, 2784 .presuspend = snapshot_merge_presuspend, 2785 .preresume = snapshot_preresume, 2786 .resume = snapshot_merge_resume, 2787 .status = snapshot_status, 2788 .iterate_devices = snapshot_iterate_devices, 2789 .io_hints = snapshot_io_hints, 2790 }; 2791 2792 static int __init dm_snapshot_init(void) 2793 { 2794 int r; 2795 2796 r = dm_exception_store_init(); 2797 if (r) { 2798 DMERR("Failed to initialize exception stores"); 2799 return r; 2800 } 2801 2802 r = init_origin_hash(); 2803 if (r) { 2804 DMERR("init_origin_hash failed."); 2805 goto bad_origin_hash; 2806 } 2807 2808 exception_cache = KMEM_CACHE(dm_exception, 0); 2809 if (!exception_cache) { 2810 DMERR("Couldn't create exception cache."); 2811 r = -ENOMEM; 2812 goto bad_exception_cache; 2813 } 2814 2815 pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); 2816 if (!pending_cache) { 2817 DMERR("Couldn't create pending cache."); 2818 r = -ENOMEM; 2819 goto bad_pending_cache; 2820 } 2821 2822 r = dm_register_target(&snapshot_target); 2823 if (r < 0) { 2824 DMERR("snapshot target register failed %d", r); 2825 goto bad_register_snapshot_target; 2826 } 2827 2828 r = dm_register_target(&origin_target); 2829 if (r < 0) { 2830 DMERR("Origin target register failed %d", r); 2831 goto bad_register_origin_target; 2832 } 2833 2834 r = dm_register_target(&merge_target); 2835 if (r < 0) { 2836 DMERR("Merge target register failed %d", r); 2837 goto bad_register_merge_target; 2838 } 2839 2840 return 0; 2841 2842 bad_register_merge_target: 2843 dm_unregister_target(&origin_target); 2844 bad_register_origin_target: 2845 dm_unregister_target(&snapshot_target); 2846 bad_register_snapshot_target: 2847 kmem_cache_destroy(pending_cache); 2848 bad_pending_cache: 2849 kmem_cache_destroy(exception_cache); 2850 bad_exception_cache: 
2851 exit_origin_hash(); 2852 bad_origin_hash: 2853 dm_exception_store_exit(); 2854 2855 return r; 2856 } 2857 2858 static void __exit dm_snapshot_exit(void) 2859 { 2860 dm_unregister_target(&snapshot_target); 2861 dm_unregister_target(&origin_target); 2862 dm_unregister_target(&merge_target); 2863 2864 exit_origin_hash(); 2865 kmem_cache_destroy(pending_cache); 2866 kmem_cache_destroy(exception_cache); 2867 2868 dm_exception_store_exit(); 2869 } 2870 2871 /* Module hooks */ 2872 module_init(dm_snapshot_init); 2873 module_exit(dm_snapshot_exit); 2874 2875 MODULE_DESCRIPTION(DM_NAME " snapshot target"); 2876 MODULE_AUTHOR("Joe Thornber"); 2877 MODULE_LICENSE("GPL"); 2878 MODULE_ALIAS("dm-snapshot-origin"); 2879 MODULE_ALIAS("dm-snapshot-merge"); 2880