1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison-v2.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/rwsem.h> 19 #include <linux/slab.h> 20 #include <linux/vmalloc.h> 21 22 #define DM_MSG_PREFIX "cache" 23 24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27 /*----------------------------------------------------------------*/ 28 29 /* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40 /*----------------------------------------------------------------*/ 41 42 struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56 }; 57 58 static void iot_init(struct io_tracker *iot) 59 { 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64 } 65 66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67 { 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72 } 73 74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75 { 76 bool r; 77 unsigned long flags; 78 79 spin_lock_irqsave(&iot->lock, flags); 80 r = __iot_idle_for(iot, jifs); 81 spin_unlock_irqrestore(&iot->lock, flags); 82 83 return r; 84 } 85 86 static void iot_io_begin(struct io_tracker *iot, sector_t len) 87 { 88 unsigned long flags; 89 90 spin_lock_irqsave(&iot->lock, flags); 91 iot->in_flight += len; 92 spin_unlock_irqrestore(&iot->lock, flags); 93 } 94 95 static void __iot_io_end(struct io_tracker *iot, sector_t len) 96 { 97 iot->in_flight -= len; 98 if (!iot->in_flight) 99 iot->idle_time = jiffies; 100 } 101 102 static void iot_io_end(struct io_tracker *iot, sector_t len) 103 { 104 unsigned long flags; 105 106 spin_lock_irqsave(&iot->lock, flags); 107 __iot_io_end(iot, len); 108 spin_unlock_irqrestore(&iot->lock, flags); 109 } 110 111 /*----------------------------------------------------------------*/ 112 113 /* 114 * Represents a chunk of future work. 'input' allows continuations to pass 115 * values between themselves, typically error values. 116 */ 117 struct continuation { 118 struct work_struct ws; 119 int input; 120 }; 121 122 static inline void init_continuation(struct continuation *k, 123 void (*fn)(struct work_struct *)) 124 { 125 INIT_WORK(&k->ws, fn); 126 k->input = 0; 127 } 128 129 static inline void queue_continuation(struct workqueue_struct *wq, 130 struct continuation *k) 131 { 132 queue_work(wq, &k->ws); 133 } 134 135 /*----------------------------------------------------------------*/ 136 137 /* 138 * The batcher collects together pieces of work that need a particular 139 * operation to occur before they can proceed (typically a commit). 
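 *
 * Roughly: callers park a continuation with continue_after_commit() or a
 * bio with issue_after_commit(), then kick schedule_commit(). Once
 * __commit() has run the commit_op, parked continuations are requeued on
 * 'wq' and parked bios are either issued via issue_op or errored if the
 * commit failed.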
140 */ 141 struct batcher { 142 /* 143 * The operation that everyone is waiting for. 144 */ 145 int (*commit_op)(void *context); 146 void *commit_context; 147 148 /* 149 * This is how bios should be issued once the commit op is complete 150 * (accounted_request). 151 */ 152 void (*issue_op)(struct bio *bio, void *context); 153 void *issue_context; 154 155 /* 156 * Queued work gets put on here after commit. 157 */ 158 struct workqueue_struct *wq; 159 160 spinlock_t lock; 161 struct list_head work_items; 162 struct bio_list bios; 163 struct work_struct commit_work; 164 165 bool commit_scheduled; 166 }; 167 168 static void __commit(struct work_struct *_ws) 169 { 170 struct batcher *b = container_of(_ws, struct batcher, commit_work); 171 172 int r; 173 unsigned long flags; 174 struct list_head work_items; 175 struct work_struct *ws, *tmp; 176 struct continuation *k; 177 struct bio *bio; 178 struct bio_list bios; 179 180 INIT_LIST_HEAD(&work_items); 181 bio_list_init(&bios); 182 183 /* 184 * We have to grab these before the commit_op to avoid a race 185 * condition. 186 */ 187 spin_lock_irqsave(&b->lock, flags); 188 list_splice_init(&b->work_items, &work_items); 189 bio_list_merge(&bios, &b->bios); 190 bio_list_init(&b->bios); 191 b->commit_scheduled = false; 192 spin_unlock_irqrestore(&b->lock, flags); 193 194 r = b->commit_op(b->commit_context); 195 196 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 197 k = container_of(ws, struct continuation, ws); 198 k->input = r; 199 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 200 queue_work(b->wq, ws); 201 } 202 203 while ((bio = bio_list_pop(&bios))) { 204 if (r) { 205 bio->bi_error = r; 206 bio_endio(bio); 207 } else 208 b->issue_op(bio, b->issue_context); 209 } 210 } 211 212 static void batcher_init(struct batcher *b, 213 int (*commit_op)(void *), 214 void *commit_context, 215 void (*issue_op)(struct bio *bio, void *), 216 void *issue_context, 217 struct workqueue_struct *wq) 218 { 219 b->commit_op = commit_op; 220 b->commit_context = commit_context; 221 b->issue_op = issue_op; 222 b->issue_context = issue_context; 223 b->wq = wq; 224 225 spin_lock_init(&b->lock); 226 INIT_LIST_HEAD(&b->work_items); 227 bio_list_init(&b->bios); 228 INIT_WORK(&b->commit_work, __commit); 229 b->commit_scheduled = false; 230 } 231 232 static void async_commit(struct batcher *b) 233 { 234 queue_work(b->wq, &b->commit_work); 235 } 236 237 static void continue_after_commit(struct batcher *b, struct continuation *k) 238 { 239 unsigned long flags; 240 bool commit_scheduled; 241 242 spin_lock_irqsave(&b->lock, flags); 243 commit_scheduled = b->commit_scheduled; 244 list_add_tail(&k->ws.entry, &b->work_items); 245 spin_unlock_irqrestore(&b->lock, flags); 246 247 if (commit_scheduled) 248 async_commit(b); 249 } 250 251 /* 252 * Bios are errored if commit failed. 253 */ 254 static void issue_after_commit(struct batcher *b, struct bio *bio) 255 { 256 unsigned long flags; 257 bool commit_scheduled; 258 259 spin_lock_irqsave(&b->lock, flags); 260 commit_scheduled = b->commit_scheduled; 261 bio_list_add(&b->bios, bio); 262 spin_unlock_irqrestore(&b->lock, flags); 263 264 if (commit_scheduled) 265 async_commit(b); 266 } 267 268 /* 269 * Call this if some urgent work is waiting for the commit to complete. 
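 * It is also kicked periodically from do_waker() so that metadata does not
 * sit uncommitted for too long.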
 */
static void schedule_commit(struct batcher *b)
{
	bool immediate;
	unsigned long flags;

	spin_lock_irqsave(&b->lock, flags);
	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
	b->commit_scheduled = true;
	spin_unlock_irqrestore(&b->lock, flags);

	if (immediate)
		async_commit(b);
}

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function. We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only. These blocks are marked
	 * dirty. If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin. Blocks are never
	 * dirty. Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots). Reads and writes always go to the
	 * origin. If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned metadata_version;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices. Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices. Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
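	 * sectors_per_block_shift is only used when the block size is a
	 * power of two (see block_size_is_power_of_two() below).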
404 */ 405 sector_t sectors_per_block; 406 int sectors_per_block_shift; 407 408 spinlock_t lock; 409 struct list_head deferred_cells; 410 struct bio_list deferred_bios; 411 struct bio_list deferred_writethrough_bios; 412 sector_t migration_threshold; 413 wait_queue_head_t migration_wait; 414 atomic_t nr_allocated_migrations; 415 416 /* 417 * The number of in flight migrations that are performing 418 * background io. eg, promotion, writeback. 419 */ 420 atomic_t nr_io_migrations; 421 422 struct rw_semaphore quiesce_lock; 423 424 /* 425 * cache_size entries, dirty if set 426 */ 427 atomic_t nr_dirty; 428 unsigned long *dirty_bitset; 429 430 /* 431 * origin_blocks entries, discarded if set. 432 */ 433 dm_dblock_t discard_nr_blocks; 434 unsigned long *discard_bitset; 435 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 436 437 /* 438 * Rather than reconstructing the table line for the status we just 439 * save it and regurgitate. 440 */ 441 unsigned nr_ctr_args; 442 const char **ctr_args; 443 444 struct dm_kcopyd_client *copier; 445 struct workqueue_struct *wq; 446 struct work_struct deferred_bio_worker; 447 struct work_struct deferred_writethrough_worker; 448 struct work_struct migration_worker; 449 struct delayed_work waker; 450 struct dm_bio_prison_v2 *prison; 451 452 mempool_t *migration_pool; 453 454 struct dm_cache_policy *policy; 455 unsigned policy_nr_args; 456 457 bool need_tick_bio:1; 458 bool sized:1; 459 bool invalidate:1; 460 bool commit_requested:1; 461 bool loaded_mappings:1; 462 bool loaded_discards:1; 463 464 /* 465 * Cache features such as write-through. 466 */ 467 struct cache_features features; 468 469 struct cache_stats stats; 470 471 /* 472 * Invalidation fields. 473 */ 474 spinlock_t invalidation_lock; 475 struct list_head invalidation_requests; 476 477 struct io_tracker origin_tracker; 478 479 struct work_struct commit_ws; 480 struct batcher committer; 481 482 struct rw_semaphore background_work_lock; 483 }; 484 485 struct per_bio_data { 486 bool tick:1; 487 unsigned req_nr:2; 488 struct dm_bio_prison_cell_v2 *cell; 489 struct dm_hook_info hook_info; 490 sector_t len; 491 492 /* 493 * writethrough fields. These MUST remain at the end of this 494 * structure and the 'cache' member must be the first as it 495 * is used to determine the offset of the writethrough fields. 
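	 * (PB_DATA_SIZE_WB below is defined as offsetof(struct per_bio_data,
	 * cache) for exactly this reason.)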
496 */ 497 struct cache *cache; 498 dm_cblock_t cblock; 499 struct dm_bio_details bio_details; 500 }; 501 502 struct dm_cache_migration { 503 struct continuation k; 504 struct cache *cache; 505 506 struct policy_work *op; 507 struct bio *overwrite_bio; 508 struct dm_bio_prison_cell_v2 *cell; 509 510 dm_cblock_t invalidate_cblock; 511 dm_oblock_t invalidate_oblock; 512 }; 513 514 /*----------------------------------------------------------------*/ 515 516 static bool writethrough_mode(struct cache_features *f) 517 { 518 return f->io_mode == CM_IO_WRITETHROUGH; 519 } 520 521 static bool writeback_mode(struct cache_features *f) 522 { 523 return f->io_mode == CM_IO_WRITEBACK; 524 } 525 526 static inline bool passthrough_mode(struct cache_features *f) 527 { 528 return unlikely(f->io_mode == CM_IO_PASSTHROUGH); 529 } 530 531 /*----------------------------------------------------------------*/ 532 533 static void wake_deferred_bio_worker(struct cache *cache) 534 { 535 queue_work(cache->wq, &cache->deferred_bio_worker); 536 } 537 538 static void wake_deferred_writethrough_worker(struct cache *cache) 539 { 540 queue_work(cache->wq, &cache->deferred_writethrough_worker); 541 } 542 543 static void wake_migration_worker(struct cache *cache) 544 { 545 if (passthrough_mode(&cache->features)) 546 return; 547 548 queue_work(cache->wq, &cache->migration_worker); 549 } 550 551 /*----------------------------------------------------------------*/ 552 553 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 554 { 555 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); 556 } 557 558 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 559 { 560 dm_bio_prison_free_cell_v2(cache->prison, cell); 561 } 562 563 static struct dm_cache_migration *alloc_migration(struct cache *cache) 564 { 565 struct dm_cache_migration *mg; 566 567 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 568 if (mg) { 569 mg->cache = cache; 570 atomic_inc(&mg->cache->nr_allocated_migrations); 571 } 572 573 return mg; 574 } 575 576 static void free_migration(struct dm_cache_migration *mg) 577 { 578 struct cache *cache = mg->cache; 579 580 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 581 wake_up(&cache->migration_wait); 582 583 mempool_free(mg, cache->migration_pool); 584 } 585 586 /*----------------------------------------------------------------*/ 587 588 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 589 { 590 return to_oblock(from_oblock(b) + 1ull); 591 } 592 593 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 594 { 595 key->virtual = 0; 596 key->dev = 0; 597 key->block_begin = from_oblock(begin); 598 key->block_end = from_oblock(end); 599 } 600 601 /* 602 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 603 * level 1 which prevents *both* READs and WRITEs. 604 */ 605 #define WRITE_LOCK_LEVEL 0 606 #define READ_WRITE_LOCK_LEVEL 1 607 608 static unsigned lock_level(struct bio *bio) 609 { 610 return bio_data_dir(bio) == WRITE ? 611 WRITE_LOCK_LEVEL : 612 READ_WRITE_LOCK_LEVEL; 613 } 614 615 /*---------------------------------------------------------------- 616 * Per bio data 617 *--------------------------------------------------------------*/ 618 619 /* 620 * If using writeback, leave out struct per_bio_data's writethrough fields. 
621 */ 622 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 623 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 624 625 static size_t get_per_bio_data_size(struct cache *cache) 626 { 627 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 628 } 629 630 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 631 { 632 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 633 BUG_ON(!pb); 634 return pb; 635 } 636 637 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 638 { 639 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 640 641 pb->tick = false; 642 pb->req_nr = dm_bio_get_target_bio_nr(bio); 643 pb->cell = NULL; 644 pb->len = 0; 645 646 return pb; 647 } 648 649 /*----------------------------------------------------------------*/ 650 651 static void defer_bio(struct cache *cache, struct bio *bio) 652 { 653 unsigned long flags; 654 655 spin_lock_irqsave(&cache->lock, flags); 656 bio_list_add(&cache->deferred_bios, bio); 657 spin_unlock_irqrestore(&cache->lock, flags); 658 659 wake_deferred_bio_worker(cache); 660 } 661 662 static void defer_bios(struct cache *cache, struct bio_list *bios) 663 { 664 unsigned long flags; 665 666 spin_lock_irqsave(&cache->lock, flags); 667 bio_list_merge(&cache->deferred_bios, bios); 668 bio_list_init(bios); 669 spin_unlock_irqrestore(&cache->lock, flags); 670 671 wake_deferred_bio_worker(cache); 672 } 673 674 /*----------------------------------------------------------------*/ 675 676 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 677 { 678 bool r; 679 size_t pb_size; 680 struct per_bio_data *pb; 681 struct dm_cell_key_v2 key; 682 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 683 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 684 685 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 686 if (!cell_prealloc) { 687 defer_bio(cache, bio); 688 return false; 689 } 690 691 build_key(oblock, end, &key); 692 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 693 if (!r) { 694 /* 695 * Failed to get the lock. 696 */ 697 free_prison_cell(cache, cell_prealloc); 698 return r; 699 } 700 701 if (cell != cell_prealloc) 702 free_prison_cell(cache, cell_prealloc); 703 704 pb_size = get_per_bio_data_size(cache); 705 pb = get_per_bio_data(bio, pb_size); 706 pb->cell = cell; 707 708 return r; 709 } 710 711 /*----------------------------------------------------------------*/ 712 713 static bool is_dirty(struct cache *cache, dm_cblock_t b) 714 { 715 return test_bit(from_cblock(b), cache->dirty_bitset); 716 } 717 718 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 719 { 720 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 721 atomic_inc(&cache->nr_dirty); 722 policy_set_dirty(cache->policy, cblock); 723 } 724 } 725 726 /* 727 * These two are called when setting after migrations to force the policy 728 * and dirty bitset to be in sync. 
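 * Unlike set_dirty() above, they call into the policy unconditionally,
 * even if the corresponding bit in the bitset was already in the
 * requested state.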
729 */ 730 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 731 { 732 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 733 atomic_inc(&cache->nr_dirty); 734 policy_set_dirty(cache->policy, cblock); 735 } 736 737 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 738 { 739 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 740 if (atomic_dec_return(&cache->nr_dirty) == 0) 741 dm_table_event(cache->ti->table); 742 } 743 744 policy_clear_dirty(cache->policy, cblock); 745 } 746 747 /*----------------------------------------------------------------*/ 748 749 static bool block_size_is_power_of_two(struct cache *cache) 750 { 751 return cache->sectors_per_block_shift >= 0; 752 } 753 754 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 755 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 756 __always_inline 757 #endif 758 static dm_block_t block_div(dm_block_t b, uint32_t n) 759 { 760 do_div(b, n); 761 762 return b; 763 } 764 765 static dm_block_t oblocks_per_dblock(struct cache *cache) 766 { 767 dm_block_t oblocks = cache->discard_block_size; 768 769 if (block_size_is_power_of_two(cache)) 770 oblocks >>= cache->sectors_per_block_shift; 771 else 772 oblocks = block_div(oblocks, cache->sectors_per_block); 773 774 return oblocks; 775 } 776 777 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 778 { 779 return to_dblock(block_div(from_oblock(oblock), 780 oblocks_per_dblock(cache))); 781 } 782 783 static void set_discard(struct cache *cache, dm_dblock_t b) 784 { 785 unsigned long flags; 786 787 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 788 atomic_inc(&cache->stats.discard_count); 789 790 spin_lock_irqsave(&cache->lock, flags); 791 set_bit(from_dblock(b), cache->discard_bitset); 792 spin_unlock_irqrestore(&cache->lock, flags); 793 } 794 795 static void clear_discard(struct cache *cache, dm_dblock_t b) 796 { 797 unsigned long flags; 798 799 spin_lock_irqsave(&cache->lock, flags); 800 clear_bit(from_dblock(b), cache->discard_bitset); 801 spin_unlock_irqrestore(&cache->lock, flags); 802 } 803 804 static bool is_discarded(struct cache *cache, dm_dblock_t b) 805 { 806 int r; 807 unsigned long flags; 808 809 spin_lock_irqsave(&cache->lock, flags); 810 r = test_bit(from_dblock(b), cache->discard_bitset); 811 spin_unlock_irqrestore(&cache->lock, flags); 812 813 return r; 814 } 815 816 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 817 { 818 int r; 819 unsigned long flags; 820 821 spin_lock_irqsave(&cache->lock, flags); 822 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 823 cache->discard_bitset); 824 spin_unlock_irqrestore(&cache->lock, flags); 825 826 return r; 827 } 828 829 /*---------------------------------------------------------------- 830 * Remapping 831 *--------------------------------------------------------------*/ 832 static void remap_to_origin(struct cache *cache, struct bio *bio) 833 { 834 bio->bi_bdev = cache->origin_dev->bdev; 835 } 836 837 static void remap_to_cache(struct cache *cache, struct bio *bio, 838 dm_cblock_t cblock) 839 { 840 sector_t bi_sector = bio->bi_iter.bi_sector; 841 sector_t block = from_cblock(cblock); 842 843 bio->bi_bdev = cache->cache_dev->bdev; 844 if (!block_size_is_power_of_two(cache)) 845 bio->bi_iter.bi_sector = 846 (block * cache->sectors_per_block) + 847 sector_div(bi_sector, cache->sectors_per_block); 848 else 849 bio->bi_iter.bi_sector = 850 (block << 
cache->sectors_per_block_shift) | 851 (bi_sector & (cache->sectors_per_block - 1)); 852 } 853 854 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 855 { 856 unsigned long flags; 857 size_t pb_data_size = get_per_bio_data_size(cache); 858 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 859 860 spin_lock_irqsave(&cache->lock, flags); 861 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 862 bio_op(bio) != REQ_OP_DISCARD) { 863 pb->tick = true; 864 cache->need_tick_bio = false; 865 } 866 spin_unlock_irqrestore(&cache->lock, flags); 867 } 868 869 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 870 dm_oblock_t oblock) 871 { 872 // FIXME: this is called way too much. 873 check_if_tick_bio_needed(cache, bio); 874 remap_to_origin(cache, bio); 875 if (bio_data_dir(bio) == WRITE) 876 clear_discard(cache, oblock_to_dblock(cache, oblock)); 877 } 878 879 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 880 dm_oblock_t oblock, dm_cblock_t cblock) 881 { 882 check_if_tick_bio_needed(cache, bio); 883 remap_to_cache(cache, bio, cblock); 884 if (bio_data_dir(bio) == WRITE) { 885 set_dirty(cache, cblock); 886 clear_discard(cache, oblock_to_dblock(cache, oblock)); 887 } 888 } 889 890 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 891 { 892 sector_t block_nr = bio->bi_iter.bi_sector; 893 894 if (!block_size_is_power_of_two(cache)) 895 (void) sector_div(block_nr, cache->sectors_per_block); 896 else 897 block_nr >>= cache->sectors_per_block_shift; 898 899 return to_oblock(block_nr); 900 } 901 902 static bool accountable_bio(struct cache *cache, struct bio *bio) 903 { 904 return ((bio->bi_bdev == cache->origin_dev->bdev) && 905 bio_op(bio) != REQ_OP_DISCARD); 906 } 907 908 static void accounted_begin(struct cache *cache, struct bio *bio) 909 { 910 size_t pb_data_size = get_per_bio_data_size(cache); 911 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 912 913 if (accountable_bio(cache, bio)) { 914 pb->len = bio_sectors(bio); 915 iot_io_begin(&cache->origin_tracker, pb->len); 916 } 917 } 918 919 static void accounted_complete(struct cache *cache, struct bio *bio) 920 { 921 size_t pb_data_size = get_per_bio_data_size(cache); 922 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 923 924 iot_io_end(&cache->origin_tracker, pb->len); 925 } 926 927 static void accounted_request(struct cache *cache, struct bio *bio) 928 { 929 accounted_begin(cache, bio); 930 generic_make_request(bio); 931 } 932 933 static void issue_op(struct bio *bio, void *context) 934 { 935 struct cache *cache = context; 936 accounted_request(cache, bio); 937 } 938 939 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 940 { 941 unsigned long flags; 942 943 spin_lock_irqsave(&cache->lock, flags); 944 bio_list_add(&cache->deferred_writethrough_bios, bio); 945 spin_unlock_irqrestore(&cache->lock, flags); 946 947 wake_deferred_writethrough_worker(cache); 948 } 949 950 static void writethrough_endio(struct bio *bio) 951 { 952 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 953 954 dm_unhook_bio(&pb->hook_info, bio); 955 956 if (bio->bi_error) { 957 bio_endio(bio); 958 return; 959 } 960 961 dm_bio_restore(&pb->bio_details, bio); 962 remap_to_cache(pb->cache, bio, pb->cblock); 963 964 /* 965 * We can't issue this bio directly, since we're in interrupt 966 * context. So it gets put on a bio list for processing by the 967 * worker thread. 
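	 * (The list is drained by process_deferred_writethrough_bios().)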
968 */ 969 defer_writethrough_bio(pb->cache, bio); 970 } 971 972 /* 973 * FIXME: send in parallel, huge latency as is. 974 * When running in writethrough mode we need to send writes to clean blocks 975 * to both the cache and origin devices. In future we'd like to clone the 976 * bio and send them in parallel, but for now we're doing them in 977 * series as this is easier. 978 */ 979 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 980 dm_oblock_t oblock, dm_cblock_t cblock) 981 { 982 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 983 984 pb->cache = cache; 985 pb->cblock = cblock; 986 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); 987 dm_bio_record(&pb->bio_details, bio); 988 989 remap_to_origin_clear_discard(pb->cache, bio, oblock); 990 } 991 992 /*---------------------------------------------------------------- 993 * Failure modes 994 *--------------------------------------------------------------*/ 995 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 996 { 997 return cache->features.mode; 998 } 999 1000 static const char *cache_device_name(struct cache *cache) 1001 { 1002 return dm_device_name(dm_table_get_md(cache->ti->table)); 1003 } 1004 1005 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 1006 { 1007 const char *descs[] = { 1008 "write", 1009 "read-only", 1010 "fail" 1011 }; 1012 1013 dm_table_event(cache->ti->table); 1014 DMINFO("%s: switching cache to %s mode", 1015 cache_device_name(cache), descs[(int)mode]); 1016 } 1017 1018 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 1019 { 1020 bool needs_check; 1021 enum cache_metadata_mode old_mode = get_cache_mode(cache); 1022 1023 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 1024 DMERR("%s: unable to read needs_check flag, setting failure mode.", 1025 cache_device_name(cache)); 1026 new_mode = CM_FAIL; 1027 } 1028 1029 if (new_mode == CM_WRITE && needs_check) { 1030 DMERR("%s: unable to switch cache to write mode until repaired.", 1031 cache_device_name(cache)); 1032 if (old_mode != new_mode) 1033 new_mode = old_mode; 1034 else 1035 new_mode = CM_READ_ONLY; 1036 } 1037 1038 /* Never move out of fail mode */ 1039 if (old_mode == CM_FAIL) 1040 new_mode = CM_FAIL; 1041 1042 switch (new_mode) { 1043 case CM_FAIL: 1044 case CM_READ_ONLY: 1045 dm_cache_metadata_set_read_only(cache->cmd); 1046 break; 1047 1048 case CM_WRITE: 1049 dm_cache_metadata_set_read_write(cache->cmd); 1050 break; 1051 } 1052 1053 cache->features.mode = new_mode; 1054 1055 if (new_mode != old_mode) 1056 notify_mode_switch(cache, new_mode); 1057 } 1058 1059 static void abort_transaction(struct cache *cache) 1060 { 1061 const char *dev_name = cache_device_name(cache); 1062 1063 if (get_cache_mode(cache) >= CM_READ_ONLY) 1064 return; 1065 1066 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1067 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1068 set_cache_mode(cache, CM_FAIL); 1069 } 1070 1071 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1072 if (dm_cache_metadata_abort(cache->cmd)) { 1073 DMERR("%s: failed to abort metadata transaction", dev_name); 1074 set_cache_mode(cache, CM_FAIL); 1075 } 1076 } 1077 1078 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1079 { 1080 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1081 cache_device_name(cache), op, r); 1082 abort_transaction(cache); 1083 set_cache_mode(cache, 
CM_READ_ONLY); 1084 } 1085 1086 /*----------------------------------------------------------------*/ 1087 1088 static void load_stats(struct cache *cache) 1089 { 1090 struct dm_cache_statistics stats; 1091 1092 dm_cache_metadata_get_stats(cache->cmd, &stats); 1093 atomic_set(&cache->stats.read_hit, stats.read_hits); 1094 atomic_set(&cache->stats.read_miss, stats.read_misses); 1095 atomic_set(&cache->stats.write_hit, stats.write_hits); 1096 atomic_set(&cache->stats.write_miss, stats.write_misses); 1097 } 1098 1099 static void save_stats(struct cache *cache) 1100 { 1101 struct dm_cache_statistics stats; 1102 1103 if (get_cache_mode(cache) >= CM_READ_ONLY) 1104 return; 1105 1106 stats.read_hits = atomic_read(&cache->stats.read_hit); 1107 stats.read_misses = atomic_read(&cache->stats.read_miss); 1108 stats.write_hits = atomic_read(&cache->stats.write_hit); 1109 stats.write_misses = atomic_read(&cache->stats.write_miss); 1110 1111 dm_cache_metadata_set_stats(cache->cmd, &stats); 1112 } 1113 1114 static void update_stats(struct cache_stats *stats, enum policy_operation op) 1115 { 1116 switch (op) { 1117 case POLICY_PROMOTE: 1118 atomic_inc(&stats->promotion); 1119 break; 1120 1121 case POLICY_DEMOTE: 1122 atomic_inc(&stats->demotion); 1123 break; 1124 1125 case POLICY_WRITEBACK: 1126 atomic_inc(&stats->writeback); 1127 break; 1128 } 1129 } 1130 1131 /*---------------------------------------------------------------- 1132 * Migration processing 1133 * 1134 * Migration covers moving data from the origin device to the cache, or 1135 * vice versa. 1136 *--------------------------------------------------------------*/ 1137 1138 static void inc_io_migrations(struct cache *cache) 1139 { 1140 atomic_inc(&cache->nr_io_migrations); 1141 } 1142 1143 static void dec_io_migrations(struct cache *cache) 1144 { 1145 atomic_dec(&cache->nr_io_migrations); 1146 } 1147 1148 static bool discard_or_flush(struct bio *bio) 1149 { 1150 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1151 } 1152 1153 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1154 dm_dblock_t *b, dm_dblock_t *e) 1155 { 1156 sector_t sb = bio->bi_iter.bi_sector; 1157 sector_t se = bio_end_sector(bio); 1158 1159 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1160 1161 if (se - sb < cache->discard_block_size) 1162 *e = *b; 1163 else 1164 *e = to_dblock(block_div(se, cache->discard_block_size)); 1165 } 1166 1167 /*----------------------------------------------------------------*/ 1168 1169 static void prevent_background_work(struct cache *cache) 1170 { 1171 lockdep_off(); 1172 down_write(&cache->background_work_lock); 1173 lockdep_on(); 1174 } 1175 1176 static void allow_background_work(struct cache *cache) 1177 { 1178 lockdep_off(); 1179 up_write(&cache->background_work_lock); 1180 lockdep_on(); 1181 } 1182 1183 static bool background_work_begin(struct cache *cache) 1184 { 1185 bool r; 1186 1187 lockdep_off(); 1188 r = down_read_trylock(&cache->background_work_lock); 1189 lockdep_on(); 1190 1191 return r; 1192 } 1193 1194 static void background_work_end(struct cache *cache) 1195 { 1196 lockdep_off(); 1197 up_read(&cache->background_work_lock); 1198 lockdep_on(); 1199 } 1200 1201 /*----------------------------------------------------------------*/ 1202 1203 static void quiesce(struct dm_cache_migration *mg, 1204 void (*continuation)(struct work_struct *)) 1205 { 1206 init_continuation(&mg->k, continuation); 1207 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1208 } 1209 1210 
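/*
 * Migration steps are chained together as continuations: each step is a
 * work_struct handler that recovers its dm_cache_migration with ws_to_mg()
 * below, does its bit, then either queues the next step directly or hands
 * it to quiesce(), which arranges for it to run once the cell's existing
 * holders have finished. A sketch of the pattern (step_one()/step_two()
 * are hypothetical, not functions from this file):
 *
 *	static void step_two(struct work_struct *ws)
 *	{
 *		struct dm_cache_migration *mg = ws_to_mg(ws);
 *		// ... act on mg, possibly checking mg->k.input ...
 *	}
 *
 *	static void step_one(struct dm_cache_migration *mg)
 *	{
 *		init_continuation(&mg->k, step_two);
 *		queue_continuation(mg->cache->wq, &mg->k);
 *	}
 */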
static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1211 { 1212 struct continuation *k = container_of(ws, struct continuation, ws); 1213 return container_of(k, struct dm_cache_migration, k); 1214 } 1215 1216 static void copy_complete(int read_err, unsigned long write_err, void *context) 1217 { 1218 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1219 1220 if (read_err || write_err) 1221 mg->k.input = -EIO; 1222 1223 queue_continuation(mg->cache->wq, &mg->k); 1224 } 1225 1226 static int copy(struct dm_cache_migration *mg, bool promote) 1227 { 1228 int r; 1229 struct dm_io_region o_region, c_region; 1230 struct cache *cache = mg->cache; 1231 1232 o_region.bdev = cache->origin_dev->bdev; 1233 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1234 o_region.count = cache->sectors_per_block; 1235 1236 c_region.bdev = cache->cache_dev->bdev; 1237 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1238 c_region.count = cache->sectors_per_block; 1239 1240 if (promote) 1241 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1242 else 1243 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1244 1245 return r; 1246 } 1247 1248 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1249 { 1250 size_t pb_data_size = get_per_bio_data_size(cache); 1251 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1252 1253 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1254 free_prison_cell(cache, pb->cell); 1255 pb->cell = NULL; 1256 } 1257 1258 static void overwrite_endio(struct bio *bio) 1259 { 1260 struct dm_cache_migration *mg = bio->bi_private; 1261 struct cache *cache = mg->cache; 1262 size_t pb_data_size = get_per_bio_data_size(cache); 1263 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1264 1265 dm_unhook_bio(&pb->hook_info, bio); 1266 1267 if (bio->bi_error) 1268 mg->k.input = bio->bi_error; 1269 1270 queue_continuation(mg->cache->wq, &mg->k); 1271 } 1272 1273 static void overwrite(struct dm_cache_migration *mg, 1274 void (*continuation)(struct work_struct *)) 1275 { 1276 struct bio *bio = mg->overwrite_bio; 1277 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1278 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1279 1280 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1281 1282 /* 1283 * The overwrite bio is part of the copy operation, as such it does 1284 * not set/clear discard or dirty flags. 
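	 * (Those flags are brought back in sync by mg_complete() once the
	 * migration finishes.)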
1285 */ 1286 if (mg->op->op == POLICY_PROMOTE) 1287 remap_to_cache(mg->cache, bio, mg->op->cblock); 1288 else 1289 remap_to_origin(mg->cache, bio); 1290 1291 init_continuation(&mg->k, continuation); 1292 accounted_request(mg->cache, bio); 1293 } 1294 1295 /* 1296 * Migration steps: 1297 * 1298 * 1) exclusive lock preventing WRITEs 1299 * 2) quiesce 1300 * 3) copy or issue overwrite bio 1301 * 4) upgrade to exclusive lock preventing READs and WRITEs 1302 * 5) quiesce 1303 * 6) update metadata and commit 1304 * 7) unlock 1305 */ 1306 static void mg_complete(struct dm_cache_migration *mg, bool success) 1307 { 1308 struct bio_list bios; 1309 struct cache *cache = mg->cache; 1310 struct policy_work *op = mg->op; 1311 dm_cblock_t cblock = op->cblock; 1312 1313 if (success) 1314 update_stats(&cache->stats, op->op); 1315 1316 switch (op->op) { 1317 case POLICY_PROMOTE: 1318 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1319 policy_complete_background_work(cache->policy, op, success); 1320 1321 if (mg->overwrite_bio) { 1322 if (success) 1323 force_set_dirty(cache, cblock); 1324 else 1325 mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); 1326 bio_endio(mg->overwrite_bio); 1327 } else { 1328 if (success) 1329 force_clear_dirty(cache, cblock); 1330 dec_io_migrations(cache); 1331 } 1332 break; 1333 1334 case POLICY_DEMOTE: 1335 /* 1336 * We clear dirty here to update the nr_dirty counter. 1337 */ 1338 if (success) 1339 force_clear_dirty(cache, cblock); 1340 policy_complete_background_work(cache->policy, op, success); 1341 dec_io_migrations(cache); 1342 break; 1343 1344 case POLICY_WRITEBACK: 1345 if (success) 1346 force_clear_dirty(cache, cblock); 1347 policy_complete_background_work(cache->policy, op, success); 1348 dec_io_migrations(cache); 1349 break; 1350 } 1351 1352 bio_list_init(&bios); 1353 if (mg->cell) { 1354 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1355 free_prison_cell(cache, mg->cell); 1356 } 1357 1358 free_migration(mg); 1359 defer_bios(cache, &bios); 1360 wake_migration_worker(cache); 1361 1362 background_work_end(cache); 1363 } 1364 1365 static void mg_success(struct work_struct *ws) 1366 { 1367 struct dm_cache_migration *mg = ws_to_mg(ws); 1368 mg_complete(mg, mg->k.input == 0); 1369 } 1370 1371 static void mg_update_metadata(struct work_struct *ws) 1372 { 1373 int r; 1374 struct dm_cache_migration *mg = ws_to_mg(ws); 1375 struct cache *cache = mg->cache; 1376 struct policy_work *op = mg->op; 1377 1378 switch (op->op) { 1379 case POLICY_PROMOTE: 1380 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1381 if (r) { 1382 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1383 cache_device_name(cache)); 1384 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1385 1386 mg_complete(mg, false); 1387 return; 1388 } 1389 mg_complete(mg, true); 1390 break; 1391 1392 case POLICY_DEMOTE: 1393 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1394 if (r) { 1395 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1396 cache_device_name(cache)); 1397 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1398 1399 mg_complete(mg, false); 1400 return; 1401 } 1402 1403 /* 1404 * It would be nice if we only had to commit when a REQ_FLUSH 1405 * comes through. 
But there's one scenario that we have to
		 * look out for:
		 *
		 * - oblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * rollback to having the data for oblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_copy(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
		 */
		overwrite(mg, mg_update_metadata_after_copy);

	} else {
		struct cache *cache = mg->cache;
		struct policy_work *op = mg->op;
		bool is_policy_promote = (op->op == POLICY_PROMOTE);

		if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
		    is_discarded_oblock(cache, op->oblock)) {
			mg_upgrade_lock(ws);
			return;
		}

		init_continuation(&mg->k, mg_upgrade_lock);

		r = copy(mg, is_policy_promote);
		if (r) {
			DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
			mg->k.input = -EIO;
			mg_complete(mg, false);
		}
	}
}

static int mg_lock_writes(struct dm_cache_migration *mg)
{
	int r;
	struct dm_cell_key_v2 key;
	struct cache *cache = mg->cache;
	struct dm_bio_prison_cell_v2 *prealloc;

	prealloc = alloc_prison_cell(cache);
	if (!prealloc) {
		DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
		mg_complete(mg, false);
		return -ENOMEM;
	}

	/*
	 * Prevent writes to the block, but allow reads to continue.
	 * Unless we're using an overwrite bio, in which case we lock
	 * everything.
	 */
	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
	r = dm_cell_lock_v2(cache->prison, &key,
			    mg->overwrite_bio ?
READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1533 prealloc, &mg->cell); 1534 if (r < 0) { 1535 free_prison_cell(cache, prealloc); 1536 mg_complete(mg, false); 1537 return r; 1538 } 1539 1540 if (mg->cell != prealloc) 1541 free_prison_cell(cache, prealloc); 1542 1543 if (r == 0) 1544 mg_copy(&mg->k.ws); 1545 else 1546 quiesce(mg, mg_copy); 1547 1548 return 0; 1549 } 1550 1551 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1552 { 1553 struct dm_cache_migration *mg; 1554 1555 if (!background_work_begin(cache)) { 1556 policy_complete_background_work(cache->policy, op, false); 1557 return -EPERM; 1558 } 1559 1560 mg = alloc_migration(cache); 1561 if (!mg) { 1562 policy_complete_background_work(cache->policy, op, false); 1563 background_work_end(cache); 1564 return -ENOMEM; 1565 } 1566 1567 memset(mg, 0, sizeof(*mg)); 1568 1569 mg->cache = cache; 1570 mg->op = op; 1571 mg->overwrite_bio = bio; 1572 1573 if (!bio) 1574 inc_io_migrations(cache); 1575 1576 return mg_lock_writes(mg); 1577 } 1578 1579 /*---------------------------------------------------------------- 1580 * invalidation processing 1581 *--------------------------------------------------------------*/ 1582 1583 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1584 { 1585 struct bio_list bios; 1586 struct cache *cache = mg->cache; 1587 1588 bio_list_init(&bios); 1589 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1590 free_prison_cell(cache, mg->cell); 1591 1592 if (!success && mg->overwrite_bio) 1593 bio_io_error(mg->overwrite_bio); 1594 1595 free_migration(mg); 1596 defer_bios(cache, &bios); 1597 1598 background_work_end(cache); 1599 } 1600 1601 static void invalidate_completed(struct work_struct *ws) 1602 { 1603 struct dm_cache_migration *mg = ws_to_mg(ws); 1604 invalidate_complete(mg, !mg->k.input); 1605 } 1606 1607 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1608 { 1609 int r = policy_invalidate_mapping(cache->policy, cblock); 1610 if (!r) { 1611 r = dm_cache_remove_mapping(cache->cmd, cblock); 1612 if (r) { 1613 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1614 cache_device_name(cache)); 1615 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1616 } 1617 1618 } else if (r == -ENODATA) { 1619 /* 1620 * Harmless, already unmapped. 
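		 * The policy had no mapping for this cblock, so there is
		 * nothing to remove from the on-disk metadata either.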
1621 */ 1622 r = 0; 1623 1624 } else 1625 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1626 1627 return r; 1628 } 1629 1630 static void invalidate_remove(struct work_struct *ws) 1631 { 1632 int r; 1633 struct dm_cache_migration *mg = ws_to_mg(ws); 1634 struct cache *cache = mg->cache; 1635 1636 r = invalidate_cblock(cache, mg->invalidate_cblock); 1637 if (r) { 1638 invalidate_complete(mg, false); 1639 return; 1640 } 1641 1642 init_continuation(&mg->k, invalidate_completed); 1643 continue_after_commit(&cache->committer, &mg->k); 1644 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1645 mg->overwrite_bio = NULL; 1646 schedule_commit(&cache->committer); 1647 } 1648 1649 static int invalidate_lock(struct dm_cache_migration *mg) 1650 { 1651 int r; 1652 struct dm_cell_key_v2 key; 1653 struct cache *cache = mg->cache; 1654 struct dm_bio_prison_cell_v2 *prealloc; 1655 1656 prealloc = alloc_prison_cell(cache); 1657 if (!prealloc) { 1658 invalidate_complete(mg, false); 1659 return -ENOMEM; 1660 } 1661 1662 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1663 r = dm_cell_lock_v2(cache->prison, &key, 1664 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1665 if (r < 0) { 1666 free_prison_cell(cache, prealloc); 1667 invalidate_complete(mg, false); 1668 return r; 1669 } 1670 1671 if (mg->cell != prealloc) 1672 free_prison_cell(cache, prealloc); 1673 1674 if (r) 1675 quiesce(mg, invalidate_remove); 1676 1677 else { 1678 /* 1679 * We can't call invalidate_remove() directly here because we 1680 * might still be in request context. 1681 */ 1682 init_continuation(&mg->k, invalidate_remove); 1683 queue_work(cache->wq, &mg->k.ws); 1684 } 1685 1686 return 0; 1687 } 1688 1689 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1690 dm_oblock_t oblock, struct bio *bio) 1691 { 1692 struct dm_cache_migration *mg; 1693 1694 if (!background_work_begin(cache)) 1695 return -EPERM; 1696 1697 mg = alloc_migration(cache); 1698 if (!mg) { 1699 background_work_end(cache); 1700 return -ENOMEM; 1701 } 1702 1703 memset(mg, 0, sizeof(*mg)); 1704 1705 mg->cache = cache; 1706 mg->overwrite_bio = bio; 1707 mg->invalidate_cblock = cblock; 1708 mg->invalidate_oblock = oblock; 1709 1710 return invalidate_lock(mg); 1711 } 1712 1713 /*---------------------------------------------------------------- 1714 * bio processing 1715 *--------------------------------------------------------------*/ 1716 1717 enum busy { 1718 IDLE, 1719 MODERATE, 1720 BUSY 1721 }; 1722 1723 static enum busy spare_migration_bandwidth(struct cache *cache) 1724 { 1725 bool idle = iot_idle_for(&cache->origin_tracker, HZ); 1726 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1727 cache->sectors_per_block; 1728 1729 if (current_volume <= cache->migration_threshold) 1730 return idle ? IDLE : MODERATE; 1731 else 1732 return idle ? MODERATE : BUSY; 1733 } 1734 1735 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1736 { 1737 atomic_inc(bio_data_dir(bio) == READ ? 1738 &cache->stats.read_hit : &cache->stats.write_hit); 1739 } 1740 1741 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1742 { 1743 atomic_inc(bio_data_dir(bio) == READ ? 
1744 &cache->stats.read_miss : &cache->stats.write_miss); 1745 } 1746 1747 /*----------------------------------------------------------------*/ 1748 1749 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1750 { 1751 return (bio_data_dir(bio) == WRITE) && 1752 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1753 } 1754 1755 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1756 { 1757 return writeback_mode(&cache->features) && 1758 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1759 } 1760 1761 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1762 bool *commit_needed) 1763 { 1764 int r, data_dir; 1765 bool rb, background_queued; 1766 dm_cblock_t cblock; 1767 size_t pb_data_size = get_per_bio_data_size(cache); 1768 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1769 1770 *commit_needed = false; 1771 1772 rb = bio_detain_shared(cache, block, bio); 1773 if (!rb) { 1774 /* 1775 * An exclusive lock is held for this block, so we have to 1776 * wait. We set the commit_needed flag so the current 1777 * transaction will be committed asap, allowing this lock 1778 * to be dropped. 1779 */ 1780 *commit_needed = true; 1781 return DM_MAPIO_SUBMITTED; 1782 } 1783 1784 data_dir = bio_data_dir(bio); 1785 1786 if (optimisable_bio(cache, bio, block)) { 1787 struct policy_work *op = NULL; 1788 1789 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1790 if (unlikely(r && r != -ENOENT)) { 1791 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1792 cache_device_name(cache), r); 1793 bio_io_error(bio); 1794 return DM_MAPIO_SUBMITTED; 1795 } 1796 1797 if (r == -ENOENT && op) { 1798 bio_drop_shared_lock(cache, bio); 1799 BUG_ON(op->op != POLICY_PROMOTE); 1800 mg_start(cache, op, bio); 1801 return DM_MAPIO_SUBMITTED; 1802 } 1803 } else { 1804 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1805 if (unlikely(r && r != -ENOENT)) { 1806 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1807 cache_device_name(cache), r); 1808 bio_io_error(bio); 1809 return DM_MAPIO_SUBMITTED; 1810 } 1811 1812 if (background_queued) 1813 wake_migration_worker(cache); 1814 } 1815 1816 if (r == -ENOENT) { 1817 /* 1818 * Miss. 1819 */ 1820 inc_miss_counter(cache, bio); 1821 if (pb->req_nr == 0) { 1822 accounted_begin(cache, bio); 1823 remap_to_origin_clear_discard(cache, bio, block); 1824 1825 } else { 1826 /* 1827 * This is a duplicate writethrough io that is no 1828 * longer needed because the block has been demoted. 1829 */ 1830 bio_endio(bio); 1831 return DM_MAPIO_SUBMITTED; 1832 } 1833 } else { 1834 /* 1835 * Hit. 1836 */ 1837 inc_hit_counter(cache, bio); 1838 1839 /* 1840 * Passthrough always maps to the origin, invalidating any 1841 * cache blocks that are written to. 
1842 */ 1843 if (passthrough_mode(&cache->features)) { 1844 if (bio_data_dir(bio) == WRITE) { 1845 bio_drop_shared_lock(cache, bio); 1846 atomic_inc(&cache->stats.demotion); 1847 invalidate_start(cache, cblock, block, bio); 1848 } else 1849 remap_to_origin_clear_discard(cache, bio, block); 1850 1851 } else { 1852 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 1853 !is_dirty(cache, cblock)) { 1854 remap_to_origin_then_cache(cache, bio, block, cblock); 1855 accounted_begin(cache, bio); 1856 } else 1857 remap_to_cache_dirty(cache, bio, block, cblock); 1858 } 1859 } 1860 1861 /* 1862 * dm core turns FUA requests into a separate payload and FLUSH req. 1863 */ 1864 if (bio->bi_opf & REQ_FUA) { 1865 /* 1866 * issue_after_commit will call accounted_begin a second time. So 1867 * we call accounted_complete() to avoid double accounting. 1868 */ 1869 accounted_complete(cache, bio); 1870 issue_after_commit(&cache->committer, bio); 1871 *commit_needed = true; 1872 return DM_MAPIO_SUBMITTED; 1873 } 1874 1875 return DM_MAPIO_REMAPPED; 1876 } 1877 1878 static bool process_bio(struct cache *cache, struct bio *bio) 1879 { 1880 bool commit_needed; 1881 1882 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1883 generic_make_request(bio); 1884 1885 return commit_needed; 1886 } 1887 1888 /* 1889 * A non-zero return indicates read_only or fail_io mode. 1890 */ 1891 static int commit(struct cache *cache, bool clean_shutdown) 1892 { 1893 int r; 1894 1895 if (get_cache_mode(cache) >= CM_READ_ONLY) 1896 return -EINVAL; 1897 1898 atomic_inc(&cache->stats.commit_count); 1899 r = dm_cache_commit(cache->cmd, clean_shutdown); 1900 if (r) 1901 metadata_operation_failed(cache, "dm_cache_commit", r); 1902 1903 return r; 1904 } 1905 1906 /* 1907 * Used by the batcher. 1908 */ 1909 static int commit_op(void *context) 1910 { 1911 struct cache *cache = context; 1912 1913 if (dm_cache_changed_this_transaction(cache->cmd)) 1914 return commit(cache, false); 1915 1916 return 0; 1917 } 1918 1919 /*----------------------------------------------------------------*/ 1920 1921 static bool process_flush_bio(struct cache *cache, struct bio *bio) 1922 { 1923 size_t pb_data_size = get_per_bio_data_size(cache); 1924 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1925 1926 if (!pb->req_nr) 1927 remap_to_origin(cache, bio); 1928 else 1929 remap_to_cache(cache, bio, 0); 1930 1931 issue_after_commit(&cache->committer, bio); 1932 return true; 1933 } 1934 1935 static bool process_discard_bio(struct cache *cache, struct bio *bio) 1936 { 1937 dm_dblock_t b, e; 1938 1939 // FIXME: do we need to lock the region? Or can we just assume the 1940 // user wont be so foolish as to issue discard concurrently with 1941 // other IO? 
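	// Note we just record the discard in the bitset and complete the bio;
	// the discard is not passed down to the origin or cache devices.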
1942 calc_discard_block_range(cache, bio, &b, &e); 1943 while (b != e) { 1944 set_discard(cache, b); 1945 b = to_dblock(from_dblock(b) + 1); 1946 } 1947 1948 bio_endio(bio); 1949 1950 return false; 1951 } 1952 1953 static void process_deferred_bios(struct work_struct *ws) 1954 { 1955 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1956 1957 unsigned long flags; 1958 bool commit_needed = false; 1959 struct bio_list bios; 1960 struct bio *bio; 1961 1962 bio_list_init(&bios); 1963 1964 spin_lock_irqsave(&cache->lock, flags); 1965 bio_list_merge(&bios, &cache->deferred_bios); 1966 bio_list_init(&cache->deferred_bios); 1967 spin_unlock_irqrestore(&cache->lock, flags); 1968 1969 while ((bio = bio_list_pop(&bios))) { 1970 if (bio->bi_opf & REQ_PREFLUSH) 1971 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1972 1973 else if (bio_op(bio) == REQ_OP_DISCARD) 1974 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1975 1976 else 1977 commit_needed = process_bio(cache, bio) || commit_needed; 1978 } 1979 1980 if (commit_needed) 1981 schedule_commit(&cache->committer); 1982 } 1983 1984 static void process_deferred_writethrough_bios(struct work_struct *ws) 1985 { 1986 struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); 1987 1988 unsigned long flags; 1989 struct bio_list bios; 1990 struct bio *bio; 1991 1992 bio_list_init(&bios); 1993 1994 spin_lock_irqsave(&cache->lock, flags); 1995 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 1996 bio_list_init(&cache->deferred_writethrough_bios); 1997 spin_unlock_irqrestore(&cache->lock, flags); 1998 1999 /* 2000 * These bios have already been through accounted_begin() 2001 */ 2002 while ((bio = bio_list_pop(&bios))) 2003 generic_make_request(bio); 2004 } 2005 2006 /*---------------------------------------------------------------- 2007 * Main worker loop 2008 *--------------------------------------------------------------*/ 2009 2010 static void requeue_deferred_bios(struct cache *cache) 2011 { 2012 struct bio *bio; 2013 struct bio_list bios; 2014 2015 bio_list_init(&bios); 2016 bio_list_merge(&bios, &cache->deferred_bios); 2017 bio_list_init(&cache->deferred_bios); 2018 2019 while ((bio = bio_list_pop(&bios))) { 2020 bio->bi_error = DM_ENDIO_REQUEUE; 2021 bio_endio(bio); 2022 } 2023 } 2024 2025 /* 2026 * We want to commit periodically so that not too much 2027 * unwritten metadata builds up. 
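 * do_waker() below runs every COMMIT_PERIOD (HZ jiffies, i.e. roughly once
 * a second): it ticks the policy, wakes the migration worker, schedules a
 * commit and then requeues itself.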
2028 */ 2029 static void do_waker(struct work_struct *ws) 2030 { 2031 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2032 2033 policy_tick(cache->policy, true); 2034 wake_migration_worker(cache); 2035 schedule_commit(&cache->committer); 2036 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2037 } 2038 2039 static void check_migrations(struct work_struct *ws) 2040 { 2041 int r; 2042 struct policy_work *op; 2043 struct cache *cache = container_of(ws, struct cache, migration_worker); 2044 enum busy b; 2045 2046 for (;;) { 2047 b = spare_migration_bandwidth(cache); 2048 if (b == BUSY) 2049 break; 2050 2051 r = policy_get_background_work(cache->policy, b == IDLE, &op); 2052 if (r == -ENODATA) 2053 break; 2054 2055 if (r) { 2056 DMERR_LIMIT("%s: policy_background_work failed", 2057 cache_device_name(cache)); 2058 break; 2059 } 2060 2061 r = mg_start(cache, op, NULL); 2062 if (r) 2063 break; 2064 } 2065 } 2066 2067 /*---------------------------------------------------------------- 2068 * Target methods 2069 *--------------------------------------------------------------*/ 2070 2071 /* 2072 * This function gets called on the error paths of the constructor, so we 2073 * have to cope with a partially initialised struct. 2074 */ 2075 static void destroy(struct cache *cache) 2076 { 2077 unsigned i; 2078 2079 mempool_destroy(cache->migration_pool); 2080 2081 if (cache->prison) 2082 dm_bio_prison_destroy_v2(cache->prison); 2083 2084 if (cache->wq) 2085 destroy_workqueue(cache->wq); 2086 2087 if (cache->dirty_bitset) 2088 free_bitset(cache->dirty_bitset); 2089 2090 if (cache->discard_bitset) 2091 free_bitset(cache->discard_bitset); 2092 2093 if (cache->copier) 2094 dm_kcopyd_client_destroy(cache->copier); 2095 2096 if (cache->cmd) 2097 dm_cache_metadata_close(cache->cmd); 2098 2099 if (cache->metadata_dev) 2100 dm_put_device(cache->ti, cache->metadata_dev); 2101 2102 if (cache->origin_dev) 2103 dm_put_device(cache->ti, cache->origin_dev); 2104 2105 if (cache->cache_dev) 2106 dm_put_device(cache->ti, cache->cache_dev); 2107 2108 if (cache->policy) 2109 dm_cache_policy_destroy(cache->policy); 2110 2111 for (i = 0; i < cache->nr_ctr_args ; i++) 2112 kfree(cache->ctr_args[i]); 2113 kfree(cache->ctr_args); 2114 2115 kfree(cache); 2116 } 2117 2118 static void cache_dtr(struct dm_target *ti) 2119 { 2120 struct cache *cache = ti->private; 2121 2122 destroy(cache); 2123 } 2124 2125 static sector_t get_dev_size(struct dm_dev *dev) 2126 { 2127 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2128 } 2129 2130 /*----------------------------------------------------------------*/ 2131 2132 /* 2133 * Construct a cache device mapping. 2134 * 2135 * cache <metadata dev> <cache dev> <origin dev> <block size> 2136 * <#feature args> [<feature arg>]* 2137 * <policy> <#policy args> [<policy arg>]* 2138 * 2139 * metadata dev : fast device holding the persistent metadata 2140 * cache dev : fast device holding cached data blocks 2141 * origin dev : slow device holding original data blocks 2142 * block size : cache unit size in sectors 2143 * 2144 * #feature args : number of feature arguments passed 2145 * feature args : writethrough. (The default is writeback.) 2146 * 2147 * policy : the replacement policy to use 2148 * #policy args : an even number of policy arguments corresponding 2149 * to key/value pairs passed to the policy 2150 * policy args : key/value pairs passed to the policy 2151 * E.g. 'sequential_threshold 1024' 2152 * See cache-policies.txt for details. 
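 *
 * For example (illustrative device names and values only):
 *   cache /dev/fast1 /dev/fast2 /dev/slow 512 1 writethrough default 0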
2153 * 2154 * Optional feature arguments are: 2155 * writethrough : write through caching that prohibits cache block 2156 * content from being different from origin block content. 2157 * Without this argument, the default behaviour is to write 2158 * back cache block contents later for performance reasons, 2159 * so they may differ from the corresponding origin blocks. 2160 */ 2161 struct cache_args { 2162 struct dm_target *ti; 2163 2164 struct dm_dev *metadata_dev; 2165 2166 struct dm_dev *cache_dev; 2167 sector_t cache_sectors; 2168 2169 struct dm_dev *origin_dev; 2170 sector_t origin_sectors; 2171 2172 uint32_t block_size; 2173 2174 const char *policy_name; 2175 int policy_argc; 2176 const char **policy_argv; 2177 2178 struct cache_features features; 2179 }; 2180 2181 static void destroy_cache_args(struct cache_args *ca) 2182 { 2183 if (ca->metadata_dev) 2184 dm_put_device(ca->ti, ca->metadata_dev); 2185 2186 if (ca->cache_dev) 2187 dm_put_device(ca->ti, ca->cache_dev); 2188 2189 if (ca->origin_dev) 2190 dm_put_device(ca->ti, ca->origin_dev); 2191 2192 kfree(ca); 2193 } 2194 2195 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2196 { 2197 if (!as->argc) { 2198 *error = "Insufficient args"; 2199 return false; 2200 } 2201 2202 return true; 2203 } 2204 2205 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2206 char **error) 2207 { 2208 int r; 2209 sector_t metadata_dev_size; 2210 char b[BDEVNAME_SIZE]; 2211 2212 if (!at_least_one_arg(as, error)) 2213 return -EINVAL; 2214 2215 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2216 &ca->metadata_dev); 2217 if (r) { 2218 *error = "Error opening metadata device"; 2219 return r; 2220 } 2221 2222 metadata_dev_size = get_dev_size(ca->metadata_dev); 2223 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2224 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2225 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2226 2227 return 0; 2228 } 2229 2230 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2231 char **error) 2232 { 2233 int r; 2234 2235 if (!at_least_one_arg(as, error)) 2236 return -EINVAL; 2237 2238 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2239 &ca->cache_dev); 2240 if (r) { 2241 *error = "Error opening cache device"; 2242 return r; 2243 } 2244 ca->cache_sectors = get_dev_size(ca->cache_dev); 2245 2246 return 0; 2247 } 2248 2249 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2250 char **error) 2251 { 2252 int r; 2253 2254 if (!at_least_one_arg(as, error)) 2255 return -EINVAL; 2256 2257 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2258 &ca->origin_dev); 2259 if (r) { 2260 *error = "Error opening origin device"; 2261 return r; 2262 } 2263 2264 ca->origin_sectors = get_dev_size(ca->origin_dev); 2265 if (ca->ti->len > ca->origin_sectors) { 2266 *error = "Device size larger than cached device"; 2267 return -EINVAL; 2268 } 2269 2270 return 0; 2271 } 2272 2273 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2274 char **error) 2275 { 2276 unsigned long block_size; 2277 2278 if (!at_least_one_arg(as, error)) 2279 return -EINVAL; 2280 2281 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2282 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2283 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2284 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2285 *error = "Invalid data 
block size"; 2286 return -EINVAL; 2287 } 2288 2289 if (block_size > ca->cache_sectors) { 2290 *error = "Data block size is larger than the cache device"; 2291 return -EINVAL; 2292 } 2293 2294 ca->block_size = block_size; 2295 2296 return 0; 2297 } 2298 2299 static void init_features(struct cache_features *cf) 2300 { 2301 cf->mode = CM_WRITE; 2302 cf->io_mode = CM_IO_WRITEBACK; 2303 cf->metadata_version = 1; 2304 } 2305 2306 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2307 char **error) 2308 { 2309 static struct dm_arg _args[] = { 2310 {0, 2, "Invalid number of cache feature arguments"}, 2311 }; 2312 2313 int r; 2314 unsigned argc; 2315 const char *arg; 2316 struct cache_features *cf = &ca->features; 2317 2318 init_features(cf); 2319 2320 r = dm_read_arg_group(_args, as, &argc, error); 2321 if (r) 2322 return -EINVAL; 2323 2324 while (argc--) { 2325 arg = dm_shift_arg(as); 2326 2327 if (!strcasecmp(arg, "writeback")) 2328 cf->io_mode = CM_IO_WRITEBACK; 2329 2330 else if (!strcasecmp(arg, "writethrough")) 2331 cf->io_mode = CM_IO_WRITETHROUGH; 2332 2333 else if (!strcasecmp(arg, "passthrough")) 2334 cf->io_mode = CM_IO_PASSTHROUGH; 2335 2336 else if (!strcasecmp(arg, "metadata2")) 2337 cf->metadata_version = 2; 2338 2339 else { 2340 *error = "Unrecognised cache feature requested"; 2341 return -EINVAL; 2342 } 2343 } 2344 2345 return 0; 2346 } 2347 2348 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2349 char **error) 2350 { 2351 static struct dm_arg _args[] = { 2352 {0, 1024, "Invalid number of policy arguments"}, 2353 }; 2354 2355 int r; 2356 2357 if (!at_least_one_arg(as, error)) 2358 return -EINVAL; 2359 2360 ca->policy_name = dm_shift_arg(as); 2361 2362 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2363 if (r) 2364 return -EINVAL; 2365 2366 ca->policy_argv = (const char **)as->argv; 2367 dm_consume_args(as, ca->policy_argc); 2368 2369 return 0; 2370 } 2371 2372 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2373 char **error) 2374 { 2375 int r; 2376 struct dm_arg_set as; 2377 2378 as.argc = argc; 2379 as.argv = argv; 2380 2381 r = parse_metadata_dev(ca, &as, error); 2382 if (r) 2383 return r; 2384 2385 r = parse_cache_dev(ca, &as, error); 2386 if (r) 2387 return r; 2388 2389 r = parse_origin_dev(ca, &as, error); 2390 if (r) 2391 return r; 2392 2393 r = parse_block_size(ca, &as, error); 2394 if (r) 2395 return r; 2396 2397 r = parse_features(ca, &as, error); 2398 if (r) 2399 return r; 2400 2401 r = parse_policy(ca, &as, error); 2402 if (r) 2403 return r; 2404 2405 return 0; 2406 } 2407 2408 /*----------------------------------------------------------------*/ 2409 2410 static struct kmem_cache *migration_cache; 2411 2412 #define NOT_CORE_OPTION 1 2413 2414 static int process_config_option(struct cache *cache, const char *key, const char *value) 2415 { 2416 unsigned long tmp; 2417 2418 if (!strcasecmp(key, "migration_threshold")) { 2419 if (kstrtoul(value, 10, &tmp)) 2420 return -EINVAL; 2421 2422 cache->migration_threshold = tmp; 2423 return 0; 2424 } 2425 2426 return NOT_CORE_OPTION; 2427 } 2428 2429 static int set_config_value(struct cache *cache, const char *key, const char *value) 2430 { 2431 int r = process_config_option(cache, key, value); 2432 2433 if (r == NOT_CORE_OPTION) 2434 r = policy_set_config_value(cache->policy, key, value); 2435 2436 if (r) 2437 DMWARN("bad config value for %s: %s", key, value); 2438 2439 return r; 2440 } 2441 2442 static int set_config_values(struct cache *cache, int 
argc, const char **argv) 2443 { 2444 int r = 0; 2445 2446 if (argc & 1) { 2447 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2448 return -EINVAL; 2449 } 2450 2451 while (argc) { 2452 r = set_config_value(cache, argv[0], argv[1]); 2453 if (r) 2454 break; 2455 2456 argc -= 2; 2457 argv += 2; 2458 } 2459 2460 return r; 2461 } 2462 2463 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2464 char **error) 2465 { 2466 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2467 cache->cache_size, 2468 cache->origin_sectors, 2469 cache->sectors_per_block); 2470 if (IS_ERR(p)) { 2471 *error = "Error creating cache's policy"; 2472 return PTR_ERR(p); 2473 } 2474 cache->policy = p; 2475 BUG_ON(!cache->policy); 2476 2477 return 0; 2478 } 2479 2480 /* 2481 * We want the discard block size to be at least the size of the cache 2482 * block size and have no more than 2^14 discard blocks across the origin. 2483 */ 2484 #define MAX_DISCARD_BLOCKS (1 << 14) 2485 2486 static bool too_many_discard_blocks(sector_t discard_block_size, 2487 sector_t origin_size) 2488 { 2489 (void) sector_div(origin_size, discard_block_size); 2490 2491 return origin_size > MAX_DISCARD_BLOCKS; 2492 } 2493 2494 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2495 sector_t origin_size) 2496 { 2497 sector_t discard_block_size = cache_block_size; 2498 2499 if (origin_size) 2500 while (too_many_discard_blocks(discard_block_size, origin_size)) 2501 discard_block_size *= 2; 2502 2503 return discard_block_size; 2504 } 2505 2506 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2507 { 2508 dm_block_t nr_blocks = from_cblock(size); 2509 2510 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2511 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2512 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2513 "Please consider increasing the cache block size to reduce the overall cache block count.", 2514 (unsigned long long) nr_blocks); 2515 2516 cache->cache_size = size; 2517 } 2518 2519 static int is_congested(struct dm_dev *dev, int bdi_bits) 2520 { 2521 struct request_queue *q = bdev_get_queue(dev->bdev); 2522 return bdi_congested(q->backing_dev_info, bdi_bits); 2523 } 2524 2525 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2526 { 2527 struct cache *cache = container_of(cb, struct cache, callbacks); 2528 2529 return is_congested(cache->origin_dev, bdi_bits) || 2530 is_congested(cache->cache_dev, bdi_bits); 2531 } 2532 2533 #define DEFAULT_MIGRATION_THRESHOLD 2048 2534 2535 static int cache_create(struct cache_args *ca, struct cache **result) 2536 { 2537 int r = 0; 2538 char **error = &ca->ti->error; 2539 struct cache *cache; 2540 struct dm_target *ti = ca->ti; 2541 dm_block_t origin_blocks; 2542 struct dm_cache_metadata *cmd; 2543 bool may_format = ca->features.mode == CM_WRITE; 2544 2545 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2546 if (!cache) 2547 return -ENOMEM; 2548 2549 cache->ti = ca->ti; 2550 ti->private = cache; 2551 ti->num_flush_bios = 2; 2552 ti->flush_supported = true; 2553 2554 ti->num_discard_bios = 1; 2555 ti->discards_supported = true; 2556 ti->split_discard_bios = false; 2557 2558 cache->features = ca->features; 2559 ti->per_io_data_size = get_per_bio_data_size(cache); 2560 2561 cache->callbacks.congested_fn = cache_is_congested; 2562 dm_table_add_target_callbacks(ti->table, 
&cache->callbacks); 2563 2564 cache->metadata_dev = ca->metadata_dev; 2565 cache->origin_dev = ca->origin_dev; 2566 cache->cache_dev = ca->cache_dev; 2567 2568 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2569 2570 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2571 origin_blocks = block_div(origin_blocks, ca->block_size); 2572 cache->origin_blocks = to_oblock(origin_blocks); 2573 2574 cache->sectors_per_block = ca->block_size; 2575 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2576 r = -EINVAL; 2577 goto bad; 2578 } 2579 2580 if (ca->block_size & (ca->block_size - 1)) { 2581 dm_block_t cache_size = ca->cache_sectors; 2582 2583 cache->sectors_per_block_shift = -1; 2584 cache_size = block_div(cache_size, ca->block_size); 2585 set_cache_size(cache, to_cblock(cache_size)); 2586 } else { 2587 cache->sectors_per_block_shift = __ffs(ca->block_size); 2588 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2589 } 2590 2591 r = create_cache_policy(cache, ca, error); 2592 if (r) 2593 goto bad; 2594 2595 cache->policy_nr_args = ca->policy_argc; 2596 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2597 2598 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2599 if (r) { 2600 *error = "Error setting cache policy's config values"; 2601 goto bad; 2602 } 2603 2604 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2605 ca->block_size, may_format, 2606 dm_cache_policy_get_hint_size(cache->policy), 2607 ca->features.metadata_version); 2608 if (IS_ERR(cmd)) { 2609 *error = "Error creating metadata object"; 2610 r = PTR_ERR(cmd); 2611 goto bad; 2612 } 2613 cache->cmd = cmd; 2614 set_cache_mode(cache, CM_WRITE); 2615 if (get_cache_mode(cache) != CM_WRITE) { 2616 *error = "Unable to get write access to metadata, please check/repair metadata."; 2617 r = -EINVAL; 2618 goto bad; 2619 } 2620 2621 if (passthrough_mode(&cache->features)) { 2622 bool all_clean; 2623 2624 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2625 if (r) { 2626 *error = "dm_cache_metadata_all_clean() failed"; 2627 goto bad; 2628 } 2629 2630 if (!all_clean) { 2631 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2632 r = -EINVAL; 2633 goto bad; 2634 } 2635 2636 policy_allow_migrations(cache->policy, false); 2637 } 2638 2639 spin_lock_init(&cache->lock); 2640 INIT_LIST_HEAD(&cache->deferred_cells); 2641 bio_list_init(&cache->deferred_bios); 2642 bio_list_init(&cache->deferred_writethrough_bios); 2643 atomic_set(&cache->nr_allocated_migrations, 0); 2644 atomic_set(&cache->nr_io_migrations, 0); 2645 init_waitqueue_head(&cache->migration_wait); 2646 2647 r = -ENOMEM; 2648 atomic_set(&cache->nr_dirty, 0); 2649 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2650 if (!cache->dirty_bitset) { 2651 *error = "could not allocate dirty bitset"; 2652 goto bad; 2653 } 2654 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2655 2656 cache->discard_block_size = 2657 calculate_discard_block_size(cache->sectors_per_block, 2658 cache->origin_sectors); 2659 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2660 cache->discard_block_size)); 2661 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2662 if (!cache->discard_bitset) { 2663 *error = "could not allocate discard bitset"; 2664 goto bad; 2665 } 2666 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2667 2668 cache->copier = 
dm_kcopyd_client_create(&dm_kcopyd_throttle); 2669 if (IS_ERR(cache->copier)) { 2670 *error = "could not create kcopyd client"; 2671 r = PTR_ERR(cache->copier); 2672 goto bad; 2673 } 2674 2675 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2676 if (!cache->wq) { 2677 *error = "could not create workqueue for metadata object"; 2678 goto bad; 2679 } 2680 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2681 INIT_WORK(&cache->deferred_writethrough_worker, 2682 process_deferred_writethrough_bios); 2683 INIT_WORK(&cache->migration_worker, check_migrations); 2684 INIT_DELAYED_WORK(&cache->waker, do_waker); 2685 2686 cache->prison = dm_bio_prison_create_v2(cache->wq); 2687 if (!cache->prison) { 2688 *error = "could not create bio prison"; 2689 goto bad; 2690 } 2691 2692 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2693 migration_cache); 2694 if (!cache->migration_pool) { 2695 *error = "Error creating cache's migration mempool"; 2696 goto bad; 2697 } 2698 2699 cache->need_tick_bio = true; 2700 cache->sized = false; 2701 cache->invalidate = false; 2702 cache->commit_requested = false; 2703 cache->loaded_mappings = false; 2704 cache->loaded_discards = false; 2705 2706 load_stats(cache); 2707 2708 atomic_set(&cache->stats.demotion, 0); 2709 atomic_set(&cache->stats.promotion, 0); 2710 atomic_set(&cache->stats.copies_avoided, 0); 2711 atomic_set(&cache->stats.cache_cell_clash, 0); 2712 atomic_set(&cache->stats.commit_count, 0); 2713 atomic_set(&cache->stats.discard_count, 0); 2714 2715 spin_lock_init(&cache->invalidation_lock); 2716 INIT_LIST_HEAD(&cache->invalidation_requests); 2717 2718 batcher_init(&cache->committer, commit_op, cache, 2719 issue_op, cache, cache->wq); 2720 iot_init(&cache->origin_tracker); 2721 2722 init_rwsem(&cache->background_work_lock); 2723 prevent_background_work(cache); 2724 2725 *result = cache; 2726 return 0; 2727 bad: 2728 destroy(cache); 2729 return r; 2730 } 2731 2732 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2733 { 2734 unsigned i; 2735 const char **copy; 2736 2737 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2738 if (!copy) 2739 return -ENOMEM; 2740 for (i = 0; i < argc; i++) { 2741 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2742 if (!copy[i]) { 2743 while (i--) 2744 kfree(copy[i]); 2745 kfree(copy); 2746 return -ENOMEM; 2747 } 2748 } 2749 2750 cache->nr_ctr_args = argc; 2751 cache->ctr_args = copy; 2752 2753 return 0; 2754 } 2755 2756 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2757 { 2758 int r = -EINVAL; 2759 struct cache_args *ca; 2760 struct cache *cache = NULL; 2761 2762 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2763 if (!ca) { 2764 ti->error = "Error allocating memory for cache"; 2765 return -ENOMEM; 2766 } 2767 ca->ti = ti; 2768 2769 r = parse_cache_args(ca, argc, argv, &ti->error); 2770 if (r) 2771 goto out; 2772 2773 r = cache_create(ca, &cache); 2774 if (r) 2775 goto out; 2776 2777 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2778 if (r) { 2779 destroy(cache); 2780 goto out; 2781 } 2782 2783 ti->private = cache; 2784 out: 2785 destroy_cache_args(ca); 2786 return r; 2787 } 2788 2789 /*----------------------------------------------------------------*/ 2790 2791 static int cache_map(struct dm_target *ti, struct bio *bio) 2792 { 2793 struct cache *cache = ti->private; 2794 2795 int r; 2796 bool commit_needed; 2797 dm_oblock_t block = get_bio_block(cache, bio); 2798 size_t pb_data_size = get_per_bio_data_size(cache); 2799 2800 
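	/* Initialise the per-bio data before making any remapping decisions. */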
init_per_bio_data(bio, pb_data_size); 2801 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2802 /* 2803 * This can only occur if the io goes to a partial block at 2804 * the end of the origin device. We don't cache these. 2805 * Just remap to the origin and carry on. 2806 */ 2807 remap_to_origin(cache, bio); 2808 accounted_begin(cache, bio); 2809 return DM_MAPIO_REMAPPED; 2810 } 2811 2812 if (discard_or_flush(bio)) { 2813 defer_bio(cache, bio); 2814 return DM_MAPIO_SUBMITTED; 2815 } 2816 2817 r = map_bio(cache, bio, block, &commit_needed); 2818 if (commit_needed) 2819 schedule_commit(&cache->committer); 2820 2821 return r; 2822 } 2823 2824 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 2825 { 2826 struct cache *cache = ti->private; 2827 unsigned long flags; 2828 size_t pb_data_size = get_per_bio_data_size(cache); 2829 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 2830 2831 if (pb->tick) { 2832 policy_tick(cache->policy, false); 2833 2834 spin_lock_irqsave(&cache->lock, flags); 2835 cache->need_tick_bio = true; 2836 spin_unlock_irqrestore(&cache->lock, flags); 2837 } 2838 2839 bio_drop_shared_lock(cache, bio); 2840 accounted_complete(cache, bio); 2841 2842 return 0; 2843 } 2844 2845 static int write_dirty_bitset(struct cache *cache) 2846 { 2847 int r; 2848 2849 if (get_cache_mode(cache) >= CM_READ_ONLY) 2850 return -EINVAL; 2851 2852 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2853 if (r) 2854 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2855 2856 return r; 2857 } 2858 2859 static int write_discard_bitset(struct cache *cache) 2860 { 2861 unsigned i, r; 2862 2863 if (get_cache_mode(cache) >= CM_READ_ONLY) 2864 return -EINVAL; 2865 2866 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2867 cache->discard_nr_blocks); 2868 if (r) { 2869 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2870 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2871 return r; 2872 } 2873 2874 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2875 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2876 is_discarded(cache, to_dblock(i))); 2877 if (r) { 2878 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2879 return r; 2880 } 2881 } 2882 2883 return 0; 2884 } 2885 2886 static int write_hints(struct cache *cache) 2887 { 2888 int r; 2889 2890 if (get_cache_mode(cache) >= CM_READ_ONLY) 2891 return -EINVAL; 2892 2893 r = dm_cache_write_hints(cache->cmd, cache->policy); 2894 if (r) { 2895 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2896 return r; 2897 } 2898 2899 return 0; 2900 } 2901 2902 /* 2903 * returns true on success 2904 */ 2905 static bool sync_metadata(struct cache *cache) 2906 { 2907 int r1, r2, r3, r4; 2908 2909 r1 = write_dirty_bitset(cache); 2910 if (r1) 2911 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2912 2913 r2 = write_discard_bitset(cache); 2914 if (r2) 2915 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2916 2917 save_stats(cache); 2918 2919 r3 = write_hints(cache); 2920 if (r3) 2921 DMERR("%s: could not write hints", cache_device_name(cache)); 2922 2923 /* 2924 * If writing the above metadata failed, we still commit, but don't 2925 * set the clean shutdown flag. This will effectively force every 2926 * dirty bit to be set on reload. 
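	 * That is safe, if a little wasteful: spuriously-set dirty bits just
	 * cause extra writeback after the next activation.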
	 */
	r4 = commit(cache, !r1 && !r2 && !r3);
	if (r4)
		DMERR("%s: could not write cache metadata", cache_device_name(cache));

	return !r1 && !r2 && !r3 && !r4;
}

static void cache_postsuspend(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	prevent_background_work(cache);
	BUG_ON(atomic_read(&cache->nr_io_migrations));

	cancel_delayed_work(&cache->waker);
	flush_workqueue(cache->wq);
	WARN_ON(cache->origin_tracker.in_flight);

	/*
	 * If it's a flush suspend there won't be any deferred bios, so this
	 * call is harmless.
	 */
	requeue_deferred_bios(cache);

	if (get_cache_mode(cache) == CM_WRITE)
		(void) sync_metadata(cache);
}

static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
			bool dirty, uint32_t hint, bool hint_valid)
{
	int r;
	struct cache *cache = context;

	if (dirty) {
		set_bit(from_cblock(cblock), cache->dirty_bitset);
		atomic_inc(&cache->nr_dirty);
	} else
		clear_bit(from_cblock(cblock), cache->dirty_bitset);

	r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
	if (r)
		return r;

	return 0;
}

/*
 * The discard block size in the on-disk metadata is not necessarily the
 * same as the one we're currently using.  So we have to be careful to only
 * set the discarded attribute if we know it covers a complete block of the
 * new size.
 */
struct discard_load_info {
	struct cache *cache;

	/*
	 * These blocks are sized using the on-disk dblock size, rather
	 * than the current one.
	 */
	dm_block_t block_size;
	dm_block_t discard_begin, discard_end;
};

static void discard_load_info_init(struct cache *cache,
				   struct discard_load_info *li)
{
	li->cache = cache;
	li->discard_begin = li->discard_end = 0;
}

static void set_discard_range(struct discard_load_info *li)
{
	sector_t b, e;

	if (li->discard_begin == li->discard_end)
		return;

	/*
	 * Convert to sectors.
	 */
	b = li->discard_begin * li->block_size;
	e = li->discard_end * li->block_size;

	/*
	 * Then convert back to the current dblock size.
	 */
	b = dm_sector_div_up(b, li->cache->discard_block_size);
	sector_div(e, li->cache->discard_block_size);

	/*
	 * The origin may have shrunk, so we need to check we're still in
	 * bounds.
	 */
	if (e > from_dblock(li->cache->discard_nr_blocks))
		e = from_dblock(li->cache->discard_nr_blocks);

	for (; b < e; b++)
		set_discard(li->cache, to_dblock(b));
}

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct discard_load_info *li = context;

	li->block_size = discard_block_size;

	if (discard) {
		if (from_dblock(dblock) == li->discard_end)
			/*
			 * We're already in a discard range, just extend it.
			 */
			li->discard_end = li->discard_end + 1ULL;

		else {
			/*
			 * Emit the old range and start a new one.
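			 * set_discard_range() re-expresses the completed run
			 * in the current discard block size.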
3046 */ 3047 set_discard_range(li); 3048 li->discard_begin = from_dblock(dblock); 3049 li->discard_end = li->discard_begin + 1ULL; 3050 } 3051 } else { 3052 set_discard_range(li); 3053 li->discard_begin = li->discard_end = 0; 3054 } 3055 3056 return 0; 3057 } 3058 3059 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3060 { 3061 sector_t size = get_dev_size(cache->cache_dev); 3062 (void) sector_div(size, cache->sectors_per_block); 3063 return to_cblock(size); 3064 } 3065 3066 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3067 { 3068 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3069 return true; 3070 3071 /* 3072 * We can't drop a dirty block when shrinking the cache. 3073 */ 3074 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3075 new_size = to_cblock(from_cblock(new_size) + 1); 3076 if (is_dirty(cache, new_size)) { 3077 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3078 cache_device_name(cache), 3079 (unsigned long long) from_cblock(new_size)); 3080 return false; 3081 } 3082 } 3083 3084 return true; 3085 } 3086 3087 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3088 { 3089 int r; 3090 3091 r = dm_cache_resize(cache->cmd, new_size); 3092 if (r) { 3093 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3094 metadata_operation_failed(cache, "dm_cache_resize", r); 3095 return r; 3096 } 3097 3098 set_cache_size(cache, new_size); 3099 3100 return 0; 3101 } 3102 3103 static int cache_preresume(struct dm_target *ti) 3104 { 3105 int r = 0; 3106 struct cache *cache = ti->private; 3107 dm_cblock_t csize = get_cache_dev_size(cache); 3108 3109 /* 3110 * Check to see if the cache has resized. 3111 */ 3112 if (!cache->sized) { 3113 r = resize_cache_dev(cache, csize); 3114 if (r) 3115 return r; 3116 3117 cache->sized = true; 3118 3119 } else if (csize != cache->cache_size) { 3120 if (!can_resize(cache, csize)) 3121 return -EINVAL; 3122 3123 r = resize_cache_dev(cache, csize); 3124 if (r) 3125 return r; 3126 } 3127 3128 if (!cache->loaded_mappings) { 3129 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3130 load_mapping, cache); 3131 if (r) { 3132 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3133 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3134 return r; 3135 } 3136 3137 cache->loaded_mappings = true; 3138 } 3139 3140 if (!cache->loaded_discards) { 3141 struct discard_load_info li; 3142 3143 /* 3144 * The discard bitset could have been resized, or the 3145 * discard block size changed. To be safe we start by 3146 * setting every dblock to not discarded. 
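		 * The discards read back by load_discard() are then re-applied
		 * in the current geometry via set_discard_range().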
3147 */ 3148 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3149 3150 discard_load_info_init(cache, &li); 3151 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3152 if (r) { 3153 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3154 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3155 return r; 3156 } 3157 set_discard_range(&li); 3158 3159 cache->loaded_discards = true; 3160 } 3161 3162 return r; 3163 } 3164 3165 static void cache_resume(struct dm_target *ti) 3166 { 3167 struct cache *cache = ti->private; 3168 3169 cache->need_tick_bio = true; 3170 allow_background_work(cache); 3171 do_waker(&cache->waker.work); 3172 } 3173 3174 /* 3175 * Status format: 3176 * 3177 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3178 * <cache block size> <#used cache blocks>/<#total cache blocks> 3179 * <#read hits> <#read misses> <#write hits> <#write misses> 3180 * <#demotions> <#promotions> <#dirty> 3181 * <#features> <features>* 3182 * <#core args> <core args> 3183 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3184 */ 3185 static void cache_status(struct dm_target *ti, status_type_t type, 3186 unsigned status_flags, char *result, unsigned maxlen) 3187 { 3188 int r = 0; 3189 unsigned i; 3190 ssize_t sz = 0; 3191 dm_block_t nr_free_blocks_metadata = 0; 3192 dm_block_t nr_blocks_metadata = 0; 3193 char buf[BDEVNAME_SIZE]; 3194 struct cache *cache = ti->private; 3195 dm_cblock_t residency; 3196 bool needs_check; 3197 3198 switch (type) { 3199 case STATUSTYPE_INFO: 3200 if (get_cache_mode(cache) == CM_FAIL) { 3201 DMEMIT("Fail"); 3202 break; 3203 } 3204 3205 /* Commit to ensure statistics aren't out-of-date */ 3206 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3207 (void) commit(cache, false); 3208 3209 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3210 if (r) { 3211 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3212 cache_device_name(cache), r); 3213 goto err; 3214 } 3215 3216 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3217 if (r) { 3218 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3219 cache_device_name(cache), r); 3220 goto err; 3221 } 3222 3223 residency = policy_residency(cache->policy); 3224 3225 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3226 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3227 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3228 (unsigned long long)nr_blocks_metadata, 3229 (unsigned long long)cache->sectors_per_block, 3230 (unsigned long long) from_cblock(residency), 3231 (unsigned long long) from_cblock(cache->cache_size), 3232 (unsigned) atomic_read(&cache->stats.read_hit), 3233 (unsigned) atomic_read(&cache->stats.read_miss), 3234 (unsigned) atomic_read(&cache->stats.write_hit), 3235 (unsigned) atomic_read(&cache->stats.write_miss), 3236 (unsigned) atomic_read(&cache->stats.demotion), 3237 (unsigned) atomic_read(&cache->stats.promotion), 3238 (unsigned long) atomic_read(&cache->nr_dirty)); 3239 3240 if (cache->features.metadata_version == 2) 3241 DMEMIT("2 metadata2 "); 3242 else 3243 DMEMIT("1 "); 3244 3245 if (writethrough_mode(&cache->features)) 3246 DMEMIT("writethrough "); 3247 3248 else if (passthrough_mode(&cache->features)) 3249 DMEMIT("passthrough "); 3250 3251 else if (writeback_mode(&cache->features)) 3252 DMEMIT("writeback "); 3253 3254 else { 3255 DMERR("%s: internal error: unknown io mode: 
%d", 3256 cache_device_name(cache), (int) cache->features.io_mode); 3257 goto err; 3258 } 3259 3260 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3261 3262 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3263 if (sz < maxlen) { 3264 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3265 if (r) 3266 DMERR("%s: policy_emit_config_values returned %d", 3267 cache_device_name(cache), r); 3268 } 3269 3270 if (get_cache_mode(cache) == CM_READ_ONLY) 3271 DMEMIT("ro "); 3272 else 3273 DMEMIT("rw "); 3274 3275 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3276 3277 if (r || needs_check) 3278 DMEMIT("needs_check "); 3279 else 3280 DMEMIT("- "); 3281 3282 break; 3283 3284 case STATUSTYPE_TABLE: 3285 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3286 DMEMIT("%s ", buf); 3287 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3288 DMEMIT("%s ", buf); 3289 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3290 DMEMIT("%s", buf); 3291 3292 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3293 DMEMIT(" %s", cache->ctr_args[i]); 3294 if (cache->nr_ctr_args) 3295 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3296 } 3297 3298 return; 3299 3300 err: 3301 DMEMIT("Error"); 3302 } 3303 3304 /* 3305 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3306 * the one-past-the-end value. 3307 */ 3308 struct cblock_range { 3309 dm_cblock_t begin; 3310 dm_cblock_t end; 3311 }; 3312 3313 /* 3314 * A cache block range can take two forms: 3315 * 3316 * i) A single cblock, eg. '3456' 3317 * ii) A begin and end cblock with a dash between, eg. 123-234 3318 */ 3319 static int parse_cblock_range(struct cache *cache, const char *str, 3320 struct cblock_range *result) 3321 { 3322 char dummy; 3323 uint64_t b, e; 3324 int r; 3325 3326 /* 3327 * Try and parse form (ii) first. 3328 */ 3329 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3330 if (r < 0) 3331 return r; 3332 3333 if (r == 2) { 3334 result->begin = to_cblock(b); 3335 result->end = to_cblock(e); 3336 return 0; 3337 } 3338 3339 /* 3340 * That didn't work, try form (i). 3341 */ 3342 r = sscanf(str, "%llu%c", &b, &dummy); 3343 if (r < 0) 3344 return r; 3345 3346 if (r == 1) { 3347 result->begin = to_cblock(b); 3348 result->end = to_cblock(from_cblock(result->begin) + 1u); 3349 return 0; 3350 } 3351 3352 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3353 return -EINVAL; 3354 } 3355 3356 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3357 { 3358 uint64_t b = from_cblock(range->begin); 3359 uint64_t e = from_cblock(range->end); 3360 uint64_t n = from_cblock(cache->cache_size); 3361 3362 if (b >= n) { 3363 DMERR("%s: begin cblock out of range: %llu >= %llu", 3364 cache_device_name(cache), b, n); 3365 return -EINVAL; 3366 } 3367 3368 if (e > n) { 3369 DMERR("%s: end cblock out of range: %llu > %llu", 3370 cache_device_name(cache), e, n); 3371 return -EINVAL; 3372 } 3373 3374 if (b >= e) { 3375 DMERR("%s: invalid cblock range: %llu >= %llu", 3376 cache_device_name(cache), b, e); 3377 return -EINVAL; 3378 } 3379 3380 return 0; 3381 } 3382 3383 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3384 { 3385 return to_cblock(from_cblock(b) + 1); 3386 } 3387 3388 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3389 { 3390 int r = 0; 3391 3392 /* 3393 * We don't need to do any locking here because we know we're in 3394 * passthrough mode. 
There is potential for a race between an
	 * invalidation triggered by an io and an invalidation message.  This
	 * is harmless; it doesn't matter if the policy call fails.
	 */
	while (range->begin != range->end) {
		r = invalidate_cblock(cache, range->begin);
		if (r)
			return r;

		range->begin = cblock_succ(range->begin);
	}

	cache->commit_requested = true;
	return r;
}

static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
					      const char **cblock_ranges)
{
	int r = 0;
	unsigned i;
	struct cblock_range range;

	if (!passthrough_mode(&cache->features)) {
		DMERR("%s: cache has to be in passthrough mode for invalidation",
		      cache_device_name(cache));
		return -EPERM;
	}

	for (i = 0; i < count; i++) {
		r = parse_cblock_range(cache, cblock_ranges[i], &range);
		if (r)
			break;

		r = validate_cblock_range(cache, &range);
		if (r)
			break;

		/*
		 * Invalidate each cblock in the validated range.
		 */
		r = request_invalidation(cache, &range);
		if (r)
			break;
	}

	return r;
}

/*
 * Supports
 *	"<key> <value>"
 * and
 *	"invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
 *
 * The key migration_threshold is supported by the cache target core.
 */
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct cache *cache = ti->private;

	if (!argc)
		return -EINVAL;

	if (get_cache_mode(cache) >= CM_READ_ONLY) {
		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
		      cache_device_name(cache));
		return -EOPNOTSUPP;
	}

	if (!strcasecmp(argv[0], "invalidate_cblocks"))
		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);

	if (argc != 2)
		return -EINVAL;

	return set_config_value(cache, argv[0], argv[1]);
}

static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
					    cache->origin_sectors);
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;
	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with the
	 * cache's block size (io_opt is a factor) do not override them.
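	 * E.g. with 64 sector cache blocks a stacked io_opt of 128 sectors is
	 * left alone, but an io_opt of 96 sectors would be overridden.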
3504 */ 3505 if (io_opt_sectors < cache->sectors_per_block || 3506 do_div(io_opt_sectors, cache->sectors_per_block)) { 3507 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3508 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3509 } 3510 set_discard_limits(cache, limits); 3511 } 3512 3513 /*----------------------------------------------------------------*/ 3514 3515 static struct target_type cache_target = { 3516 .name = "cache", 3517 .version = {2, 0, 0}, 3518 .module = THIS_MODULE, 3519 .ctr = cache_ctr, 3520 .dtr = cache_dtr, 3521 .map = cache_map, 3522 .end_io = cache_end_io, 3523 .postsuspend = cache_postsuspend, 3524 .preresume = cache_preresume, 3525 .resume = cache_resume, 3526 .status = cache_status, 3527 .message = cache_message, 3528 .iterate_devices = cache_iterate_devices, 3529 .io_hints = cache_io_hints, 3530 }; 3531 3532 static int __init dm_cache_init(void) 3533 { 3534 int r; 3535 3536 r = dm_register_target(&cache_target); 3537 if (r) { 3538 DMERR("cache target registration failed: %d", r); 3539 return r; 3540 } 3541 3542 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3543 if (!migration_cache) { 3544 dm_unregister_target(&cache_target); 3545 return -ENOMEM; 3546 } 3547 3548 return 0; 3549 } 3550 3551 static void __exit dm_cache_exit(void) 3552 { 3553 dm_unregister_target(&cache_target); 3554 kmem_cache_destroy(migration_cache); 3555 } 3556 3557 module_init(dm_cache_init); 3558 module_exit(dm_cache_exit); 3559 3560 MODULE_DESCRIPTION(DM_NAME " cache target"); 3561 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3562 MODULE_LICENSE("GPL"); 3563
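
/*
 * For reference, once a table of the form documented above the constructor
 * has been loaded, the message and status interfaces are typically driven
 * from userspace along these lines (the device name "cached-dev" is
 * illustrative):
 *
 *   dmsetup message cached-dev 0 migration_threshold 4096
 *   dmsetup message cached-dev 0 invalidate_cblocks 0-1024   (passthrough mode only)
 *   dmsetup status cached-dev
 */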