1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison-v2.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/rwsem.h> 19 #include <linux/slab.h> 20 #include <linux/vmalloc.h> 21 22 #define DM_MSG_PREFIX "cache" 23 24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27 /*----------------------------------------------------------------*/ 28 29 /* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40 /*----------------------------------------------------------------*/ 41 42 struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56 }; 57 58 static void iot_init(struct io_tracker *iot) 59 { 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64 } 65 66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67 { 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72 } 73 74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75 { 76 bool r; 77 78 spin_lock_irq(&iot->lock); 79 r = __iot_idle_for(iot, jifs); 80 spin_unlock_irq(&iot->lock); 81 82 return r; 83 } 84 85 static void iot_io_begin(struct io_tracker *iot, sector_t len) 86 { 87 spin_lock_irq(&iot->lock); 88 iot->in_flight += len; 89 spin_unlock_irq(&iot->lock); 90 } 91 92 static void __iot_io_end(struct io_tracker *iot, sector_t len) 93 { 94 if (!len) 95 return; 96 97 iot->in_flight -= len; 98 if (!iot->in_flight) 99 iot->idle_time = jiffies; 100 } 101 102 static void iot_io_end(struct io_tracker *iot, sector_t len) 103 { 104 unsigned long flags; 105 106 spin_lock_irqsave(&iot->lock, flags); 107 __iot_io_end(iot, len); 108 spin_unlock_irqrestore(&iot->lock, flags); 109 } 110 111 /*----------------------------------------------------------------*/ 112 113 /* 114 * Represents a chunk of future work. 'input' allows continuations to pass 115 * values between themselves, typically error values. 116 */ 117 struct continuation { 118 struct work_struct ws; 119 blk_status_t input; 120 }; 121 122 static inline void init_continuation(struct continuation *k, 123 void (*fn)(struct work_struct *)) 124 { 125 INIT_WORK(&k->ws, fn); 126 k->input = 0; 127 } 128 129 static inline void queue_continuation(struct workqueue_struct *wq, 130 struct continuation *k) 131 { 132 queue_work(wq, &k->ws); 133 } 134 135 /*----------------------------------------------------------------*/ 136 137 /* 138 * The batcher collects together pieces of work that need a particular 139 * operation to occur before they can proceed (typically a commit). 140 */ 141 struct batcher { 142 /* 143 * The operation that everyone is waiting for. 
144 */ 145 blk_status_t (*commit_op)(void *context); 146 void *commit_context; 147 148 /* 149 * This is how bios should be issued once the commit op is complete 150 * (accounted_request). 151 */ 152 void (*issue_op)(struct bio *bio, void *context); 153 void *issue_context; 154 155 /* 156 * Queued work gets put on here after commit. 157 */ 158 struct workqueue_struct *wq; 159 160 spinlock_t lock; 161 struct list_head work_items; 162 struct bio_list bios; 163 struct work_struct commit_work; 164 165 bool commit_scheduled; 166 }; 167 168 static void __commit(struct work_struct *_ws) 169 { 170 struct batcher *b = container_of(_ws, struct batcher, commit_work); 171 blk_status_t r; 172 struct list_head work_items; 173 struct work_struct *ws, *tmp; 174 struct continuation *k; 175 struct bio *bio; 176 struct bio_list bios; 177 178 INIT_LIST_HEAD(&work_items); 179 bio_list_init(&bios); 180 181 /* 182 * We have to grab these before the commit_op to avoid a race 183 * condition. 184 */ 185 spin_lock_irq(&b->lock); 186 list_splice_init(&b->work_items, &work_items); 187 bio_list_merge(&bios, &b->bios); 188 bio_list_init(&b->bios); 189 b->commit_scheduled = false; 190 spin_unlock_irq(&b->lock); 191 192 r = b->commit_op(b->commit_context); 193 194 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 195 k = container_of(ws, struct continuation, ws); 196 k->input = r; 197 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 198 queue_work(b->wq, ws); 199 } 200 201 while ((bio = bio_list_pop(&bios))) { 202 if (r) { 203 bio->bi_status = r; 204 bio_endio(bio); 205 } else 206 b->issue_op(bio, b->issue_context); 207 } 208 } 209 210 static void batcher_init(struct batcher *b, 211 blk_status_t (*commit_op)(void *), 212 void *commit_context, 213 void (*issue_op)(struct bio *bio, void *), 214 void *issue_context, 215 struct workqueue_struct *wq) 216 { 217 b->commit_op = commit_op; 218 b->commit_context = commit_context; 219 b->issue_op = issue_op; 220 b->issue_context = issue_context; 221 b->wq = wq; 222 223 spin_lock_init(&b->lock); 224 INIT_LIST_HEAD(&b->work_items); 225 bio_list_init(&b->bios); 226 INIT_WORK(&b->commit_work, __commit); 227 b->commit_scheduled = false; 228 } 229 230 static void async_commit(struct batcher *b) 231 { 232 queue_work(b->wq, &b->commit_work); 233 } 234 235 static void continue_after_commit(struct batcher *b, struct continuation *k) 236 { 237 bool commit_scheduled; 238 239 spin_lock_irq(&b->lock); 240 commit_scheduled = b->commit_scheduled; 241 list_add_tail(&k->ws.entry, &b->work_items); 242 spin_unlock_irq(&b->lock); 243 244 if (commit_scheduled) 245 async_commit(b); 246 } 247 248 /* 249 * Bios are errored if commit failed. 250 */ 251 static void issue_after_commit(struct batcher *b, struct bio *bio) 252 { 253 bool commit_scheduled; 254 255 spin_lock_irq(&b->lock); 256 commit_scheduled = b->commit_scheduled; 257 bio_list_add(&b->bios, bio); 258 spin_unlock_irq(&b->lock); 259 260 if (commit_scheduled) 261 async_commit(b); 262 } 263 264 /* 265 * Call this if some urgent work is waiting for the commit to complete. 266 */ 267 static void schedule_commit(struct batcher *b) 268 { 269 bool immediate; 270 271 spin_lock_irq(&b->lock); 272 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 273 b->commit_scheduled = true; 274 spin_unlock_irq(&b->lock); 275 276 if (immediate) 277 async_commit(b); 278 } 279 280 /* 281 * There are a couple of places where we let a bio run, but want to do some 282 * work before calling its endio function. 
We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned metadata_version;
	bool discard_passdown:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	spinlock_t lock;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	int sectors_per_block_shift;
	sector_t sectors_per_block;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io.  eg, promotion, writeback.
417 */ 418 atomic_t nr_io_migrations; 419 420 struct bio_list deferred_bios; 421 422 struct rw_semaphore quiesce_lock; 423 424 struct dm_target_callbacks callbacks; 425 426 /* 427 * origin_blocks entries, discarded if set. 428 */ 429 dm_dblock_t discard_nr_blocks; 430 unsigned long *discard_bitset; 431 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 432 433 /* 434 * Rather than reconstructing the table line for the status we just 435 * save it and regurgitate. 436 */ 437 unsigned nr_ctr_args; 438 const char **ctr_args; 439 440 struct dm_kcopyd_client *copier; 441 struct work_struct deferred_bio_worker; 442 struct work_struct migration_worker; 443 struct workqueue_struct *wq; 444 struct delayed_work waker; 445 struct dm_bio_prison_v2 *prison; 446 447 /* 448 * cache_size entries, dirty if set 449 */ 450 unsigned long *dirty_bitset; 451 atomic_t nr_dirty; 452 453 unsigned policy_nr_args; 454 struct dm_cache_policy *policy; 455 456 /* 457 * Cache features such as write-through. 458 */ 459 struct cache_features features; 460 461 struct cache_stats stats; 462 463 bool need_tick_bio:1; 464 bool sized:1; 465 bool invalidate:1; 466 bool commit_requested:1; 467 bool loaded_mappings:1; 468 bool loaded_discards:1; 469 470 struct rw_semaphore background_work_lock; 471 472 struct batcher committer; 473 struct work_struct commit_ws; 474 475 struct io_tracker tracker; 476 477 mempool_t migration_pool; 478 479 struct bio_set bs; 480 }; 481 482 struct per_bio_data { 483 bool tick:1; 484 unsigned req_nr:2; 485 struct dm_bio_prison_cell_v2 *cell; 486 struct dm_hook_info hook_info; 487 sector_t len; 488 }; 489 490 struct dm_cache_migration { 491 struct continuation k; 492 struct cache *cache; 493 494 struct policy_work *op; 495 struct bio *overwrite_bio; 496 struct dm_bio_prison_cell_v2 *cell; 497 498 dm_cblock_t invalidate_cblock; 499 dm_oblock_t invalidate_oblock; 500 }; 501 502 /*----------------------------------------------------------------*/ 503 504 static bool writethrough_mode(struct cache *cache) 505 { 506 return cache->features.io_mode == CM_IO_WRITETHROUGH; 507 } 508 509 static bool writeback_mode(struct cache *cache) 510 { 511 return cache->features.io_mode == CM_IO_WRITEBACK; 512 } 513 514 static inline bool passthrough_mode(struct cache *cache) 515 { 516 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 517 } 518 519 /*----------------------------------------------------------------*/ 520 521 static void wake_deferred_bio_worker(struct cache *cache) 522 { 523 queue_work(cache->wq, &cache->deferred_bio_worker); 524 } 525 526 static void wake_migration_worker(struct cache *cache) 527 { 528 if (passthrough_mode(cache)) 529 return; 530 531 queue_work(cache->wq, &cache->migration_worker); 532 } 533 534 /*----------------------------------------------------------------*/ 535 536 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 537 { 538 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 539 } 540 541 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 542 { 543 dm_bio_prison_free_cell_v2(cache->prison, cell); 544 } 545 546 static struct dm_cache_migration *alloc_migration(struct cache *cache) 547 { 548 struct dm_cache_migration *mg; 549 550 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 551 552 memset(mg, 0, sizeof(*mg)); 553 554 mg->cache = cache; 555 atomic_inc(&cache->nr_allocated_migrations); 556 557 return mg; 558 } 559 560 static void free_migration(struct dm_cache_migration *mg) 
561 { 562 struct cache *cache = mg->cache; 563 564 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 565 wake_up(&cache->migration_wait); 566 567 mempool_free(mg, &cache->migration_pool); 568 } 569 570 /*----------------------------------------------------------------*/ 571 572 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 573 { 574 return to_oblock(from_oblock(b) + 1ull); 575 } 576 577 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 578 { 579 key->virtual = 0; 580 key->dev = 0; 581 key->block_begin = from_oblock(begin); 582 key->block_end = from_oblock(end); 583 } 584 585 /* 586 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 587 * level 1 which prevents *both* READs and WRITEs. 588 */ 589 #define WRITE_LOCK_LEVEL 0 590 #define READ_WRITE_LOCK_LEVEL 1 591 592 static unsigned lock_level(struct bio *bio) 593 { 594 return bio_data_dir(bio) == WRITE ? 595 WRITE_LOCK_LEVEL : 596 READ_WRITE_LOCK_LEVEL; 597 } 598 599 /*---------------------------------------------------------------- 600 * Per bio data 601 *--------------------------------------------------------------*/ 602 603 static struct per_bio_data *get_per_bio_data(struct bio *bio) 604 { 605 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 606 BUG_ON(!pb); 607 return pb; 608 } 609 610 static struct per_bio_data *init_per_bio_data(struct bio *bio) 611 { 612 struct per_bio_data *pb = get_per_bio_data(bio); 613 614 pb->tick = false; 615 pb->req_nr = dm_bio_get_target_bio_nr(bio); 616 pb->cell = NULL; 617 pb->len = 0; 618 619 return pb; 620 } 621 622 /*----------------------------------------------------------------*/ 623 624 static void defer_bio(struct cache *cache, struct bio *bio) 625 { 626 spin_lock_irq(&cache->lock); 627 bio_list_add(&cache->deferred_bios, bio); 628 spin_unlock_irq(&cache->lock); 629 630 wake_deferred_bio_worker(cache); 631 } 632 633 static void defer_bios(struct cache *cache, struct bio_list *bios) 634 { 635 spin_lock_irq(&cache->lock); 636 bio_list_merge(&cache->deferred_bios, bios); 637 bio_list_init(bios); 638 spin_unlock_irq(&cache->lock); 639 640 wake_deferred_bio_worker(cache); 641 } 642 643 /*----------------------------------------------------------------*/ 644 645 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 646 { 647 bool r; 648 struct per_bio_data *pb; 649 struct dm_cell_key_v2 key; 650 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 651 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 652 653 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 654 655 build_key(oblock, end, &key); 656 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 657 if (!r) { 658 /* 659 * Failed to get the lock. 
660 */ 661 free_prison_cell(cache, cell_prealloc); 662 return r; 663 } 664 665 if (cell != cell_prealloc) 666 free_prison_cell(cache, cell_prealloc); 667 668 pb = get_per_bio_data(bio); 669 pb->cell = cell; 670 671 return r; 672 } 673 674 /*----------------------------------------------------------------*/ 675 676 static bool is_dirty(struct cache *cache, dm_cblock_t b) 677 { 678 return test_bit(from_cblock(b), cache->dirty_bitset); 679 } 680 681 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 682 { 683 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 684 atomic_inc(&cache->nr_dirty); 685 policy_set_dirty(cache->policy, cblock); 686 } 687 } 688 689 /* 690 * These two are called when setting after migrations to force the policy 691 * and dirty bitset to be in sync. 692 */ 693 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 694 { 695 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 696 atomic_inc(&cache->nr_dirty); 697 policy_set_dirty(cache->policy, cblock); 698 } 699 700 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 701 { 702 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 703 if (atomic_dec_return(&cache->nr_dirty) == 0) 704 dm_table_event(cache->ti->table); 705 } 706 707 policy_clear_dirty(cache->policy, cblock); 708 } 709 710 /*----------------------------------------------------------------*/ 711 712 static bool block_size_is_power_of_two(struct cache *cache) 713 { 714 return cache->sectors_per_block_shift >= 0; 715 } 716 717 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 718 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 719 __always_inline 720 #endif 721 static dm_block_t block_div(dm_block_t b, uint32_t n) 722 { 723 do_div(b, n); 724 725 return b; 726 } 727 728 static dm_block_t oblocks_per_dblock(struct cache *cache) 729 { 730 dm_block_t oblocks = cache->discard_block_size; 731 732 if (block_size_is_power_of_two(cache)) 733 oblocks >>= cache->sectors_per_block_shift; 734 else 735 oblocks = block_div(oblocks, cache->sectors_per_block); 736 737 return oblocks; 738 } 739 740 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 741 { 742 return to_dblock(block_div(from_oblock(oblock), 743 oblocks_per_dblock(cache))); 744 } 745 746 static void set_discard(struct cache *cache, dm_dblock_t b) 747 { 748 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 749 atomic_inc(&cache->stats.discard_count); 750 751 spin_lock_irq(&cache->lock); 752 set_bit(from_dblock(b), cache->discard_bitset); 753 spin_unlock_irq(&cache->lock); 754 } 755 756 static void clear_discard(struct cache *cache, dm_dblock_t b) 757 { 758 spin_lock_irq(&cache->lock); 759 clear_bit(from_dblock(b), cache->discard_bitset); 760 spin_unlock_irq(&cache->lock); 761 } 762 763 static bool is_discarded(struct cache *cache, dm_dblock_t b) 764 { 765 int r; 766 spin_lock_irq(&cache->lock); 767 r = test_bit(from_dblock(b), cache->discard_bitset); 768 spin_unlock_irq(&cache->lock); 769 770 return r; 771 } 772 773 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 774 { 775 int r; 776 spin_lock_irq(&cache->lock); 777 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 778 cache->discard_bitset); 779 spin_unlock_irq(&cache->lock); 780 781 return r; 782 } 783 784 /*---------------------------------------------------------------- 785 * Remapping 786 *--------------------------------------------------------------*/ 787 static void 
remap_to_origin(struct cache *cache, struct bio *bio) 788 { 789 bio_set_dev(bio, cache->origin_dev->bdev); 790 } 791 792 static void remap_to_cache(struct cache *cache, struct bio *bio, 793 dm_cblock_t cblock) 794 { 795 sector_t bi_sector = bio->bi_iter.bi_sector; 796 sector_t block = from_cblock(cblock); 797 798 bio_set_dev(bio, cache->cache_dev->bdev); 799 if (!block_size_is_power_of_two(cache)) 800 bio->bi_iter.bi_sector = 801 (block * cache->sectors_per_block) + 802 sector_div(bi_sector, cache->sectors_per_block); 803 else 804 bio->bi_iter.bi_sector = 805 (block << cache->sectors_per_block_shift) | 806 (bi_sector & (cache->sectors_per_block - 1)); 807 } 808 809 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 810 { 811 struct per_bio_data *pb; 812 813 spin_lock_irq(&cache->lock); 814 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 815 bio_op(bio) != REQ_OP_DISCARD) { 816 pb = get_per_bio_data(bio); 817 pb->tick = true; 818 cache->need_tick_bio = false; 819 } 820 spin_unlock_irq(&cache->lock); 821 } 822 823 static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 824 dm_oblock_t oblock, bool bio_has_pbd) 825 { 826 if (bio_has_pbd) 827 check_if_tick_bio_needed(cache, bio); 828 remap_to_origin(cache, bio); 829 if (bio_data_dir(bio) == WRITE) 830 clear_discard(cache, oblock_to_dblock(cache, oblock)); 831 } 832 833 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 834 dm_oblock_t oblock) 835 { 836 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 837 __remap_to_origin_clear_discard(cache, bio, oblock, true); 838 } 839 840 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 841 dm_oblock_t oblock, dm_cblock_t cblock) 842 { 843 check_if_tick_bio_needed(cache, bio); 844 remap_to_cache(cache, bio, cblock); 845 if (bio_data_dir(bio) == WRITE) { 846 set_dirty(cache, cblock); 847 clear_discard(cache, oblock_to_dblock(cache, oblock)); 848 } 849 } 850 851 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 852 { 853 sector_t block_nr = bio->bi_iter.bi_sector; 854 855 if (!block_size_is_power_of_two(cache)) 856 (void) sector_div(block_nr, cache->sectors_per_block); 857 else 858 block_nr >>= cache->sectors_per_block_shift; 859 860 return to_oblock(block_nr); 861 } 862 863 static bool accountable_bio(struct cache *cache, struct bio *bio) 864 { 865 return bio_op(bio) != REQ_OP_DISCARD; 866 } 867 868 static void accounted_begin(struct cache *cache, struct bio *bio) 869 { 870 struct per_bio_data *pb; 871 872 if (accountable_bio(cache, bio)) { 873 pb = get_per_bio_data(bio); 874 pb->len = bio_sectors(bio); 875 iot_io_begin(&cache->tracker, pb->len); 876 } 877 } 878 879 static void accounted_complete(struct cache *cache, struct bio *bio) 880 { 881 struct per_bio_data *pb = get_per_bio_data(bio); 882 883 iot_io_end(&cache->tracker, pb->len); 884 } 885 886 static void accounted_request(struct cache *cache, struct bio *bio) 887 { 888 accounted_begin(cache, bio); 889 generic_make_request(bio); 890 } 891 892 static void issue_op(struct bio *bio, void *context) 893 { 894 struct cache *cache = context; 895 accounted_request(cache, bio); 896 } 897 898 /* 899 * When running in writethrough mode we need to send writes to clean blocks 900 * to both the cache and origin devices. Clone the bio and send them in parallel. 
901 */ 902 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 903 dm_oblock_t oblock, dm_cblock_t cblock) 904 { 905 struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); 906 907 BUG_ON(!origin_bio); 908 909 bio_chain(origin_bio, bio); 910 /* 911 * Passing false to __remap_to_origin_clear_discard() skips 912 * all code that might use per_bio_data (since clone doesn't have it) 913 */ 914 __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); 915 submit_bio(origin_bio); 916 917 remap_to_cache(cache, bio, cblock); 918 } 919 920 /*---------------------------------------------------------------- 921 * Failure modes 922 *--------------------------------------------------------------*/ 923 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 924 { 925 return cache->features.mode; 926 } 927 928 static const char *cache_device_name(struct cache *cache) 929 { 930 return dm_device_name(dm_table_get_md(cache->ti->table)); 931 } 932 933 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 934 { 935 const char *descs[] = { 936 "write", 937 "read-only", 938 "fail" 939 }; 940 941 dm_table_event(cache->ti->table); 942 DMINFO("%s: switching cache to %s mode", 943 cache_device_name(cache), descs[(int)mode]); 944 } 945 946 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 947 { 948 bool needs_check; 949 enum cache_metadata_mode old_mode = get_cache_mode(cache); 950 951 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 952 DMERR("%s: unable to read needs_check flag, setting failure mode.", 953 cache_device_name(cache)); 954 new_mode = CM_FAIL; 955 } 956 957 if (new_mode == CM_WRITE && needs_check) { 958 DMERR("%s: unable to switch cache to write mode until repaired.", 959 cache_device_name(cache)); 960 if (old_mode != new_mode) 961 new_mode = old_mode; 962 else 963 new_mode = CM_READ_ONLY; 964 } 965 966 /* Never move out of fail mode */ 967 if (old_mode == CM_FAIL) 968 new_mode = CM_FAIL; 969 970 switch (new_mode) { 971 case CM_FAIL: 972 case CM_READ_ONLY: 973 dm_cache_metadata_set_read_only(cache->cmd); 974 break; 975 976 case CM_WRITE: 977 dm_cache_metadata_set_read_write(cache->cmd); 978 break; 979 } 980 981 cache->features.mode = new_mode; 982 983 if (new_mode != old_mode) 984 notify_mode_switch(cache, new_mode); 985 } 986 987 static void abort_transaction(struct cache *cache) 988 { 989 const char *dev_name = cache_device_name(cache); 990 991 if (get_cache_mode(cache) >= CM_READ_ONLY) 992 return; 993 994 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 995 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 996 set_cache_mode(cache, CM_FAIL); 997 } 998 999 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1000 if (dm_cache_metadata_abort(cache->cmd)) { 1001 DMERR("%s: failed to abort metadata transaction", dev_name); 1002 set_cache_mode(cache, CM_FAIL); 1003 } 1004 } 1005 1006 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1007 { 1008 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1009 cache_device_name(cache), op, r); 1010 abort_transaction(cache); 1011 set_cache_mode(cache, CM_READ_ONLY); 1012 } 1013 1014 /*----------------------------------------------------------------*/ 1015 1016 static void load_stats(struct cache *cache) 1017 { 1018 struct dm_cache_statistics stats; 1019 1020 dm_cache_metadata_get_stats(cache->cmd, &stats); 1021 atomic_set(&cache->stats.read_hit, 
stats.read_hits); 1022 atomic_set(&cache->stats.read_miss, stats.read_misses); 1023 atomic_set(&cache->stats.write_hit, stats.write_hits); 1024 atomic_set(&cache->stats.write_miss, stats.write_misses); 1025 } 1026 1027 static void save_stats(struct cache *cache) 1028 { 1029 struct dm_cache_statistics stats; 1030 1031 if (get_cache_mode(cache) >= CM_READ_ONLY) 1032 return; 1033 1034 stats.read_hits = atomic_read(&cache->stats.read_hit); 1035 stats.read_misses = atomic_read(&cache->stats.read_miss); 1036 stats.write_hits = atomic_read(&cache->stats.write_hit); 1037 stats.write_misses = atomic_read(&cache->stats.write_miss); 1038 1039 dm_cache_metadata_set_stats(cache->cmd, &stats); 1040 } 1041 1042 static void update_stats(struct cache_stats *stats, enum policy_operation op) 1043 { 1044 switch (op) { 1045 case POLICY_PROMOTE: 1046 atomic_inc(&stats->promotion); 1047 break; 1048 1049 case POLICY_DEMOTE: 1050 atomic_inc(&stats->demotion); 1051 break; 1052 1053 case POLICY_WRITEBACK: 1054 atomic_inc(&stats->writeback); 1055 break; 1056 } 1057 } 1058 1059 /*---------------------------------------------------------------- 1060 * Migration processing 1061 * 1062 * Migration covers moving data from the origin device to the cache, or 1063 * vice versa. 1064 *--------------------------------------------------------------*/ 1065 1066 static void inc_io_migrations(struct cache *cache) 1067 { 1068 atomic_inc(&cache->nr_io_migrations); 1069 } 1070 1071 static void dec_io_migrations(struct cache *cache) 1072 { 1073 atomic_dec(&cache->nr_io_migrations); 1074 } 1075 1076 static bool discard_or_flush(struct bio *bio) 1077 { 1078 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1079 } 1080 1081 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1082 dm_dblock_t *b, dm_dblock_t *e) 1083 { 1084 sector_t sb = bio->bi_iter.bi_sector; 1085 sector_t se = bio_end_sector(bio); 1086 1087 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1088 1089 if (se - sb < cache->discard_block_size) 1090 *e = *b; 1091 else 1092 *e = to_dblock(block_div(se, cache->discard_block_size)); 1093 } 1094 1095 /*----------------------------------------------------------------*/ 1096 1097 static void prevent_background_work(struct cache *cache) 1098 { 1099 lockdep_off(); 1100 down_write(&cache->background_work_lock); 1101 lockdep_on(); 1102 } 1103 1104 static void allow_background_work(struct cache *cache) 1105 { 1106 lockdep_off(); 1107 up_write(&cache->background_work_lock); 1108 lockdep_on(); 1109 } 1110 1111 static bool background_work_begin(struct cache *cache) 1112 { 1113 bool r; 1114 1115 lockdep_off(); 1116 r = down_read_trylock(&cache->background_work_lock); 1117 lockdep_on(); 1118 1119 return r; 1120 } 1121 1122 static void background_work_end(struct cache *cache) 1123 { 1124 lockdep_off(); 1125 up_read(&cache->background_work_lock); 1126 lockdep_on(); 1127 } 1128 1129 /*----------------------------------------------------------------*/ 1130 1131 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1132 { 1133 return (bio_data_dir(bio) == WRITE) && 1134 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1135 } 1136 1137 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1138 { 1139 return writeback_mode(cache) && 1140 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1141 } 1142 1143 static void quiesce(struct dm_cache_migration *mg, 1144 void (*continuation)(struct 
work_struct *)) 1145 { 1146 init_continuation(&mg->k, continuation); 1147 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1148 } 1149 1150 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1151 { 1152 struct continuation *k = container_of(ws, struct continuation, ws); 1153 return container_of(k, struct dm_cache_migration, k); 1154 } 1155 1156 static void copy_complete(int read_err, unsigned long write_err, void *context) 1157 { 1158 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1159 1160 if (read_err || write_err) 1161 mg->k.input = BLK_STS_IOERR; 1162 1163 queue_continuation(mg->cache->wq, &mg->k); 1164 } 1165 1166 static void copy(struct dm_cache_migration *mg, bool promote) 1167 { 1168 struct dm_io_region o_region, c_region; 1169 struct cache *cache = mg->cache; 1170 1171 o_region.bdev = cache->origin_dev->bdev; 1172 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1173 o_region.count = cache->sectors_per_block; 1174 1175 c_region.bdev = cache->cache_dev->bdev; 1176 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1177 c_region.count = cache->sectors_per_block; 1178 1179 if (promote) 1180 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1181 else 1182 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1183 } 1184 1185 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1186 { 1187 struct per_bio_data *pb = get_per_bio_data(bio); 1188 1189 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1190 free_prison_cell(cache, pb->cell); 1191 pb->cell = NULL; 1192 } 1193 1194 static void overwrite_endio(struct bio *bio) 1195 { 1196 struct dm_cache_migration *mg = bio->bi_private; 1197 struct cache *cache = mg->cache; 1198 struct per_bio_data *pb = get_per_bio_data(bio); 1199 1200 dm_unhook_bio(&pb->hook_info, bio); 1201 1202 if (bio->bi_status) 1203 mg->k.input = bio->bi_status; 1204 1205 queue_continuation(cache->wq, &mg->k); 1206 } 1207 1208 static void overwrite(struct dm_cache_migration *mg, 1209 void (*continuation)(struct work_struct *)) 1210 { 1211 struct bio *bio = mg->overwrite_bio; 1212 struct per_bio_data *pb = get_per_bio_data(bio); 1213 1214 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1215 1216 /* 1217 * The overwrite bio is part of the copy operation, as such it does 1218 * not set/clear discard or dirty flags. 
1219 */ 1220 if (mg->op->op == POLICY_PROMOTE) 1221 remap_to_cache(mg->cache, bio, mg->op->cblock); 1222 else 1223 remap_to_origin(mg->cache, bio); 1224 1225 init_continuation(&mg->k, continuation); 1226 accounted_request(mg->cache, bio); 1227 } 1228 1229 /* 1230 * Migration steps: 1231 * 1232 * 1) exclusive lock preventing WRITEs 1233 * 2) quiesce 1234 * 3) copy or issue overwrite bio 1235 * 4) upgrade to exclusive lock preventing READs and WRITEs 1236 * 5) quiesce 1237 * 6) update metadata and commit 1238 * 7) unlock 1239 */ 1240 static void mg_complete(struct dm_cache_migration *mg, bool success) 1241 { 1242 struct bio_list bios; 1243 struct cache *cache = mg->cache; 1244 struct policy_work *op = mg->op; 1245 dm_cblock_t cblock = op->cblock; 1246 1247 if (success) 1248 update_stats(&cache->stats, op->op); 1249 1250 switch (op->op) { 1251 case POLICY_PROMOTE: 1252 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1253 policy_complete_background_work(cache->policy, op, success); 1254 1255 if (mg->overwrite_bio) { 1256 if (success) 1257 force_set_dirty(cache, cblock); 1258 else if (mg->k.input) 1259 mg->overwrite_bio->bi_status = mg->k.input; 1260 else 1261 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1262 bio_endio(mg->overwrite_bio); 1263 } else { 1264 if (success) 1265 force_clear_dirty(cache, cblock); 1266 dec_io_migrations(cache); 1267 } 1268 break; 1269 1270 case POLICY_DEMOTE: 1271 /* 1272 * We clear dirty here to update the nr_dirty counter. 1273 */ 1274 if (success) 1275 force_clear_dirty(cache, cblock); 1276 policy_complete_background_work(cache->policy, op, success); 1277 dec_io_migrations(cache); 1278 break; 1279 1280 case POLICY_WRITEBACK: 1281 if (success) 1282 force_clear_dirty(cache, cblock); 1283 policy_complete_background_work(cache->policy, op, success); 1284 dec_io_migrations(cache); 1285 break; 1286 } 1287 1288 bio_list_init(&bios); 1289 if (mg->cell) { 1290 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1291 free_prison_cell(cache, mg->cell); 1292 } 1293 1294 free_migration(mg); 1295 defer_bios(cache, &bios); 1296 wake_migration_worker(cache); 1297 1298 background_work_end(cache); 1299 } 1300 1301 static void mg_success(struct work_struct *ws) 1302 { 1303 struct dm_cache_migration *mg = ws_to_mg(ws); 1304 mg_complete(mg, mg->k.input == 0); 1305 } 1306 1307 static void mg_update_metadata(struct work_struct *ws) 1308 { 1309 int r; 1310 struct dm_cache_migration *mg = ws_to_mg(ws); 1311 struct cache *cache = mg->cache; 1312 struct policy_work *op = mg->op; 1313 1314 switch (op->op) { 1315 case POLICY_PROMOTE: 1316 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1317 if (r) { 1318 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1319 cache_device_name(cache)); 1320 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1321 1322 mg_complete(mg, false); 1323 return; 1324 } 1325 mg_complete(mg, true); 1326 break; 1327 1328 case POLICY_DEMOTE: 1329 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1330 if (r) { 1331 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1332 cache_device_name(cache)); 1333 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1334 1335 mg_complete(mg, false); 1336 return; 1337 } 1338 1339 /* 1340 * It would be nice if we only had to commit when a REQ_FLUSH 1341 * comes through. 
But there's one scenario that we have to
		 * look out for:
		 *
		 * - vblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * roll back to having the data for vblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_full_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	bool is_policy_promote = (op->op == POLICY_PROMOTE);

	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
	    is_discarded_oblock(cache, op->oblock)) {
		mg_upgrade_lock(ws);
		return;
	}

	init_continuation(&mg->k, mg_upgrade_lock);
	copy(mg, is_policy_promote);
}

static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable.  So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fall back to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
			mg->overwrite_bio = NULL;
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
1455 */ 1456 overwrite(mg, mg_update_metadata_after_copy); 1457 1458 } else 1459 mg_full_copy(ws); 1460 } 1461 1462 static int mg_lock_writes(struct dm_cache_migration *mg) 1463 { 1464 int r; 1465 struct dm_cell_key_v2 key; 1466 struct cache *cache = mg->cache; 1467 struct dm_bio_prison_cell_v2 *prealloc; 1468 1469 prealloc = alloc_prison_cell(cache); 1470 1471 /* 1472 * Prevent writes to the block, but allow reads to continue. 1473 * Unless we're using an overwrite bio, in which case we lock 1474 * everything. 1475 */ 1476 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1477 r = dm_cell_lock_v2(cache->prison, &key, 1478 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1479 prealloc, &mg->cell); 1480 if (r < 0) { 1481 free_prison_cell(cache, prealloc); 1482 mg_complete(mg, false); 1483 return r; 1484 } 1485 1486 if (mg->cell != prealloc) 1487 free_prison_cell(cache, prealloc); 1488 1489 if (r == 0) 1490 mg_copy(&mg->k.ws); 1491 else 1492 quiesce(mg, mg_copy); 1493 1494 return 0; 1495 } 1496 1497 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1498 { 1499 struct dm_cache_migration *mg; 1500 1501 if (!background_work_begin(cache)) { 1502 policy_complete_background_work(cache->policy, op, false); 1503 return -EPERM; 1504 } 1505 1506 mg = alloc_migration(cache); 1507 1508 mg->op = op; 1509 mg->overwrite_bio = bio; 1510 1511 if (!bio) 1512 inc_io_migrations(cache); 1513 1514 return mg_lock_writes(mg); 1515 } 1516 1517 /*---------------------------------------------------------------- 1518 * invalidation processing 1519 *--------------------------------------------------------------*/ 1520 1521 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1522 { 1523 struct bio_list bios; 1524 struct cache *cache = mg->cache; 1525 1526 bio_list_init(&bios); 1527 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1528 free_prison_cell(cache, mg->cell); 1529 1530 if (!success && mg->overwrite_bio) 1531 bio_io_error(mg->overwrite_bio); 1532 1533 free_migration(mg); 1534 defer_bios(cache, &bios); 1535 1536 background_work_end(cache); 1537 } 1538 1539 static void invalidate_completed(struct work_struct *ws) 1540 { 1541 struct dm_cache_migration *mg = ws_to_mg(ws); 1542 invalidate_complete(mg, !mg->k.input); 1543 } 1544 1545 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1546 { 1547 int r = policy_invalidate_mapping(cache->policy, cblock); 1548 if (!r) { 1549 r = dm_cache_remove_mapping(cache->cmd, cblock); 1550 if (r) { 1551 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1552 cache_device_name(cache)); 1553 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1554 } 1555 1556 } else if (r == -ENODATA) { 1557 /* 1558 * Harmless, already unmapped. 
1559 */ 1560 r = 0; 1561 1562 } else 1563 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1564 1565 return r; 1566 } 1567 1568 static void invalidate_remove(struct work_struct *ws) 1569 { 1570 int r; 1571 struct dm_cache_migration *mg = ws_to_mg(ws); 1572 struct cache *cache = mg->cache; 1573 1574 r = invalidate_cblock(cache, mg->invalidate_cblock); 1575 if (r) { 1576 invalidate_complete(mg, false); 1577 return; 1578 } 1579 1580 init_continuation(&mg->k, invalidate_completed); 1581 continue_after_commit(&cache->committer, &mg->k); 1582 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1583 mg->overwrite_bio = NULL; 1584 schedule_commit(&cache->committer); 1585 } 1586 1587 static int invalidate_lock(struct dm_cache_migration *mg) 1588 { 1589 int r; 1590 struct dm_cell_key_v2 key; 1591 struct cache *cache = mg->cache; 1592 struct dm_bio_prison_cell_v2 *prealloc; 1593 1594 prealloc = alloc_prison_cell(cache); 1595 1596 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1597 r = dm_cell_lock_v2(cache->prison, &key, 1598 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1599 if (r < 0) { 1600 free_prison_cell(cache, prealloc); 1601 invalidate_complete(mg, false); 1602 return r; 1603 } 1604 1605 if (mg->cell != prealloc) 1606 free_prison_cell(cache, prealloc); 1607 1608 if (r) 1609 quiesce(mg, invalidate_remove); 1610 1611 else { 1612 /* 1613 * We can't call invalidate_remove() directly here because we 1614 * might still be in request context. 1615 */ 1616 init_continuation(&mg->k, invalidate_remove); 1617 queue_work(cache->wq, &mg->k.ws); 1618 } 1619 1620 return 0; 1621 } 1622 1623 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1624 dm_oblock_t oblock, struct bio *bio) 1625 { 1626 struct dm_cache_migration *mg; 1627 1628 if (!background_work_begin(cache)) 1629 return -EPERM; 1630 1631 mg = alloc_migration(cache); 1632 1633 mg->overwrite_bio = bio; 1634 mg->invalidate_cblock = cblock; 1635 mg->invalidate_oblock = oblock; 1636 1637 return invalidate_lock(mg); 1638 } 1639 1640 /*---------------------------------------------------------------- 1641 * bio processing 1642 *--------------------------------------------------------------*/ 1643 1644 enum busy { 1645 IDLE, 1646 BUSY 1647 }; 1648 1649 static enum busy spare_migration_bandwidth(struct cache *cache) 1650 { 1651 bool idle = iot_idle_for(&cache->tracker, HZ); 1652 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1653 cache->sectors_per_block; 1654 1655 if (idle && current_volume <= cache->migration_threshold) 1656 return IDLE; 1657 else 1658 return BUSY; 1659 } 1660 1661 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1662 { 1663 atomic_inc(bio_data_dir(bio) == READ ? 1664 &cache->stats.read_hit : &cache->stats.write_hit); 1665 } 1666 1667 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1668 { 1669 atomic_inc(bio_data_dir(bio) == READ ? 1670 &cache->stats.read_miss : &cache->stats.write_miss); 1671 } 1672 1673 /*----------------------------------------------------------------*/ 1674 1675 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1676 bool *commit_needed) 1677 { 1678 int r, data_dir; 1679 bool rb, background_queued; 1680 dm_cblock_t cblock; 1681 1682 *commit_needed = false; 1683 1684 rb = bio_detain_shared(cache, block, bio); 1685 if (!rb) { 1686 /* 1687 * An exclusive lock is held for this block, so we have to 1688 * wait. 
We set the commit_needed flag so the current 1689 * transaction will be committed asap, allowing this lock 1690 * to be dropped. 1691 */ 1692 *commit_needed = true; 1693 return DM_MAPIO_SUBMITTED; 1694 } 1695 1696 data_dir = bio_data_dir(bio); 1697 1698 if (optimisable_bio(cache, bio, block)) { 1699 struct policy_work *op = NULL; 1700 1701 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1702 if (unlikely(r && r != -ENOENT)) { 1703 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1704 cache_device_name(cache), r); 1705 bio_io_error(bio); 1706 return DM_MAPIO_SUBMITTED; 1707 } 1708 1709 if (r == -ENOENT && op) { 1710 bio_drop_shared_lock(cache, bio); 1711 BUG_ON(op->op != POLICY_PROMOTE); 1712 mg_start(cache, op, bio); 1713 return DM_MAPIO_SUBMITTED; 1714 } 1715 } else { 1716 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1717 if (unlikely(r && r != -ENOENT)) { 1718 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1719 cache_device_name(cache), r); 1720 bio_io_error(bio); 1721 return DM_MAPIO_SUBMITTED; 1722 } 1723 1724 if (background_queued) 1725 wake_migration_worker(cache); 1726 } 1727 1728 if (r == -ENOENT) { 1729 struct per_bio_data *pb = get_per_bio_data(bio); 1730 1731 /* 1732 * Miss. 1733 */ 1734 inc_miss_counter(cache, bio); 1735 if (pb->req_nr == 0) { 1736 accounted_begin(cache, bio); 1737 remap_to_origin_clear_discard(cache, bio, block); 1738 } else { 1739 /* 1740 * This is a duplicate writethrough io that is no 1741 * longer needed because the block has been demoted. 1742 */ 1743 bio_endio(bio); 1744 return DM_MAPIO_SUBMITTED; 1745 } 1746 } else { 1747 /* 1748 * Hit. 1749 */ 1750 inc_hit_counter(cache, bio); 1751 1752 /* 1753 * Passthrough always maps to the origin, invalidating any 1754 * cache blocks that are written to. 1755 */ 1756 if (passthrough_mode(cache)) { 1757 if (bio_data_dir(bio) == WRITE) { 1758 bio_drop_shared_lock(cache, bio); 1759 atomic_inc(&cache->stats.demotion); 1760 invalidate_start(cache, cblock, block, bio); 1761 } else 1762 remap_to_origin_clear_discard(cache, bio, block); 1763 } else { 1764 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1765 !is_dirty(cache, cblock)) { 1766 remap_to_origin_and_cache(cache, bio, block, cblock); 1767 accounted_begin(cache, bio); 1768 } else 1769 remap_to_cache_dirty(cache, bio, block, cblock); 1770 } 1771 } 1772 1773 /* 1774 * dm core turns FUA requests into a separate payload and FLUSH req. 1775 */ 1776 if (bio->bi_opf & REQ_FUA) { 1777 /* 1778 * issue_after_commit will call accounted_begin a second time. So 1779 * we call accounted_complete() to avoid double accounting. 1780 */ 1781 accounted_complete(cache, bio); 1782 issue_after_commit(&cache->committer, bio); 1783 *commit_needed = true; 1784 return DM_MAPIO_SUBMITTED; 1785 } 1786 1787 return DM_MAPIO_REMAPPED; 1788 } 1789 1790 static bool process_bio(struct cache *cache, struct bio *bio) 1791 { 1792 bool commit_needed; 1793 1794 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1795 generic_make_request(bio); 1796 1797 return commit_needed; 1798 } 1799 1800 /* 1801 * A non-zero return indicates read_only or fail_io mode. 
 */
static int commit(struct cache *cache, bool clean_shutdown)
{
	int r;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return -EINVAL;

	atomic_inc(&cache->stats.commit_count);
	r = dm_cache_commit(cache->cmd, clean_shutdown);
	if (r)
		metadata_operation_failed(cache, "dm_cache_commit", r);

	return r;
}

/*
 * Used by the batcher.
 */
static blk_status_t commit_op(void *context)
{
	struct cache *cache = context;

	if (dm_cache_changed_this_transaction(cache->cmd))
		return errno_to_blk_status(commit(cache, false));

	return 0;
}

/*----------------------------------------------------------------*/

static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue_after_commit(&cache->committer, bio);
	return true;
}

static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_dblock_t b, e;

	// FIXME: do we need to lock the region?  Or can we just assume the
	// user won't be so foolish as to issue discard concurrently with
	// other IO?
	calc_discard_block_range(cache, bio, &b, &e);
	while (b != e) {
		set_discard(cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	if (cache->features.discard_passdown) {
		remap_to_origin(cache, bio);
		generic_make_request(bio);
	} else
		bio_endio(bio);

	return false;
}

static void process_deferred_bios(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);

	bool commit_needed = false;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irq(&cache->lock);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irq(&cache->lock);

	while ((bio = bio_list_pop(&bios))) {
		if (bio->bi_opf & REQ_PREFLUSH)
			commit_needed = process_flush_bio(cache, bio) || commit_needed;

		else if (bio_op(bio) == REQ_OP_DISCARD)
			commit_needed = process_discard_bio(cache, bio) || commit_needed;

		else
			commit_needed = process_bio(cache, bio) || commit_needed;
	}

	if (commit_needed)
		schedule_commit(&cache->committer);
}

/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/

static void requeue_deferred_bios(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);

	while ((bio = bio_list_pop(&bios))) {
		bio->bi_status = BLK_STS_DM_REQUEUE;
		bio_endio(bio);
	}
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
1920 */ 1921 static void do_waker(struct work_struct *ws) 1922 { 1923 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1924 1925 policy_tick(cache->policy, true); 1926 wake_migration_worker(cache); 1927 schedule_commit(&cache->committer); 1928 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1929 } 1930 1931 static void check_migrations(struct work_struct *ws) 1932 { 1933 int r; 1934 struct policy_work *op; 1935 struct cache *cache = container_of(ws, struct cache, migration_worker); 1936 enum busy b; 1937 1938 for (;;) { 1939 b = spare_migration_bandwidth(cache); 1940 1941 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1942 if (r == -ENODATA) 1943 break; 1944 1945 if (r) { 1946 DMERR_LIMIT("%s: policy_background_work failed", 1947 cache_device_name(cache)); 1948 break; 1949 } 1950 1951 r = mg_start(cache, op, NULL); 1952 if (r) 1953 break; 1954 } 1955 } 1956 1957 /*---------------------------------------------------------------- 1958 * Target methods 1959 *--------------------------------------------------------------*/ 1960 1961 /* 1962 * This function gets called on the error paths of the constructor, so we 1963 * have to cope with a partially initialised struct. 1964 */ 1965 static void destroy(struct cache *cache) 1966 { 1967 unsigned i; 1968 1969 mempool_exit(&cache->migration_pool); 1970 1971 if (cache->prison) 1972 dm_bio_prison_destroy_v2(cache->prison); 1973 1974 if (cache->wq) 1975 destroy_workqueue(cache->wq); 1976 1977 if (cache->dirty_bitset) 1978 free_bitset(cache->dirty_bitset); 1979 1980 if (cache->discard_bitset) 1981 free_bitset(cache->discard_bitset); 1982 1983 if (cache->copier) 1984 dm_kcopyd_client_destroy(cache->copier); 1985 1986 if (cache->cmd) 1987 dm_cache_metadata_close(cache->cmd); 1988 1989 if (cache->metadata_dev) 1990 dm_put_device(cache->ti, cache->metadata_dev); 1991 1992 if (cache->origin_dev) 1993 dm_put_device(cache->ti, cache->origin_dev); 1994 1995 if (cache->cache_dev) 1996 dm_put_device(cache->ti, cache->cache_dev); 1997 1998 if (cache->policy) 1999 dm_cache_policy_destroy(cache->policy); 2000 2001 for (i = 0; i < cache->nr_ctr_args ; i++) 2002 kfree(cache->ctr_args[i]); 2003 kfree(cache->ctr_args); 2004 2005 bioset_exit(&cache->bs); 2006 2007 kfree(cache); 2008 } 2009 2010 static void cache_dtr(struct dm_target *ti) 2011 { 2012 struct cache *cache = ti->private; 2013 2014 destroy(cache); 2015 } 2016 2017 static sector_t get_dev_size(struct dm_dev *dev) 2018 { 2019 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2020 } 2021 2022 /*----------------------------------------------------------------*/ 2023 2024 /* 2025 * Construct a cache device mapping. 2026 * 2027 * cache <metadata dev> <cache dev> <origin dev> <block size> 2028 * <#feature args> [<feature arg>]* 2029 * <policy> <#policy args> [<policy arg>]* 2030 * 2031 * metadata dev : fast device holding the persistent metadata 2032 * cache dev : fast device holding cached data blocks 2033 * origin dev : slow device holding original data blocks 2034 * block size : cache unit size in sectors 2035 * 2036 * #feature args : number of feature arguments passed 2037 * feature args : writethrough. (The default is writeback.) 2038 * 2039 * policy : the replacement policy to use 2040 * #policy args : an even number of policy arguments corresponding 2041 * to key/value pairs passed to the policy 2042 * policy args : key/value pairs passed to the policy 2043 * E.g. 'sequential_threshold 1024' 2044 * See cache-policies.txt for details. 
 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *		     content from being different from origin block content.
 *		     Without this argument, the default behaviour is to write
 *		     back cache block contents later for performance reasons,
 *		     so they may differ from the corresponding origin blocks.
 */
struct cache_args {
	struct dm_target *ti;

	struct dm_dev *metadata_dev;

	struct dm_dev *cache_dev;
	sector_t cache_sectors;

	struct dm_dev *origin_dev;
	sector_t origin_sectors;

	uint32_t block_size;

	const char *policy_name;
	int policy_argc;
	const char **policy_argv;

	struct cache_features features;
};

static void destroy_cache_args(struct cache_args *ca)
{
	if (ca->metadata_dev)
		dm_put_device(ca->ti, ca->metadata_dev);

	if (ca->cache_dev)
		dm_put_device(ca->ti, ca->cache_dev);

	if (ca->origin_dev)
		dm_put_device(ca->ti, ca->origin_dev);

	kfree(ca);
}

static bool at_least_one_arg(struct dm_arg_set *as, char **error)
{
	if (!as->argc) {
		*error = "Insufficient args";
		return false;
	}

	return true;
}

static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
			      char **error)
{
	int r;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->metadata_dev);
	if (r) {
		*error = "Error opening metadata device";
		return r;
	}

	metadata_dev_size = get_dev_size(ca->metadata_dev);
	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS);

	return 0;
}

static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
			   char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->cache_dev);
	if (r) {
		*error = "Error opening cache device";
		return r;
	}
	ca->cache_sectors = get_dev_size(ca->cache_dev);

	return 0;
}

static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->origin_dev);
	if (r) {
		*error = "Error opening origin device";
		return r;
	}

	ca->origin_sectors = get_dev_size(ca->origin_dev);
	if (ca->ti->len > ca->origin_sectors) {
		*error = "Device size larger than cached device";
		return -EINVAL;
	}

	return 0;
}

static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	unsigned long block_size;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		*error = "Invalid data
block size"; 2178 return -EINVAL; 2179 } 2180 2181 if (block_size > ca->cache_sectors) { 2182 *error = "Data block size is larger than the cache device"; 2183 return -EINVAL; 2184 } 2185 2186 ca->block_size = block_size; 2187 2188 return 0; 2189 } 2190 2191 static void init_features(struct cache_features *cf) 2192 { 2193 cf->mode = CM_WRITE; 2194 cf->io_mode = CM_IO_WRITEBACK; 2195 cf->metadata_version = 1; 2196 cf->discard_passdown = true; 2197 } 2198 2199 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2200 char **error) 2201 { 2202 static const struct dm_arg _args[] = { 2203 {0, 3, "Invalid number of cache feature arguments"}, 2204 }; 2205 2206 int r, mode_ctr = 0; 2207 unsigned argc; 2208 const char *arg; 2209 struct cache_features *cf = &ca->features; 2210 2211 init_features(cf); 2212 2213 r = dm_read_arg_group(_args, as, &argc, error); 2214 if (r) 2215 return -EINVAL; 2216 2217 while (argc--) { 2218 arg = dm_shift_arg(as); 2219 2220 if (!strcasecmp(arg, "writeback")) { 2221 cf->io_mode = CM_IO_WRITEBACK; 2222 mode_ctr++; 2223 } 2224 2225 else if (!strcasecmp(arg, "writethrough")) { 2226 cf->io_mode = CM_IO_WRITETHROUGH; 2227 mode_ctr++; 2228 } 2229 2230 else if (!strcasecmp(arg, "passthrough")) { 2231 cf->io_mode = CM_IO_PASSTHROUGH; 2232 mode_ctr++; 2233 } 2234 2235 else if (!strcasecmp(arg, "metadata2")) 2236 cf->metadata_version = 2; 2237 2238 else if (!strcasecmp(arg, "no_discard_passdown")) 2239 cf->discard_passdown = false; 2240 2241 else { 2242 *error = "Unrecognised cache feature requested"; 2243 return -EINVAL; 2244 } 2245 } 2246 2247 if (mode_ctr > 1) { 2248 *error = "Duplicate cache io_mode features requested"; 2249 return -EINVAL; 2250 } 2251 2252 return 0; 2253 } 2254 2255 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2256 char **error) 2257 { 2258 static const struct dm_arg _args[] = { 2259 {0, 1024, "Invalid number of policy arguments"}, 2260 }; 2261 2262 int r; 2263 2264 if (!at_least_one_arg(as, error)) 2265 return -EINVAL; 2266 2267 ca->policy_name = dm_shift_arg(as); 2268 2269 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2270 if (r) 2271 return -EINVAL; 2272 2273 ca->policy_argv = (const char **)as->argv; 2274 dm_consume_args(as, ca->policy_argc); 2275 2276 return 0; 2277 } 2278 2279 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2280 char **error) 2281 { 2282 int r; 2283 struct dm_arg_set as; 2284 2285 as.argc = argc; 2286 as.argv = argv; 2287 2288 r = parse_metadata_dev(ca, &as, error); 2289 if (r) 2290 return r; 2291 2292 r = parse_cache_dev(ca, &as, error); 2293 if (r) 2294 return r; 2295 2296 r = parse_origin_dev(ca, &as, error); 2297 if (r) 2298 return r; 2299 2300 r = parse_block_size(ca, &as, error); 2301 if (r) 2302 return r; 2303 2304 r = parse_features(ca, &as, error); 2305 if (r) 2306 return r; 2307 2308 r = parse_policy(ca, &as, error); 2309 if (r) 2310 return r; 2311 2312 return 0; 2313 } 2314 2315 /*----------------------------------------------------------------*/ 2316 2317 static struct kmem_cache *migration_cache; 2318 2319 #define NOT_CORE_OPTION 1 2320 2321 static int process_config_option(struct cache *cache, const char *key, const char *value) 2322 { 2323 unsigned long tmp; 2324 2325 if (!strcasecmp(key, "migration_threshold")) { 2326 if (kstrtoul(value, 10, &tmp)) 2327 return -EINVAL; 2328 2329 cache->migration_threshold = tmp; 2330 return 0; 2331 } 2332 2333 return NOT_CORE_OPTION; 2334 } 2335 2336 static int set_config_value(struct cache *cache, const 
char *key, const char *value) 2337 { 2338 int r = process_config_option(cache, key, value); 2339 2340 if (r == NOT_CORE_OPTION) 2341 r = policy_set_config_value(cache->policy, key, value); 2342 2343 if (r) 2344 DMWARN("bad config value for %s: %s", key, value); 2345 2346 return r; 2347 } 2348 2349 static int set_config_values(struct cache *cache, int argc, const char **argv) 2350 { 2351 int r = 0; 2352 2353 if (argc & 1) { 2354 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2355 return -EINVAL; 2356 } 2357 2358 while (argc) { 2359 r = set_config_value(cache, argv[0], argv[1]); 2360 if (r) 2361 break; 2362 2363 argc -= 2; 2364 argv += 2; 2365 } 2366 2367 return r; 2368 } 2369 2370 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2371 char **error) 2372 { 2373 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2374 cache->cache_size, 2375 cache->origin_sectors, 2376 cache->sectors_per_block); 2377 if (IS_ERR(p)) { 2378 *error = "Error creating cache's policy"; 2379 return PTR_ERR(p); 2380 } 2381 cache->policy = p; 2382 BUG_ON(!cache->policy); 2383 2384 return 0; 2385 } 2386 2387 /* 2388 * We want the discard block size to be at least the size of the cache 2389 * block size and have no more than 2^14 discard blocks across the origin. 2390 */ 2391 #define MAX_DISCARD_BLOCKS (1 << 14) 2392 2393 static bool too_many_discard_blocks(sector_t discard_block_size, 2394 sector_t origin_size) 2395 { 2396 (void) sector_div(origin_size, discard_block_size); 2397 2398 return origin_size > MAX_DISCARD_BLOCKS; 2399 } 2400 2401 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2402 sector_t origin_size) 2403 { 2404 sector_t discard_block_size = cache_block_size; 2405 2406 if (origin_size) 2407 while (too_many_discard_blocks(discard_block_size, origin_size)) 2408 discard_block_size *= 2; 2409 2410 return discard_block_size; 2411 } 2412 2413 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2414 { 2415 dm_block_t nr_blocks = from_cblock(size); 2416 2417 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2418 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2419 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2420 "Please consider increasing the cache block size to reduce the overall cache block count.", 2421 (unsigned long long) nr_blocks); 2422 2423 cache->cache_size = size; 2424 } 2425 2426 static int is_congested(struct dm_dev *dev, int bdi_bits) 2427 { 2428 struct request_queue *q = bdev_get_queue(dev->bdev); 2429 return bdi_congested(q->backing_dev_info, bdi_bits); 2430 } 2431 2432 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2433 { 2434 struct cache *cache = container_of(cb, struct cache, callbacks); 2435 2436 return is_congested(cache->origin_dev, bdi_bits) || 2437 is_congested(cache->cache_dev, bdi_bits); 2438 } 2439 2440 #define DEFAULT_MIGRATION_THRESHOLD 2048 2441 2442 static int cache_create(struct cache_args *ca, struct cache **result) 2443 { 2444 int r = 0; 2445 char **error = &ca->ti->error; 2446 struct cache *cache; 2447 struct dm_target *ti = ca->ti; 2448 dm_block_t origin_blocks; 2449 struct dm_cache_metadata *cmd; 2450 bool may_format = ca->features.mode == CM_WRITE; 2451 2452 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2453 if (!cache) 2454 return -ENOMEM; 2455 2456 cache->ti = ca->ti; 2457 ti->private = cache; 2458 ti->num_flush_bios = 2; 2459 
ti->flush_supported = true; 2460 2461 ti->num_discard_bios = 1; 2462 ti->discards_supported = true; 2463 2464 ti->per_io_data_size = sizeof(struct per_bio_data); 2465 2466 cache->features = ca->features; 2467 if (writethrough_mode(cache)) { 2468 /* Create bioset for writethrough bios issued to origin */ 2469 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2470 if (r) 2471 goto bad; 2472 } 2473 2474 cache->callbacks.congested_fn = cache_is_congested; 2475 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2476 2477 cache->metadata_dev = ca->metadata_dev; 2478 cache->origin_dev = ca->origin_dev; 2479 cache->cache_dev = ca->cache_dev; 2480 2481 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2482 2483 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2484 origin_blocks = block_div(origin_blocks, ca->block_size); 2485 cache->origin_blocks = to_oblock(origin_blocks); 2486 2487 cache->sectors_per_block = ca->block_size; 2488 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2489 r = -EINVAL; 2490 goto bad; 2491 } 2492 2493 if (ca->block_size & (ca->block_size - 1)) { 2494 dm_block_t cache_size = ca->cache_sectors; 2495 2496 cache->sectors_per_block_shift = -1; 2497 cache_size = block_div(cache_size, ca->block_size); 2498 set_cache_size(cache, to_cblock(cache_size)); 2499 } else { 2500 cache->sectors_per_block_shift = __ffs(ca->block_size); 2501 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2502 } 2503 2504 r = create_cache_policy(cache, ca, error); 2505 if (r) 2506 goto bad; 2507 2508 cache->policy_nr_args = ca->policy_argc; 2509 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2510 2511 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2512 if (r) { 2513 *error = "Error setting cache policy's config values"; 2514 goto bad; 2515 } 2516 2517 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2518 ca->block_size, may_format, 2519 dm_cache_policy_get_hint_size(cache->policy), 2520 ca->features.metadata_version); 2521 if (IS_ERR(cmd)) { 2522 *error = "Error creating metadata object"; 2523 r = PTR_ERR(cmd); 2524 goto bad; 2525 } 2526 cache->cmd = cmd; 2527 set_cache_mode(cache, CM_WRITE); 2528 if (get_cache_mode(cache) != CM_WRITE) { 2529 *error = "Unable to get write access to metadata, please check/repair metadata."; 2530 r = -EINVAL; 2531 goto bad; 2532 } 2533 2534 if (passthrough_mode(cache)) { 2535 bool all_clean; 2536 2537 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2538 if (r) { 2539 *error = "dm_cache_metadata_all_clean() failed"; 2540 goto bad; 2541 } 2542 2543 if (!all_clean) { 2544 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2545 r = -EINVAL; 2546 goto bad; 2547 } 2548 2549 policy_allow_migrations(cache->policy, false); 2550 } 2551 2552 spin_lock_init(&cache->lock); 2553 bio_list_init(&cache->deferred_bios); 2554 atomic_set(&cache->nr_allocated_migrations, 0); 2555 atomic_set(&cache->nr_io_migrations, 0); 2556 init_waitqueue_head(&cache->migration_wait); 2557 2558 r = -ENOMEM; 2559 atomic_set(&cache->nr_dirty, 0); 2560 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2561 if (!cache->dirty_bitset) { 2562 *error = "could not allocate dirty bitset"; 2563 goto bad; 2564 } 2565 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2566 2567 cache->discard_block_size = 2568 calculate_discard_block_size(cache->sectors_per_block, 2569 cache->origin_sectors); 2570 cache->discard_nr_blocks = 
to_dblock(dm_sector_div_up(cache->origin_sectors, 2571 cache->discard_block_size)); 2572 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2573 if (!cache->discard_bitset) { 2574 *error = "could not allocate discard bitset"; 2575 goto bad; 2576 } 2577 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2578 2579 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2580 if (IS_ERR(cache->copier)) { 2581 *error = "could not create kcopyd client"; 2582 r = PTR_ERR(cache->copier); 2583 goto bad; 2584 } 2585 2586 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2587 if (!cache->wq) { 2588 *error = "could not create workqueue for metadata object"; 2589 goto bad; 2590 } 2591 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2592 INIT_WORK(&cache->migration_worker, check_migrations); 2593 INIT_DELAYED_WORK(&cache->waker, do_waker); 2594 2595 cache->prison = dm_bio_prison_create_v2(cache->wq); 2596 if (!cache->prison) { 2597 *error = "could not create bio prison"; 2598 goto bad; 2599 } 2600 2601 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2602 migration_cache); 2603 if (r) { 2604 *error = "Error creating cache's migration mempool"; 2605 goto bad; 2606 } 2607 2608 cache->need_tick_bio = true; 2609 cache->sized = false; 2610 cache->invalidate = false; 2611 cache->commit_requested = false; 2612 cache->loaded_mappings = false; 2613 cache->loaded_discards = false; 2614 2615 load_stats(cache); 2616 2617 atomic_set(&cache->stats.demotion, 0); 2618 atomic_set(&cache->stats.promotion, 0); 2619 atomic_set(&cache->stats.copies_avoided, 0); 2620 atomic_set(&cache->stats.cache_cell_clash, 0); 2621 atomic_set(&cache->stats.commit_count, 0); 2622 atomic_set(&cache->stats.discard_count, 0); 2623 2624 spin_lock_init(&cache->invalidation_lock); 2625 INIT_LIST_HEAD(&cache->invalidation_requests); 2626 2627 batcher_init(&cache->committer, commit_op, cache, 2628 issue_op, cache, cache->wq); 2629 iot_init(&cache->tracker); 2630 2631 init_rwsem(&cache->background_work_lock); 2632 prevent_background_work(cache); 2633 2634 *result = cache; 2635 return 0; 2636 bad: 2637 destroy(cache); 2638 return r; 2639 } 2640 2641 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2642 { 2643 unsigned i; 2644 const char **copy; 2645 2646 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2647 if (!copy) 2648 return -ENOMEM; 2649 for (i = 0; i < argc; i++) { 2650 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2651 if (!copy[i]) { 2652 while (i--) 2653 kfree(copy[i]); 2654 kfree(copy); 2655 return -ENOMEM; 2656 } 2657 } 2658 2659 cache->nr_ctr_args = argc; 2660 cache->ctr_args = copy; 2661 2662 return 0; 2663 } 2664 2665 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2666 { 2667 int r = -EINVAL; 2668 struct cache_args *ca; 2669 struct cache *cache = NULL; 2670 2671 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2672 if (!ca) { 2673 ti->error = "Error allocating memory for cache"; 2674 return -ENOMEM; 2675 } 2676 ca->ti = ti; 2677 2678 r = parse_cache_args(ca, argc, argv, &ti->error); 2679 if (r) 2680 goto out; 2681 2682 r = cache_create(ca, &cache); 2683 if (r) 2684 goto out; 2685 2686 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2687 if (r) { 2688 destroy(cache); 2689 goto out; 2690 } 2691 2692 ti->private = cache; 2693 out: 2694 destroy_cache_args(ca); 2695 return r; 2696 } 2697 2698 /*----------------------------------------------------------------*/ 2699 2700 static int 
cache_map(struct dm_target *ti, struct bio *bio) 2701 { 2702 struct cache *cache = ti->private; 2703 2704 int r; 2705 bool commit_needed; 2706 dm_oblock_t block = get_bio_block(cache, bio); 2707 2708 init_per_bio_data(bio); 2709 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2710 /* 2711 * This can only occur if the io goes to a partial block at 2712 * the end of the origin device. We don't cache these. 2713 * Just remap to the origin and carry on. 2714 */ 2715 remap_to_origin(cache, bio); 2716 accounted_begin(cache, bio); 2717 return DM_MAPIO_REMAPPED; 2718 } 2719 2720 if (discard_or_flush(bio)) { 2721 defer_bio(cache, bio); 2722 return DM_MAPIO_SUBMITTED; 2723 } 2724 2725 r = map_bio(cache, bio, block, &commit_needed); 2726 if (commit_needed) 2727 schedule_commit(&cache->committer); 2728 2729 return r; 2730 } 2731 2732 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2733 { 2734 struct cache *cache = ti->private; 2735 unsigned long flags; 2736 struct per_bio_data *pb = get_per_bio_data(bio); 2737 2738 if (pb->tick) { 2739 policy_tick(cache->policy, false); 2740 2741 spin_lock_irqsave(&cache->lock, flags); 2742 cache->need_tick_bio = true; 2743 spin_unlock_irqrestore(&cache->lock, flags); 2744 } 2745 2746 bio_drop_shared_lock(cache, bio); 2747 accounted_complete(cache, bio); 2748 2749 return DM_ENDIO_DONE; 2750 } 2751 2752 static int write_dirty_bitset(struct cache *cache) 2753 { 2754 int r; 2755 2756 if (get_cache_mode(cache) >= CM_READ_ONLY) 2757 return -EINVAL; 2758 2759 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2760 if (r) 2761 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2762 2763 return r; 2764 } 2765 2766 static int write_discard_bitset(struct cache *cache) 2767 { 2768 unsigned i, r; 2769 2770 if (get_cache_mode(cache) >= CM_READ_ONLY) 2771 return -EINVAL; 2772 2773 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2774 cache->discard_nr_blocks); 2775 if (r) { 2776 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2777 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2778 return r; 2779 } 2780 2781 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2782 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2783 is_discarded(cache, to_dblock(i))); 2784 if (r) { 2785 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2786 return r; 2787 } 2788 } 2789 2790 return 0; 2791 } 2792 2793 static int write_hints(struct cache *cache) 2794 { 2795 int r; 2796 2797 if (get_cache_mode(cache) >= CM_READ_ONLY) 2798 return -EINVAL; 2799 2800 r = dm_cache_write_hints(cache->cmd, cache->policy); 2801 if (r) { 2802 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2803 return r; 2804 } 2805 2806 return 0; 2807 } 2808 2809 /* 2810 * returns true on success 2811 */ 2812 static bool sync_metadata(struct cache *cache) 2813 { 2814 int r1, r2, r3, r4; 2815 2816 r1 = write_dirty_bitset(cache); 2817 if (r1) 2818 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2819 2820 r2 = write_discard_bitset(cache); 2821 if (r2) 2822 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2823 2824 save_stats(cache); 2825 2826 r3 = write_hints(cache); 2827 if (r3) 2828 DMERR("%s: could not write hints", cache_device_name(cache)); 2829 2830 /* 2831 * If writing the above metadata failed, we still commit, but don't 2832 * set the clean shutdown flag. 
This will effectively force every 2833 * dirty bit to be set on reload. 2834 */ 2835 r4 = commit(cache, !r1 && !r2 && !r3); 2836 if (r4) 2837 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2838 2839 return !r1 && !r2 && !r3 && !r4; 2840 } 2841 2842 static void cache_postsuspend(struct dm_target *ti) 2843 { 2844 struct cache *cache = ti->private; 2845 2846 prevent_background_work(cache); 2847 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2848 2849 cancel_delayed_work(&cache->waker); 2850 flush_workqueue(cache->wq); 2851 WARN_ON(cache->tracker.in_flight); 2852 2853 /* 2854 * If it's a flush suspend there won't be any deferred bios, so this 2855 * call is harmless. 2856 */ 2857 requeue_deferred_bios(cache); 2858 2859 if (get_cache_mode(cache) == CM_WRITE) 2860 (void) sync_metadata(cache); 2861 } 2862 2863 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2864 bool dirty, uint32_t hint, bool hint_valid) 2865 { 2866 int r; 2867 struct cache *cache = context; 2868 2869 if (dirty) { 2870 set_bit(from_cblock(cblock), cache->dirty_bitset); 2871 atomic_inc(&cache->nr_dirty); 2872 } else 2873 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2874 2875 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2876 if (r) 2877 return r; 2878 2879 return 0; 2880 } 2881 2882 /* 2883 * The discard block size in the on disk metadata is not 2884 * neccessarily the same as we're currently using. So we have to 2885 * be careful to only set the discarded attribute if we know it 2886 * covers a complete block of the new size. 2887 */ 2888 struct discard_load_info { 2889 struct cache *cache; 2890 2891 /* 2892 * These blocks are sized using the on disk dblock size, rather 2893 * than the current one. 2894 */ 2895 dm_block_t block_size; 2896 dm_block_t discard_begin, discard_end; 2897 }; 2898 2899 static void discard_load_info_init(struct cache *cache, 2900 struct discard_load_info *li) 2901 { 2902 li->cache = cache; 2903 li->discard_begin = li->discard_end = 0; 2904 } 2905 2906 static void set_discard_range(struct discard_load_info *li) 2907 { 2908 sector_t b, e; 2909 2910 if (li->discard_begin == li->discard_end) 2911 return; 2912 2913 /* 2914 * Convert to sectors. 2915 */ 2916 b = li->discard_begin * li->block_size; 2917 e = li->discard_end * li->block_size; 2918 2919 /* 2920 * Then convert back to the current dblock size. 2921 */ 2922 b = dm_sector_div_up(b, li->cache->discard_block_size); 2923 sector_div(e, li->cache->discard_block_size); 2924 2925 /* 2926 * The origin may have shrunk, so we need to check we're still in 2927 * bounds. 2928 */ 2929 if (e > from_dblock(li->cache->discard_nr_blocks)) 2930 e = from_dblock(li->cache->discard_nr_blocks); 2931 2932 for (; b < e; b++) 2933 set_discard(li->cache, to_dblock(b)); 2934 } 2935 2936 static int load_discard(void *context, sector_t discard_block_size, 2937 dm_dblock_t dblock, bool discard) 2938 { 2939 struct discard_load_info *li = context; 2940 2941 li->block_size = discard_block_size; 2942 2943 if (discard) { 2944 if (from_dblock(dblock) == li->discard_end) 2945 /* 2946 * We're already in a discard range, just extend it. 2947 */ 2948 li->discard_end = li->discard_end + 1ULL; 2949 2950 else { 2951 /* 2952 * Emit the old range and start a new one. 
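 * (Ranges are emitted lazily: e.g. if discarded dblocks 4, 5 and 6 are
 * delivered as three consecutive calls, the first starts the range
 * [4, 5) and the next two merely advance discard_end to 7, so the whole
 * run reaches set_discard_range() in one go.)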
2953 */ 2954 set_discard_range(li); 2955 li->discard_begin = from_dblock(dblock); 2956 li->discard_end = li->discard_begin + 1ULL; 2957 } 2958 } else { 2959 set_discard_range(li); 2960 li->discard_begin = li->discard_end = 0; 2961 } 2962 2963 return 0; 2964 } 2965 2966 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2967 { 2968 sector_t size = get_dev_size(cache->cache_dev); 2969 (void) sector_div(size, cache->sectors_per_block); 2970 return to_cblock(size); 2971 } 2972 2973 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2974 { 2975 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2976 if (cache->sized) { 2977 DMERR("%s: unable to extend cache due to missing cache table reload", 2978 cache_device_name(cache)); 2979 return false; 2980 } 2981 } 2982 2983 /* 2984 * We can't drop a dirty block when shrinking the cache. 2985 */ 2986 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 2987 new_size = to_cblock(from_cblock(new_size) + 1); 2988 if (is_dirty(cache, new_size)) { 2989 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 2990 cache_device_name(cache), 2991 (unsigned long long) from_cblock(new_size)); 2992 return false; 2993 } 2994 } 2995 2996 return true; 2997 } 2998 2999 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3000 { 3001 int r; 3002 3003 r = dm_cache_resize(cache->cmd, new_size); 3004 if (r) { 3005 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3006 metadata_operation_failed(cache, "dm_cache_resize", r); 3007 return r; 3008 } 3009 3010 set_cache_size(cache, new_size); 3011 3012 return 0; 3013 } 3014 3015 static int cache_preresume(struct dm_target *ti) 3016 { 3017 int r = 0; 3018 struct cache *cache = ti->private; 3019 dm_cblock_t csize = get_cache_dev_size(cache); 3020 3021 /* 3022 * Check to see if the cache has resized. 3023 */ 3024 if (!cache->sized) { 3025 r = resize_cache_dev(cache, csize); 3026 if (r) 3027 return r; 3028 3029 cache->sized = true; 3030 3031 } else if (csize != cache->cache_size) { 3032 if (!can_resize(cache, csize)) 3033 return -EINVAL; 3034 3035 r = resize_cache_dev(cache, csize); 3036 if (r) 3037 return r; 3038 } 3039 3040 if (!cache->loaded_mappings) { 3041 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3042 load_mapping, cache); 3043 if (r) { 3044 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3045 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3046 return r; 3047 } 3048 3049 cache->loaded_mappings = true; 3050 } 3051 3052 if (!cache->loaded_discards) { 3053 struct discard_load_info li; 3054 3055 /* 3056 * The discard bitset could have been resized, or the 3057 * discard block size changed. To be safe we start by 3058 * setting every dblock to not discarded. 
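 * The bitset is repopulated from the metadata by the
 * dm_cache_load_discards() call below; set_discard_range() then only
 * re-marks dblocks that are completely covered at the current discard
 * block size.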
3059 */ 3060 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3061 3062 discard_load_info_init(cache, &li); 3063 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3064 if (r) { 3065 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3066 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3067 return r; 3068 } 3069 set_discard_range(&li); 3070 3071 cache->loaded_discards = true; 3072 } 3073 3074 return r; 3075 } 3076 3077 static void cache_resume(struct dm_target *ti) 3078 { 3079 struct cache *cache = ti->private; 3080 3081 cache->need_tick_bio = true; 3082 allow_background_work(cache); 3083 do_waker(&cache->waker.work); 3084 } 3085 3086 static void emit_flags(struct cache *cache, char *result, 3087 unsigned maxlen, ssize_t *sz_ptr) 3088 { 3089 ssize_t sz = *sz_ptr; 3090 struct cache_features *cf = &cache->features; 3091 unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3092 3093 DMEMIT("%u ", count); 3094 3095 if (cf->metadata_version == 2) 3096 DMEMIT("metadata2 "); 3097 3098 if (writethrough_mode(cache)) 3099 DMEMIT("writethrough "); 3100 3101 else if (passthrough_mode(cache)) 3102 DMEMIT("passthrough "); 3103 3104 else if (writeback_mode(cache)) 3105 DMEMIT("writeback "); 3106 3107 else { 3108 DMEMIT("unknown "); 3109 DMERR("%s: internal error: unknown io mode: %d", 3110 cache_device_name(cache), (int) cf->io_mode); 3111 } 3112 3113 if (!cf->discard_passdown) 3114 DMEMIT("no_discard_passdown "); 3115 3116 *sz_ptr = sz; 3117 } 3118 3119 /* 3120 * Status format: 3121 * 3122 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3123 * <cache block size> <#used cache blocks>/<#total cache blocks> 3124 * <#read hits> <#read misses> <#write hits> <#write misses> 3125 * <#demotions> <#promotions> <#dirty> 3126 * <#features> <features>* 3127 * <#core args> <core args> 3128 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3129 */ 3130 static void cache_status(struct dm_target *ti, status_type_t type, 3131 unsigned status_flags, char *result, unsigned maxlen) 3132 { 3133 int r = 0; 3134 unsigned i; 3135 ssize_t sz = 0; 3136 dm_block_t nr_free_blocks_metadata = 0; 3137 dm_block_t nr_blocks_metadata = 0; 3138 char buf[BDEVNAME_SIZE]; 3139 struct cache *cache = ti->private; 3140 dm_cblock_t residency; 3141 bool needs_check; 3142 3143 switch (type) { 3144 case STATUSTYPE_INFO: 3145 if (get_cache_mode(cache) == CM_FAIL) { 3146 DMEMIT("Fail"); 3147 break; 3148 } 3149 3150 /* Commit to ensure statistics aren't out-of-date */ 3151 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3152 (void) commit(cache, false); 3153 3154 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3155 if (r) { 3156 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3157 cache_device_name(cache), r); 3158 goto err; 3159 } 3160 3161 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3162 if (r) { 3163 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3164 cache_device_name(cache), r); 3165 goto err; 3166 } 3167 3168 residency = policy_residency(cache->policy); 3169 3170 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3171 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3172 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3173 (unsigned long long)nr_blocks_metadata, 3174 (unsigned long long)cache->sectors_per_block, 3175 (unsigned long long) from_cblock(residency), 3176 (unsigned 
long long) from_cblock(cache->cache_size), 3177 (unsigned) atomic_read(&cache->stats.read_hit), 3178 (unsigned) atomic_read(&cache->stats.read_miss), 3179 (unsigned) atomic_read(&cache->stats.write_hit), 3180 (unsigned) atomic_read(&cache->stats.write_miss), 3181 (unsigned) atomic_read(&cache->stats.demotion), 3182 (unsigned) atomic_read(&cache->stats.promotion), 3183 (unsigned long) atomic_read(&cache->nr_dirty)); 3184 3185 emit_flags(cache, result, maxlen, &sz); 3186 3187 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3188 3189 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3190 if (sz < maxlen) { 3191 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3192 if (r) 3193 DMERR("%s: policy_emit_config_values returned %d", 3194 cache_device_name(cache), r); 3195 } 3196 3197 if (get_cache_mode(cache) == CM_READ_ONLY) 3198 DMEMIT("ro "); 3199 else 3200 DMEMIT("rw "); 3201 3202 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3203 3204 if (r || needs_check) 3205 DMEMIT("needs_check "); 3206 else 3207 DMEMIT("- "); 3208 3209 break; 3210 3211 case STATUSTYPE_TABLE: 3212 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3213 DMEMIT("%s ", buf); 3214 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3215 DMEMIT("%s ", buf); 3216 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3217 DMEMIT("%s", buf); 3218 3219 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3220 DMEMIT(" %s", cache->ctr_args[i]); 3221 if (cache->nr_ctr_args) 3222 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3223 } 3224 3225 return; 3226 3227 err: 3228 DMEMIT("Error"); 3229 } 3230 3231 /* 3232 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3233 * the one-past-the-end value. 3234 */ 3235 struct cblock_range { 3236 dm_cblock_t begin; 3237 dm_cblock_t end; 3238 }; 3239 3240 /* 3241 * A cache block range can take two forms: 3242 * 3243 * i) A single cblock, eg. '3456' 3244 * ii) A begin and end cblock with a dash between, eg. 123-234 3245 */ 3246 static int parse_cblock_range(struct cache *cache, const char *str, 3247 struct cblock_range *result) 3248 { 3249 char dummy; 3250 uint64_t b, e; 3251 int r; 3252 3253 /* 3254 * Try and parse form (ii) first. 3255 */ 3256 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3257 if (r < 0) 3258 return r; 3259 3260 if (r == 2) { 3261 result->begin = to_cblock(b); 3262 result->end = to_cblock(e); 3263 return 0; 3264 } 3265 3266 /* 3267 * That didn't work, try form (i). 
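 * A single cblock is treated as the half-open range [b, b + 1).  The
 * trailing %c only matches if there is junk after the number, in which
 * case r becomes 2 and we drop through to the error below.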
3268 */ 3269 r = sscanf(str, "%llu%c", &b, &dummy); 3270 if (r < 0) 3271 return r; 3272 3273 if (r == 1) { 3274 result->begin = to_cblock(b); 3275 result->end = to_cblock(from_cblock(result->begin) + 1u); 3276 return 0; 3277 } 3278 3279 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3280 return -EINVAL; 3281 } 3282 3283 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3284 { 3285 uint64_t b = from_cblock(range->begin); 3286 uint64_t e = from_cblock(range->end); 3287 uint64_t n = from_cblock(cache->cache_size); 3288 3289 if (b >= n) { 3290 DMERR("%s: begin cblock out of range: %llu >= %llu", 3291 cache_device_name(cache), b, n); 3292 return -EINVAL; 3293 } 3294 3295 if (e > n) { 3296 DMERR("%s: end cblock out of range: %llu > %llu", 3297 cache_device_name(cache), e, n); 3298 return -EINVAL; 3299 } 3300 3301 if (b >= e) { 3302 DMERR("%s: invalid cblock range: %llu >= %llu", 3303 cache_device_name(cache), b, e); 3304 return -EINVAL; 3305 } 3306 3307 return 0; 3308 } 3309 3310 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3311 { 3312 return to_cblock(from_cblock(b) + 1); 3313 } 3314 3315 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3316 { 3317 int r = 0; 3318 3319 /* 3320 * We don't need to do any locking here because we know we're in 3321 * passthrough mode. There's is potential for a race between an 3322 * invalidation triggered by an io and an invalidation message. This 3323 * is harmless, we must not worry if the policy call fails. 3324 */ 3325 while (range->begin != range->end) { 3326 r = invalidate_cblock(cache, range->begin); 3327 if (r) 3328 return r; 3329 3330 range->begin = cblock_succ(range->begin); 3331 } 3332 3333 cache->commit_requested = true; 3334 return r; 3335 } 3336 3337 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3338 const char **cblock_ranges) 3339 { 3340 int r = 0; 3341 unsigned i; 3342 struct cblock_range range; 3343 3344 if (!passthrough_mode(cache)) { 3345 DMERR("%s: cache has to be in passthrough mode for invalidation", 3346 cache_device_name(cache)); 3347 return -EPERM; 3348 } 3349 3350 for (i = 0; i < count; i++) { 3351 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3352 if (r) 3353 break; 3354 3355 r = validate_cblock_range(cache, &range); 3356 if (r) 3357 break; 3358 3359 /* 3360 * Pass begin and end origin blocks to the worker and wake it. 3361 */ 3362 r = request_invalidation(cache, &range); 3363 if (r) 3364 break; 3365 } 3366 3367 return r; 3368 } 3369 3370 /* 3371 * Supports 3372 * "<key> <value>" 3373 * and 3374 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3375 * 3376 * The key migration_threshold is supported by the cache target core. 
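 *
 * Illustrative example (device name and values are placeholders), sent
 * with dmsetup:
 *
 *   dmsetup message <cache-dev> 0 migration_threshold 4096
 *   dmsetup message <cache-dev> 0 invalidate_cblocks 3456 123-234
 *
 * invalidate_cblocks is only honoured while the cache is in passthrough
 * mode (see process_invalidate_cblocks_message()).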
3377 */ 3378 static int cache_message(struct dm_target *ti, unsigned argc, char **argv, 3379 char *result, unsigned maxlen) 3380 { 3381 struct cache *cache = ti->private; 3382 3383 if (!argc) 3384 return -EINVAL; 3385 3386 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3387 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3388 cache_device_name(cache)); 3389 return -EOPNOTSUPP; 3390 } 3391 3392 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3393 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3394 3395 if (argc != 2) 3396 return -EINVAL; 3397 3398 return set_config_value(cache, argv[0], argv[1]); 3399 } 3400 3401 static int cache_iterate_devices(struct dm_target *ti, 3402 iterate_devices_callout_fn fn, void *data) 3403 { 3404 int r = 0; 3405 struct cache *cache = ti->private; 3406 3407 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3408 if (!r) 3409 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3410 3411 return r; 3412 } 3413 3414 static bool origin_dev_supports_discard(struct block_device *origin_bdev) 3415 { 3416 struct request_queue *q = bdev_get_queue(origin_bdev); 3417 3418 return q && blk_queue_discard(q); 3419 } 3420 3421 /* 3422 * If discard_passdown was enabled verify that the origin device 3423 * supports discards. Disable discard_passdown if not. 3424 */ 3425 static void disable_passdown_if_not_supported(struct cache *cache) 3426 { 3427 struct block_device *origin_bdev = cache->origin_dev->bdev; 3428 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3429 const char *reason = NULL; 3430 char buf[BDEVNAME_SIZE]; 3431 3432 if (!cache->features.discard_passdown) 3433 return; 3434 3435 if (!origin_dev_supports_discard(origin_bdev)) 3436 reason = "discard unsupported"; 3437 3438 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3439 reason = "max discard sectors smaller than a block"; 3440 3441 if (reason) { 3442 DMWARN("Origin device (%s) %s: Disabling discard passdown.", 3443 bdevname(origin_bdev, buf), reason); 3444 cache->features.discard_passdown = false; 3445 } 3446 } 3447 3448 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3449 { 3450 struct block_device *origin_bdev = cache->origin_dev->bdev; 3451 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3452 3453 if (!cache->features.discard_passdown) { 3454 /* No passdown is done so setting own virtual limits */ 3455 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3456 cache->origin_sectors); 3457 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3458 return; 3459 } 3460 3461 /* 3462 * cache_iterate_devices() is stacking both origin and fast device limits 3463 * but discards aren't passed to fast device, so inherit origin's limits. 
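 * disable_passdown_if_not_supported(), called just before this in
 * cache_io_hints(), has already cleared discard_passdown if the origin
 * cannot discard at least one cache block, so the origin's limits can be
 * copied verbatim here.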
3464 */ 3465 limits->max_discard_sectors = origin_limits->max_discard_sectors; 3466 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3467 limits->discard_granularity = origin_limits->discard_granularity; 3468 limits->discard_alignment = origin_limits->discard_alignment; 3469 limits->discard_misaligned = origin_limits->discard_misaligned; 3470 } 3471 3472 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3473 { 3474 struct cache *cache = ti->private; 3475 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3476 3477 /* 3478 * If the system-determined stacked limits are compatible with the 3479 * cache's blocksize (io_opt is a factor) do not override them. 3480 */ 3481 if (io_opt_sectors < cache->sectors_per_block || 3482 do_div(io_opt_sectors, cache->sectors_per_block)) { 3483 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3484 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3485 } 3486 3487 disable_passdown_if_not_supported(cache); 3488 set_discard_limits(cache, limits); 3489 } 3490 3491 /*----------------------------------------------------------------*/ 3492 3493 static struct target_type cache_target = { 3494 .name = "cache", 3495 .version = {2, 1, 0}, 3496 .module = THIS_MODULE, 3497 .ctr = cache_ctr, 3498 .dtr = cache_dtr, 3499 .map = cache_map, 3500 .end_io = cache_end_io, 3501 .postsuspend = cache_postsuspend, 3502 .preresume = cache_preresume, 3503 .resume = cache_resume, 3504 .status = cache_status, 3505 .message = cache_message, 3506 .iterate_devices = cache_iterate_devices, 3507 .io_hints = cache_io_hints, 3508 }; 3509 3510 static int __init dm_cache_init(void) 3511 { 3512 int r; 3513 3514 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3515 if (!migration_cache) 3516 return -ENOMEM; 3517 3518 r = dm_register_target(&cache_target); 3519 if (r) { 3520 DMERR("cache target registration failed: %d", r); 3521 kmem_cache_destroy(migration_cache); 3522 return r; 3523 } 3524 3525 return 0; 3526 } 3527 3528 static void __exit dm_cache_exit(void) 3529 { 3530 dm_unregister_target(&cache_target); 3531 kmem_cache_destroy(migration_cache); 3532 } 3533 3534 module_init(dm_cache_init); 3535 module_exit(dm_cache_exit); 3536 3537 MODULE_DESCRIPTION(DM_NAME " cache target"); 3538 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3539 MODULE_LICENSE("GPL"); 3540
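/*
 * Illustrative sketch only (not built into the target): a user-space
 * rendering of the calculate_discard_block_size() logic above, showing
 * how the discard block size is doubled until the origin spans no more
 * than 2^14 discard blocks.  The kernel's sector_t/sector_div() helpers
 * are replaced with plain 64-bit arithmetic, and the names below are
 * made up for the example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint64_t example_discard_block_size(uint64_t cache_block_size,
					   uint64_t origin_size)
{
	uint64_t discard_block_size = cache_block_size;

	if (origin_size)
		while (origin_size / discard_block_size > (1 << 14))
			discard_block_size *= 2;

	return discard_block_size;
}

int main(void)
{
	/*
	 * 64-sector (32KiB) cache blocks on a 2TiB origin (2^32 sectors):
	 * prints 262144, i.e. 128MiB discard blocks.
	 */
	printf("%llu\n",
	       (unsigned long long)example_discard_block_size(64, 4294967296ULL));
	return 0;
}
#endif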