1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2012 Red Hat. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-bio-prison-v2.h" 10 #include "dm-bio-record.h" 11 #include "dm-cache-metadata.h" 12 #include "dm-io-tracker.h" 13 #include "dm-cache-background-tracker.h" 14 15 #include <linux/dm-io.h> 16 #include <linux/dm-kcopyd.h> 17 #include <linux/jiffies.h> 18 #include <linux/init.h> 19 #include <linux/mempool.h> 20 #include <linux/module.h> 21 #include <linux/rwsem.h> 22 #include <linux/slab.h> 23 #include <linux/vmalloc.h> 24 25 #define DM_MSG_PREFIX "cache" 26 27 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 28 "A percentage of time allocated for copying to and/or from cache"); 29 30 /*----------------------------------------------------------------*/ 31 32 /* 33 * Glossary: 34 * 35 * oblock: index of an origin block 36 * cblock: index of a cache block 37 * promotion: movement of a block from origin to cache 38 * demotion: movement of a block from cache to origin 39 * migration: movement of a block between the origin and cache device, 40 * either direction 41 */ 42 43 /*----------------------------------------------------------------*/ 44 45 /* 46 * Represents a chunk of future work. 'input' allows continuations to pass 47 * values between themselves, typically error values. 48 */ 49 struct continuation { 50 struct work_struct ws; 51 blk_status_t input; 52 }; 53 54 static inline void init_continuation(struct continuation *k, 55 void (*fn)(struct work_struct *)) 56 { 57 INIT_WORK(&k->ws, fn); 58 k->input = 0; 59 } 60 61 static inline void queue_continuation(struct workqueue_struct *wq, 62 struct continuation *k) 63 { 64 queue_work(wq, &k->ws); 65 } 66 67 /*----------------------------------------------------------------*/ 68 69 /* 70 * The batcher collects together pieces of work that need a particular 71 * operation to occur before they can proceed (typically a commit). 72 */ 73 struct batcher { 74 /* 75 * The operation that everyone is waiting for. 76 */ 77 blk_status_t (*commit_op)(void *context); 78 void *commit_context; 79 80 /* 81 * This is how bios should be issued once the commit op is complete 82 * (accounted_request). 83 */ 84 void (*issue_op)(struct bio *bio, void *context); 85 void *issue_context; 86 87 /* 88 * Queued work gets put on here after commit. 89 */ 90 struct workqueue_struct *wq; 91 92 spinlock_t lock; 93 struct list_head work_items; 94 struct bio_list bios; 95 struct work_struct commit_work; 96 97 bool commit_scheduled; 98 }; 99 100 static void __commit(struct work_struct *_ws) 101 { 102 struct batcher *b = container_of(_ws, struct batcher, commit_work); 103 blk_status_t r; 104 struct list_head work_items; 105 struct work_struct *ws, *tmp; 106 struct continuation *k; 107 struct bio *bio; 108 struct bio_list bios; 109 110 INIT_LIST_HEAD(&work_items); 111 bio_list_init(&bios); 112 113 /* 114 * We have to grab these before the commit_op to avoid a race 115 * condition. 
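* Otherwise work queued while commit_op runs could be completed as though it had been covered by that commit.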
116 */ 117 spin_lock_irq(&b->lock); 118 list_splice_init(&b->work_items, &work_items); 119 bio_list_merge(&bios, &b->bios); 120 bio_list_init(&b->bios); 121 b->commit_scheduled = false; 122 spin_unlock_irq(&b->lock); 123 124 r = b->commit_op(b->commit_context); 125 126 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 127 k = container_of(ws, struct continuation, ws); 128 k->input = r; 129 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 130 queue_work(b->wq, ws); 131 } 132 133 while ((bio = bio_list_pop(&bios))) { 134 if (r) { 135 bio->bi_status = r; 136 bio_endio(bio); 137 } else 138 b->issue_op(bio, b->issue_context); 139 } 140 } 141 142 static void batcher_init(struct batcher *b, 143 blk_status_t (*commit_op)(void *), 144 void *commit_context, 145 void (*issue_op)(struct bio *bio, void *), 146 void *issue_context, 147 struct workqueue_struct *wq) 148 { 149 b->commit_op = commit_op; 150 b->commit_context = commit_context; 151 b->issue_op = issue_op; 152 b->issue_context = issue_context; 153 b->wq = wq; 154 155 spin_lock_init(&b->lock); 156 INIT_LIST_HEAD(&b->work_items); 157 bio_list_init(&b->bios); 158 INIT_WORK(&b->commit_work, __commit); 159 b->commit_scheduled = false; 160 } 161 162 static void async_commit(struct batcher *b) 163 { 164 queue_work(b->wq, &b->commit_work); 165 } 166 167 static void continue_after_commit(struct batcher *b, struct continuation *k) 168 { 169 bool commit_scheduled; 170 171 spin_lock_irq(&b->lock); 172 commit_scheduled = b->commit_scheduled; 173 list_add_tail(&k->ws.entry, &b->work_items); 174 spin_unlock_irq(&b->lock); 175 176 if (commit_scheduled) 177 async_commit(b); 178 } 179 180 /* 181 * Bios are errored if commit failed. 182 */ 183 static void issue_after_commit(struct batcher *b, struct bio *bio) 184 { 185 bool commit_scheduled; 186 187 spin_lock_irq(&b->lock); 188 commit_scheduled = b->commit_scheduled; 189 bio_list_add(&b->bios, bio); 190 spin_unlock_irq(&b->lock); 191 192 if (commit_scheduled) 193 async_commit(b); 194 } 195 196 /* 197 * Call this if some urgent work is waiting for the commit to complete. 198 */ 199 static void schedule_commit(struct batcher *b) 200 { 201 bool immediate; 202 203 spin_lock_irq(&b->lock); 204 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 205 b->commit_scheduled = true; 206 spin_unlock_irq(&b->lock); 207 208 if (immediate) 209 async_commit(b); 210 } 211 212 /* 213 * There are a couple of places where we let a bio run, but want to do some 214 * work before calling its endio function. We do this by temporarily 215 * changing the endio fn. 216 */ 217 struct dm_hook_info { 218 bio_end_io_t *bi_end_io; 219 }; 220 221 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 222 bio_end_io_t *bi_end_io, void *bi_private) 223 { 224 h->bi_end_io = bio->bi_end_io; 225 226 bio->bi_end_io = bi_end_io; 227 bio->bi_private = bi_private; 228 } 229 230 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 231 { 232 bio->bi_end_io = h->bi_end_io; 233 } 234 235 /*----------------------------------------------------------------*/ 236 237 #define MIGRATION_POOL_SIZE 128 238 #define COMMIT_PERIOD HZ 239 #define MIGRATION_COUNT_WINDOW 10 240 241 /* 242 * The block size of the device holding cache data must be 243 * between 32KB and 1GB. 
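* (with 512-byte sectors those limits are 64 and 2097152 sectors, which is what the two macros below evaluate to)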
244 */ 245 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 246 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 247 248 enum cache_metadata_mode { 249 CM_WRITE, /* metadata may be changed */ 250 CM_READ_ONLY, /* metadata may not be changed */ 251 CM_FAIL 252 }; 253 254 enum cache_io_mode { 255 /* 256 * Data is written to cached blocks only. These blocks are marked 257 * dirty. If you lose the cache device you will lose data. 258 * Potential performance increase for both reads and writes. 259 */ 260 CM_IO_WRITEBACK, 261 262 /* 263 * Data is written to both cache and origin. Blocks are never 264 * dirty. Potential performance benefit for reads only. 265 */ 266 CM_IO_WRITETHROUGH, 267 268 /* 269 * A degraded mode useful for various cache coherency situations 270 * (eg, rolling back snapshots). Reads and writes always go to the 271 * origin. If a write goes to a cached oblock, then the cache 272 * block is invalidated. 273 */ 274 CM_IO_PASSTHROUGH 275 }; 276 277 struct cache_features { 278 enum cache_metadata_mode mode; 279 enum cache_io_mode io_mode; 280 unsigned int metadata_version; 281 bool discard_passdown:1; 282 }; 283 284 struct cache_stats { 285 atomic_t read_hit; 286 atomic_t read_miss; 287 atomic_t write_hit; 288 atomic_t write_miss; 289 atomic_t demotion; 290 atomic_t promotion; 291 atomic_t writeback; 292 atomic_t copies_avoided; 293 atomic_t cache_cell_clash; 294 atomic_t commit_count; 295 atomic_t discard_count; 296 }; 297 298 struct cache { 299 struct dm_target *ti; 300 spinlock_t lock; 301 302 /* 303 * Fields for converting from sectors to blocks. 304 */ 305 int sectors_per_block_shift; 306 sector_t sectors_per_block; 307 308 struct dm_cache_metadata *cmd; 309 310 /* 311 * Metadata is written to this device. 312 */ 313 struct dm_dev *metadata_dev; 314 315 /* 316 * The slower of the two data devices. Typically a spindle. 317 */ 318 struct dm_dev *origin_dev; 319 320 /* 321 * The faster of the two data devices. Typically an SSD. 322 */ 323 struct dm_dev *cache_dev; 324 325 /* 326 * Size of the origin device in _complete_ blocks and native sectors. 327 */ 328 dm_oblock_t origin_blocks; 329 sector_t origin_sectors; 330 331 /* 332 * Size of the cache device in blocks. 333 */ 334 dm_cblock_t cache_size; 335 336 /* 337 * Invalidation fields. 338 */ 339 spinlock_t invalidation_lock; 340 struct list_head invalidation_requests; 341 342 sector_t migration_threshold; 343 wait_queue_head_t migration_wait; 344 atomic_t nr_allocated_migrations; 345 346 /* 347 * The number of in flight migrations that are performing 348 * background io. eg, promotion, writeback. 349 */ 350 atomic_t nr_io_migrations; 351 352 struct bio_list deferred_bios; 353 354 struct rw_semaphore quiesce_lock; 355 356 /* 357 * origin_blocks entries, discarded if set. 358 */ 359 dm_dblock_t discard_nr_blocks; 360 unsigned long *discard_bitset; 361 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 362 363 /* 364 * Rather than reconstructing the table line for the status we just 365 * save it and regurgitate.
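* (the status callback simply emits these saved arguments verbatim)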
366 */ 367 unsigned int nr_ctr_args; 368 const char **ctr_args; 369 370 struct dm_kcopyd_client *copier; 371 struct work_struct deferred_bio_worker; 372 struct work_struct migration_worker; 373 struct workqueue_struct *wq; 374 struct delayed_work waker; 375 struct dm_bio_prison_v2 *prison; 376 377 /* 378 * cache_size entries, dirty if set 379 */ 380 unsigned long *dirty_bitset; 381 atomic_t nr_dirty; 382 383 unsigned int policy_nr_args; 384 struct dm_cache_policy *policy; 385 386 /* 387 * Cache features such as write-through. 388 */ 389 struct cache_features features; 390 391 struct cache_stats stats; 392 393 bool need_tick_bio:1; 394 bool sized:1; 395 bool invalidate:1; 396 bool commit_requested:1; 397 bool loaded_mappings:1; 398 bool loaded_discards:1; 399 400 struct rw_semaphore background_work_lock; 401 402 struct batcher committer; 403 struct work_struct commit_ws; 404 405 struct dm_io_tracker tracker; 406 407 mempool_t migration_pool; 408 409 struct bio_set bs; 410 }; 411 412 struct per_bio_data { 413 bool tick:1; 414 unsigned int req_nr:2; 415 struct dm_bio_prison_cell_v2 *cell; 416 struct dm_hook_info hook_info; 417 sector_t len; 418 }; 419 420 struct dm_cache_migration { 421 struct continuation k; 422 struct cache *cache; 423 424 struct policy_work *op; 425 struct bio *overwrite_bio; 426 struct dm_bio_prison_cell_v2 *cell; 427 428 dm_cblock_t invalidate_cblock; 429 dm_oblock_t invalidate_oblock; 430 }; 431 432 /*----------------------------------------------------------------*/ 433 434 static bool writethrough_mode(struct cache *cache) 435 { 436 return cache->features.io_mode == CM_IO_WRITETHROUGH; 437 } 438 439 static bool writeback_mode(struct cache *cache) 440 { 441 return cache->features.io_mode == CM_IO_WRITEBACK; 442 } 443 444 static inline bool passthrough_mode(struct cache *cache) 445 { 446 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 447 } 448 449 /*----------------------------------------------------------------*/ 450 451 static void wake_deferred_bio_worker(struct cache *cache) 452 { 453 queue_work(cache->wq, &cache->deferred_bio_worker); 454 } 455 456 static void wake_migration_worker(struct cache *cache) 457 { 458 if (passthrough_mode(cache)) 459 return; 460 461 queue_work(cache->wq, &cache->migration_worker); 462 } 463 464 /*----------------------------------------------------------------*/ 465 466 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 467 { 468 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 469 } 470 471 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 472 { 473 dm_bio_prison_free_cell_v2(cache->prison, cell); 474 } 475 476 static struct dm_cache_migration *alloc_migration(struct cache *cache) 477 { 478 struct dm_cache_migration *mg; 479 480 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 481 482 memset(mg, 0, sizeof(*mg)); 483 484 mg->cache = cache; 485 atomic_inc(&cache->nr_allocated_migrations); 486 487 return mg; 488 } 489 490 static void free_migration(struct dm_cache_migration *mg) 491 { 492 struct cache *cache = mg->cache; 493 494 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 495 wake_up(&cache->migration_wait); 496 497 mempool_free(mg, &cache->migration_pool); 498 } 499 500 /*----------------------------------------------------------------*/ 501 502 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 503 { 504 return to_oblock(from_oblock(b) + 1ull); 505 } 506 507 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct 
dm_cell_key_v2 *key) 508 { 509 key->virtual = 0; 510 key->dev = 0; 511 key->block_begin = from_oblock(begin); 512 key->block_end = from_oblock(end); 513 } 514 515 /* 516 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 517 * level 1 which prevents *both* READs and WRITEs. 518 */ 519 #define WRITE_LOCK_LEVEL 0 520 #define READ_WRITE_LOCK_LEVEL 1 521 522 static unsigned int lock_level(struct bio *bio) 523 { 524 return bio_data_dir(bio) == WRITE ? 525 WRITE_LOCK_LEVEL : 526 READ_WRITE_LOCK_LEVEL; 527 } 528 529 /* 530 *-------------------------------------------------------------- 531 * Per bio data 532 *-------------------------------------------------------------- 533 */ 534 535 static struct per_bio_data *get_per_bio_data(struct bio *bio) 536 { 537 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 538 539 BUG_ON(!pb); 540 return pb; 541 } 542 543 static struct per_bio_data *init_per_bio_data(struct bio *bio) 544 { 545 struct per_bio_data *pb = get_per_bio_data(bio); 546 547 pb->tick = false; 548 pb->req_nr = dm_bio_get_target_bio_nr(bio); 549 pb->cell = NULL; 550 pb->len = 0; 551 552 return pb; 553 } 554 555 /*----------------------------------------------------------------*/ 556 557 static void defer_bio(struct cache *cache, struct bio *bio) 558 { 559 spin_lock_irq(&cache->lock); 560 bio_list_add(&cache->deferred_bios, bio); 561 spin_unlock_irq(&cache->lock); 562 563 wake_deferred_bio_worker(cache); 564 } 565 566 static void defer_bios(struct cache *cache, struct bio_list *bios) 567 { 568 spin_lock_irq(&cache->lock); 569 bio_list_merge(&cache->deferred_bios, bios); 570 bio_list_init(bios); 571 spin_unlock_irq(&cache->lock); 572 573 wake_deferred_bio_worker(cache); 574 } 575 576 /*----------------------------------------------------------------*/ 577 578 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 579 { 580 bool r; 581 struct per_bio_data *pb; 582 struct dm_cell_key_v2 key; 583 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 584 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 585 586 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 587 588 build_key(oblock, end, &key); 589 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 590 if (!r) { 591 /* 592 * Failed to get the lock. 593 */ 594 free_prison_cell(cache, cell_prealloc); 595 return r; 596 } 597 598 if (cell != cell_prealloc) 599 free_prison_cell(cache, cell_prealloc); 600 601 pb = get_per_bio_data(bio); 602 pb->cell = cell; 603 604 return r; 605 } 606 607 /*----------------------------------------------------------------*/ 608 609 static bool is_dirty(struct cache *cache, dm_cblock_t b) 610 { 611 return test_bit(from_cblock(b), cache->dirty_bitset); 612 } 613 614 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 615 { 616 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 617 atomic_inc(&cache->nr_dirty); 618 policy_set_dirty(cache->policy, cblock); 619 } 620 } 621 622 /* 623 * These two are called when setting after migrations to force the policy 624 * and dirty bitset to be in sync. 
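* Unlike set_dirty() above, they always inform the policy, even if the bitset bit was already in the requested state.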
625 */ 626 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 627 { 628 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 629 atomic_inc(&cache->nr_dirty); 630 policy_set_dirty(cache->policy, cblock); 631 } 632 633 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 634 { 635 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 636 if (atomic_dec_return(&cache->nr_dirty) == 0) 637 dm_table_event(cache->ti->table); 638 } 639 640 policy_clear_dirty(cache->policy, cblock); 641 } 642 643 /*----------------------------------------------------------------*/ 644 645 static bool block_size_is_power_of_two(struct cache *cache) 646 { 647 return cache->sectors_per_block_shift >= 0; 648 } 649 650 static dm_block_t block_div(dm_block_t b, uint32_t n) 651 { 652 do_div(b, n); 653 654 return b; 655 } 656 657 static dm_block_t oblocks_per_dblock(struct cache *cache) 658 { 659 dm_block_t oblocks = cache->discard_block_size; 660 661 if (block_size_is_power_of_two(cache)) 662 oblocks >>= cache->sectors_per_block_shift; 663 else 664 oblocks = block_div(oblocks, cache->sectors_per_block); 665 666 return oblocks; 667 } 668 669 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 670 { 671 return to_dblock(block_div(from_oblock(oblock), 672 oblocks_per_dblock(cache))); 673 } 674 675 static void set_discard(struct cache *cache, dm_dblock_t b) 676 { 677 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 678 atomic_inc(&cache->stats.discard_count); 679 680 spin_lock_irq(&cache->lock); 681 set_bit(from_dblock(b), cache->discard_bitset); 682 spin_unlock_irq(&cache->lock); 683 } 684 685 static void clear_discard(struct cache *cache, dm_dblock_t b) 686 { 687 spin_lock_irq(&cache->lock); 688 clear_bit(from_dblock(b), cache->discard_bitset); 689 spin_unlock_irq(&cache->lock); 690 } 691 692 static bool is_discarded(struct cache *cache, dm_dblock_t b) 693 { 694 int r; 695 696 spin_lock_irq(&cache->lock); 697 r = test_bit(from_dblock(b), cache->discard_bitset); 698 spin_unlock_irq(&cache->lock); 699 700 return r; 701 } 702 703 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 704 { 705 int r; 706 707 spin_lock_irq(&cache->lock); 708 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 709 cache->discard_bitset); 710 spin_unlock_irq(&cache->lock); 711 712 return r; 713 } 714 715 /* 716 * ------------------------------------------------------------- 717 * Remapping 718 *-------------------------------------------------------------- 719 */ 720 static void remap_to_origin(struct cache *cache, struct bio *bio) 721 { 722 bio_set_dev(bio, cache->origin_dev->bdev); 723 } 724 725 static void remap_to_cache(struct cache *cache, struct bio *bio, 726 dm_cblock_t cblock) 727 { 728 sector_t bi_sector = bio->bi_iter.bi_sector; 729 sector_t block = from_cblock(cblock); 730 731 bio_set_dev(bio, cache->cache_dev->bdev); 732 if (!block_size_is_power_of_two(cache)) 733 bio->bi_iter.bi_sector = 734 (block * cache->sectors_per_block) + 735 sector_div(bi_sector, cache->sectors_per_block); 736 else 737 bio->bi_iter.bi_sector = 738 (block << cache->sectors_per_block_shift) | 739 (bi_sector & (cache->sectors_per_block - 1)); 740 } 741 742 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 743 { 744 struct per_bio_data *pb; 745 746 spin_lock_irq(&cache->lock); 747 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 748 bio_op(bio) != REQ_OP_DISCARD) { 749 pb = get_per_bio_data(bio); 750 pb->tick = 
true; 751 cache->need_tick_bio = false; 752 } 753 spin_unlock_irq(&cache->lock); 754 } 755 756 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 757 dm_oblock_t oblock) 758 { 759 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 760 check_if_tick_bio_needed(cache, bio); 761 remap_to_origin(cache, bio); 762 if (bio_data_dir(bio) == WRITE) 763 clear_discard(cache, oblock_to_dblock(cache, oblock)); 764 } 765 766 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 767 dm_oblock_t oblock, dm_cblock_t cblock) 768 { 769 check_if_tick_bio_needed(cache, bio); 770 remap_to_cache(cache, bio, cblock); 771 if (bio_data_dir(bio) == WRITE) { 772 set_dirty(cache, cblock); 773 clear_discard(cache, oblock_to_dblock(cache, oblock)); 774 } 775 } 776 777 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 778 { 779 sector_t block_nr = bio->bi_iter.bi_sector; 780 781 if (!block_size_is_power_of_two(cache)) 782 (void) sector_div(block_nr, cache->sectors_per_block); 783 else 784 block_nr >>= cache->sectors_per_block_shift; 785 786 return to_oblock(block_nr); 787 } 788 789 static bool accountable_bio(struct cache *cache, struct bio *bio) 790 { 791 return bio_op(bio) != REQ_OP_DISCARD; 792 } 793 794 static void accounted_begin(struct cache *cache, struct bio *bio) 795 { 796 struct per_bio_data *pb; 797 798 if (accountable_bio(cache, bio)) { 799 pb = get_per_bio_data(bio); 800 pb->len = bio_sectors(bio); 801 dm_iot_io_begin(&cache->tracker, pb->len); 802 } 803 } 804 805 static void accounted_complete(struct cache *cache, struct bio *bio) 806 { 807 struct per_bio_data *pb = get_per_bio_data(bio); 808 809 dm_iot_io_end(&cache->tracker, pb->len); 810 } 811 812 static void accounted_request(struct cache *cache, struct bio *bio) 813 { 814 accounted_begin(cache, bio); 815 dm_submit_bio_remap(bio, NULL); 816 } 817 818 static void issue_op(struct bio *bio, void *context) 819 { 820 struct cache *cache = context; 821 822 accounted_request(cache, bio); 823 } 824 825 /* 826 * When running in writethrough mode we need to send writes to clean blocks 827 * to both the cache and origin devices. Clone the bio and send them in parallel. 
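* The clone is chained to the original bio, so the original does not complete until the origin write has finished as well.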
828 */ 829 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 830 dm_oblock_t oblock, dm_cblock_t cblock) 831 { 832 struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio, 833 GFP_NOIO, &cache->bs); 834 835 BUG_ON(!origin_bio); 836 837 bio_chain(origin_bio, bio); 838 839 if (bio_data_dir(origin_bio) == WRITE) 840 clear_discard(cache, oblock_to_dblock(cache, oblock)); 841 submit_bio(origin_bio); 842 843 remap_to_cache(cache, bio, cblock); 844 } 845 846 /* 847 *-------------------------------------------------------------- 848 * Failure modes 849 *-------------------------------------------------------------- 850 */ 851 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 852 { 853 return cache->features.mode; 854 } 855 856 static const char *cache_device_name(struct cache *cache) 857 { 858 return dm_table_device_name(cache->ti->table); 859 } 860 861 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 862 { 863 static const char *descs[] = { 864 "write", 865 "read-only", 866 "fail" 867 }; 868 869 dm_table_event(cache->ti->table); 870 DMINFO("%s: switching cache to %s mode", 871 cache_device_name(cache), descs[(int)mode]); 872 } 873 874 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 875 { 876 bool needs_check; 877 enum cache_metadata_mode old_mode = get_cache_mode(cache); 878 879 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 880 DMERR("%s: unable to read needs_check flag, setting failure mode.", 881 cache_device_name(cache)); 882 new_mode = CM_FAIL; 883 } 884 885 if (new_mode == CM_WRITE && needs_check) { 886 DMERR("%s: unable to switch cache to write mode until repaired.", 887 cache_device_name(cache)); 888 if (old_mode != new_mode) 889 new_mode = old_mode; 890 else 891 new_mode = CM_READ_ONLY; 892 } 893 894 /* Never move out of fail mode */ 895 if (old_mode == CM_FAIL) 896 new_mode = CM_FAIL; 897 898 switch (new_mode) { 899 case CM_FAIL: 900 case CM_READ_ONLY: 901 dm_cache_metadata_set_read_only(cache->cmd); 902 break; 903 904 case CM_WRITE: 905 dm_cache_metadata_set_read_write(cache->cmd); 906 break; 907 } 908 909 cache->features.mode = new_mode; 910 911 if (new_mode != old_mode) 912 notify_mode_switch(cache, new_mode); 913 } 914 915 static void abort_transaction(struct cache *cache) 916 { 917 const char *dev_name = cache_device_name(cache); 918 919 if (get_cache_mode(cache) >= CM_READ_ONLY) 920 return; 921 922 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 923 if (dm_cache_metadata_abort(cache->cmd)) { 924 DMERR("%s: failed to abort metadata transaction", dev_name); 925 set_cache_mode(cache, CM_FAIL); 926 } 927 928 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 929 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 930 set_cache_mode(cache, CM_FAIL); 931 } 932 } 933 934 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 935 { 936 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 937 cache_device_name(cache), op, r); 938 abort_transaction(cache); 939 set_cache_mode(cache, CM_READ_ONLY); 940 } 941 942 /*----------------------------------------------------------------*/ 943 944 static void load_stats(struct cache *cache) 945 { 946 struct dm_cache_statistics stats; 947 948 dm_cache_metadata_get_stats(cache->cmd, &stats); 949 atomic_set(&cache->stats.read_hit, stats.read_hits); 950 atomic_set(&cache->stats.read_miss, stats.read_misses); 951 
atomic_set(&cache->stats.write_hit, stats.write_hits); 952 atomic_set(&cache->stats.write_miss, stats.write_misses); 953 } 954 955 static void save_stats(struct cache *cache) 956 { 957 struct dm_cache_statistics stats; 958 959 if (get_cache_mode(cache) >= CM_READ_ONLY) 960 return; 961 962 stats.read_hits = atomic_read(&cache->stats.read_hit); 963 stats.read_misses = atomic_read(&cache->stats.read_miss); 964 stats.write_hits = atomic_read(&cache->stats.write_hit); 965 stats.write_misses = atomic_read(&cache->stats.write_miss); 966 967 dm_cache_metadata_set_stats(cache->cmd, &stats); 968 } 969 970 static void update_stats(struct cache_stats *stats, enum policy_operation op) 971 { 972 switch (op) { 973 case POLICY_PROMOTE: 974 atomic_inc(&stats->promotion); 975 break; 976 977 case POLICY_DEMOTE: 978 atomic_inc(&stats->demotion); 979 break; 980 981 case POLICY_WRITEBACK: 982 atomic_inc(&stats->writeback); 983 break; 984 } 985 } 986 987 /* 988 *--------------------------------------------------------------------- 989 * Migration processing 990 * 991 * Migration covers moving data from the origin device to the cache, or 992 * vice versa. 993 *--------------------------------------------------------------------- 994 */ 995 static void inc_io_migrations(struct cache *cache) 996 { 997 atomic_inc(&cache->nr_io_migrations); 998 } 999 1000 static void dec_io_migrations(struct cache *cache) 1001 { 1002 atomic_dec(&cache->nr_io_migrations); 1003 } 1004 1005 static bool discard_or_flush(struct bio *bio) 1006 { 1007 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1008 } 1009 1010 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1011 dm_dblock_t *b, dm_dblock_t *e) 1012 { 1013 sector_t sb = bio->bi_iter.bi_sector; 1014 sector_t se = bio_end_sector(bio); 1015 1016 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1017 1018 if (se - sb < cache->discard_block_size) 1019 *e = *b; 1020 else 1021 *e = to_dblock(block_div(se, cache->discard_block_size)); 1022 } 1023 1024 /*----------------------------------------------------------------*/ 1025 1026 static void prevent_background_work(struct cache *cache) 1027 { 1028 lockdep_off(); 1029 down_write(&cache->background_work_lock); 1030 lockdep_on(); 1031 } 1032 1033 static void allow_background_work(struct cache *cache) 1034 { 1035 lockdep_off(); 1036 up_write(&cache->background_work_lock); 1037 lockdep_on(); 1038 } 1039 1040 static bool background_work_begin(struct cache *cache) 1041 { 1042 bool r; 1043 1044 lockdep_off(); 1045 r = down_read_trylock(&cache->background_work_lock); 1046 lockdep_on(); 1047 1048 return r; 1049 } 1050 1051 static void background_work_end(struct cache *cache) 1052 { 1053 lockdep_off(); 1054 up_read(&cache->background_work_lock); 1055 lockdep_on(); 1056 } 1057 1058 /*----------------------------------------------------------------*/ 1059 1060 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1061 { 1062 return (bio_data_dir(bio) == WRITE) && 1063 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1064 } 1065 1066 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1067 { 1068 return writeback_mode(cache) && 1069 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1070 } 1071 1072 static void quiesce(struct dm_cache_migration *mg, 1073 void (*continuation)(struct work_struct *)) 1074 { 1075 init_continuation(&mg->k, continuation); 1076 dm_cell_quiesce_v2(mg->cache->prison, 
mg->cell, &mg->k.ws); 1077 } 1078 1079 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1080 { 1081 struct continuation *k = container_of(ws, struct continuation, ws); 1082 1083 return container_of(k, struct dm_cache_migration, k); 1084 } 1085 1086 static void copy_complete(int read_err, unsigned long write_err, void *context) 1087 { 1088 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1089 1090 if (read_err || write_err) 1091 mg->k.input = BLK_STS_IOERR; 1092 1093 queue_continuation(mg->cache->wq, &mg->k); 1094 } 1095 1096 static void copy(struct dm_cache_migration *mg, bool promote) 1097 { 1098 struct dm_io_region o_region, c_region; 1099 struct cache *cache = mg->cache; 1100 1101 o_region.bdev = cache->origin_dev->bdev; 1102 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1103 o_region.count = cache->sectors_per_block; 1104 1105 c_region.bdev = cache->cache_dev->bdev; 1106 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1107 c_region.count = cache->sectors_per_block; 1108 1109 if (promote) 1110 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1111 else 1112 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1113 } 1114 1115 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1116 { 1117 struct per_bio_data *pb = get_per_bio_data(bio); 1118 1119 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1120 free_prison_cell(cache, pb->cell); 1121 pb->cell = NULL; 1122 } 1123 1124 static void overwrite_endio(struct bio *bio) 1125 { 1126 struct dm_cache_migration *mg = bio->bi_private; 1127 struct cache *cache = mg->cache; 1128 struct per_bio_data *pb = get_per_bio_data(bio); 1129 1130 dm_unhook_bio(&pb->hook_info, bio); 1131 1132 if (bio->bi_status) 1133 mg->k.input = bio->bi_status; 1134 1135 queue_continuation(cache->wq, &mg->k); 1136 } 1137 1138 static void overwrite(struct dm_cache_migration *mg, 1139 void (*continuation)(struct work_struct *)) 1140 { 1141 struct bio *bio = mg->overwrite_bio; 1142 struct per_bio_data *pb = get_per_bio_data(bio); 1143 1144 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1145 1146 /* 1147 * The overwrite bio is part of the copy operation, as such it does 1148 * not set/clear discard or dirty flags. 
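* mg_complete() brings the dirty and discard state back in sync once the whole migration has finished.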
1149 */ 1150 if (mg->op->op == POLICY_PROMOTE) 1151 remap_to_cache(mg->cache, bio, mg->op->cblock); 1152 else 1153 remap_to_origin(mg->cache, bio); 1154 1155 init_continuation(&mg->k, continuation); 1156 accounted_request(mg->cache, bio); 1157 } 1158 1159 /* 1160 * Migration steps: 1161 * 1162 * 1) exclusive lock preventing WRITEs 1163 * 2) quiesce 1164 * 3) copy or issue overwrite bio 1165 * 4) upgrade to exclusive lock preventing READs and WRITEs 1166 * 5) quiesce 1167 * 6) update metadata and commit 1168 * 7) unlock 1169 */ 1170 static void mg_complete(struct dm_cache_migration *mg, bool success) 1171 { 1172 struct bio_list bios; 1173 struct cache *cache = mg->cache; 1174 struct policy_work *op = mg->op; 1175 dm_cblock_t cblock = op->cblock; 1176 1177 if (success) 1178 update_stats(&cache->stats, op->op); 1179 1180 switch (op->op) { 1181 case POLICY_PROMOTE: 1182 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1183 policy_complete_background_work(cache->policy, op, success); 1184 1185 if (mg->overwrite_bio) { 1186 if (success) 1187 force_set_dirty(cache, cblock); 1188 else if (mg->k.input) 1189 mg->overwrite_bio->bi_status = mg->k.input; 1190 else 1191 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1192 bio_endio(mg->overwrite_bio); 1193 } else { 1194 if (success) 1195 force_clear_dirty(cache, cblock); 1196 dec_io_migrations(cache); 1197 } 1198 break; 1199 1200 case POLICY_DEMOTE: 1201 /* 1202 * We clear dirty here to update the nr_dirty counter. 1203 */ 1204 if (success) 1205 force_clear_dirty(cache, cblock); 1206 policy_complete_background_work(cache->policy, op, success); 1207 dec_io_migrations(cache); 1208 break; 1209 1210 case POLICY_WRITEBACK: 1211 if (success) 1212 force_clear_dirty(cache, cblock); 1213 policy_complete_background_work(cache->policy, op, success); 1214 dec_io_migrations(cache); 1215 break; 1216 } 1217 1218 bio_list_init(&bios); 1219 if (mg->cell) { 1220 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1221 free_prison_cell(cache, mg->cell); 1222 } 1223 1224 free_migration(mg); 1225 defer_bios(cache, &bios); 1226 wake_migration_worker(cache); 1227 1228 background_work_end(cache); 1229 } 1230 1231 static void mg_success(struct work_struct *ws) 1232 { 1233 struct dm_cache_migration *mg = ws_to_mg(ws); 1234 1235 mg_complete(mg, mg->k.input == 0); 1236 } 1237 1238 static void mg_update_metadata(struct work_struct *ws) 1239 { 1240 int r; 1241 struct dm_cache_migration *mg = ws_to_mg(ws); 1242 struct cache *cache = mg->cache; 1243 struct policy_work *op = mg->op; 1244 1245 switch (op->op) { 1246 case POLICY_PROMOTE: 1247 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1248 if (r) { 1249 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1250 cache_device_name(cache)); 1251 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1252 1253 mg_complete(mg, false); 1254 return; 1255 } 1256 mg_complete(mg, true); 1257 break; 1258 1259 case POLICY_DEMOTE: 1260 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1261 if (r) { 1262 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1263 cache_device_name(cache)); 1264 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1265 1266 mg_complete(mg, false); 1267 return; 1268 } 1269 1270 /* 1271 * It would be nice if we only had to commit when a REQ_FLUSH 1272 * comes through. 
But there's one scenario that we have to 1273 * look out for: 1274 * 1275 * - oblock x in a cache block 1276 * - demotion occurs 1277 * - cache block gets reallocated and overwritten 1278 * - crash 1279 * 1280 * When we recover, because there was no commit the cache will 1281 * roll back to having the data for oblock x in the cache block. 1282 * But the cache block has since been overwritten, so it'll end 1283 * up pointing to data that was never in 'x' during the history 1284 * of the device. 1285 * 1286 * To avoid this issue we require a commit as part of the 1287 * demotion operation. 1288 */ 1289 init_continuation(&mg->k, mg_success); 1290 continue_after_commit(&cache->committer, &mg->k); 1291 schedule_commit(&cache->committer); 1292 break; 1293 1294 case POLICY_WRITEBACK: 1295 mg_complete(mg, true); 1296 break; 1297 } 1298 } 1299 1300 static void mg_update_metadata_after_copy(struct work_struct *ws) 1301 { 1302 struct dm_cache_migration *mg = ws_to_mg(ws); 1303 1304 /* 1305 * Did the copy succeed? 1306 */ 1307 if (mg->k.input) 1308 mg_complete(mg, false); 1309 else 1310 mg_update_metadata(ws); 1311 } 1312 1313 static void mg_upgrade_lock(struct work_struct *ws) 1314 { 1315 int r; 1316 struct dm_cache_migration *mg = ws_to_mg(ws); 1317 1318 /* 1319 * Did the copy succeed? 1320 */ 1321 if (mg->k.input) 1322 mg_complete(mg, false); 1323 1324 else { 1325 /* 1326 * Now we want the lock to prevent both reads and writes. 1327 */ 1328 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, 1329 READ_WRITE_LOCK_LEVEL); 1330 if (r < 0) 1331 mg_complete(mg, false); 1332 1333 else if (r) 1334 quiesce(mg, mg_update_metadata); 1335 1336 else 1337 mg_update_metadata(ws); 1338 } 1339 } 1340 1341 static void mg_full_copy(struct work_struct *ws) 1342 { 1343 struct dm_cache_migration *mg = ws_to_mg(ws); 1344 struct cache *cache = mg->cache; 1345 struct policy_work *op = mg->op; 1346 bool is_policy_promote = (op->op == POLICY_PROMOTE); 1347 1348 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || 1349 is_discarded_oblock(cache, op->oblock)) { 1350 mg_upgrade_lock(ws); 1351 return; 1352 } 1353 1354 init_continuation(&mg->k, mg_upgrade_lock); 1355 copy(mg, is_policy_promote); 1356 } 1357 1358 static void mg_copy(struct work_struct *ws) 1359 { 1360 struct dm_cache_migration *mg = ws_to_mg(ws); 1361 1362 if (mg->overwrite_bio) { 1363 /* 1364 * No exclusive lock was held when we last checked if the bio 1365 * was optimisable. So we have to check again in case things 1366 * have changed (eg, the block may no longer be discarded). 1367 */ 1368 if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) { 1369 /* 1370 * Fall back to a real full copy after doing some tidying up. 1371 */ 1372 bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio); 1373 1374 BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */ 1375 mg->overwrite_bio = NULL; 1376 inc_io_migrations(mg->cache); 1377 mg_full_copy(ws); 1378 return; 1379 } 1380 1381 /* 1382 * It's safe to do this here, even though it's new data 1383 * because all IO has been locked out of the block. 1384 * 1385 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL 1386 * so _not_ using mg_upgrade_lock() as continuation.
1387 */ 1388 overwrite(mg, mg_update_metadata_after_copy); 1389 1390 } else 1391 mg_full_copy(ws); 1392 } 1393 1394 static int mg_lock_writes(struct dm_cache_migration *mg) 1395 { 1396 int r; 1397 struct dm_cell_key_v2 key; 1398 struct cache *cache = mg->cache; 1399 struct dm_bio_prison_cell_v2 *prealloc; 1400 1401 prealloc = alloc_prison_cell(cache); 1402 1403 /* 1404 * Prevent writes to the block, but allow reads to continue. 1405 * Unless we're using an overwrite bio, in which case we lock 1406 * everything. 1407 */ 1408 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1409 r = dm_cell_lock_v2(cache->prison, &key, 1410 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1411 prealloc, &mg->cell); 1412 if (r < 0) { 1413 free_prison_cell(cache, prealloc); 1414 mg_complete(mg, false); 1415 return r; 1416 } 1417 1418 if (mg->cell != prealloc) 1419 free_prison_cell(cache, prealloc); 1420 1421 if (r == 0) 1422 mg_copy(&mg->k.ws); 1423 else 1424 quiesce(mg, mg_copy); 1425 1426 return 0; 1427 } 1428 1429 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1430 { 1431 struct dm_cache_migration *mg; 1432 1433 if (!background_work_begin(cache)) { 1434 policy_complete_background_work(cache->policy, op, false); 1435 return -EPERM; 1436 } 1437 1438 mg = alloc_migration(cache); 1439 1440 mg->op = op; 1441 mg->overwrite_bio = bio; 1442 1443 if (!bio) 1444 inc_io_migrations(cache); 1445 1446 return mg_lock_writes(mg); 1447 } 1448 1449 /* 1450 *-------------------------------------------------------------- 1451 * invalidation processing 1452 *-------------------------------------------------------------- 1453 */ 1454 1455 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1456 { 1457 struct bio_list bios; 1458 struct cache *cache = mg->cache; 1459 1460 bio_list_init(&bios); 1461 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1462 free_prison_cell(cache, mg->cell); 1463 1464 if (!success && mg->overwrite_bio) 1465 bio_io_error(mg->overwrite_bio); 1466 1467 free_migration(mg); 1468 defer_bios(cache, &bios); 1469 1470 background_work_end(cache); 1471 } 1472 1473 static void invalidate_completed(struct work_struct *ws) 1474 { 1475 struct dm_cache_migration *mg = ws_to_mg(ws); 1476 1477 invalidate_complete(mg, !mg->k.input); 1478 } 1479 1480 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1481 { 1482 int r; 1483 1484 r = policy_invalidate_mapping(cache->policy, cblock); 1485 if (!r) { 1486 r = dm_cache_remove_mapping(cache->cmd, cblock); 1487 if (r) { 1488 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1489 cache_device_name(cache)); 1490 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1491 } 1492 1493 } else if (r == -ENODATA) { 1494 /* 1495 * Harmless, already unmapped. 
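* (the policy has no mapping for this cblock, so there is nothing to remove from the on-disk metadata either)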
1496 */ 1497 r = 0; 1498 1499 } else 1500 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1501 1502 return r; 1503 } 1504 1505 static void invalidate_remove(struct work_struct *ws) 1506 { 1507 int r; 1508 struct dm_cache_migration *mg = ws_to_mg(ws); 1509 struct cache *cache = mg->cache; 1510 1511 r = invalidate_cblock(cache, mg->invalidate_cblock); 1512 if (r) { 1513 invalidate_complete(mg, false); 1514 return; 1515 } 1516 1517 init_continuation(&mg->k, invalidate_completed); 1518 continue_after_commit(&cache->committer, &mg->k); 1519 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1520 mg->overwrite_bio = NULL; 1521 schedule_commit(&cache->committer); 1522 } 1523 1524 static int invalidate_lock(struct dm_cache_migration *mg) 1525 { 1526 int r; 1527 struct dm_cell_key_v2 key; 1528 struct cache *cache = mg->cache; 1529 struct dm_bio_prison_cell_v2 *prealloc; 1530 1531 prealloc = alloc_prison_cell(cache); 1532 1533 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1534 r = dm_cell_lock_v2(cache->prison, &key, 1535 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1536 if (r < 0) { 1537 free_prison_cell(cache, prealloc); 1538 invalidate_complete(mg, false); 1539 return r; 1540 } 1541 1542 if (mg->cell != prealloc) 1543 free_prison_cell(cache, prealloc); 1544 1545 if (r) 1546 quiesce(mg, invalidate_remove); 1547 1548 else { 1549 /* 1550 * We can't call invalidate_remove() directly here because we 1551 * might still be in request context. 1552 */ 1553 init_continuation(&mg->k, invalidate_remove); 1554 queue_work(cache->wq, &mg->k.ws); 1555 } 1556 1557 return 0; 1558 } 1559 1560 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1561 dm_oblock_t oblock, struct bio *bio) 1562 { 1563 struct dm_cache_migration *mg; 1564 1565 if (!background_work_begin(cache)) 1566 return -EPERM; 1567 1568 mg = alloc_migration(cache); 1569 1570 mg->overwrite_bio = bio; 1571 mg->invalidate_cblock = cblock; 1572 mg->invalidate_oblock = oblock; 1573 1574 return invalidate_lock(mg); 1575 } 1576 1577 /* 1578 *-------------------------------------------------------------- 1579 * bio processing 1580 *-------------------------------------------------------------- 1581 */ 1582 1583 enum busy { 1584 IDLE, 1585 BUSY 1586 }; 1587 1588 static enum busy spare_migration_bandwidth(struct cache *cache) 1589 { 1590 bool idle = dm_iot_idle_for(&cache->tracker, HZ); 1591 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1592 cache->sectors_per_block; 1593 1594 if (idle && current_volume <= cache->migration_threshold) 1595 return IDLE; 1596 else 1597 return BUSY; 1598 } 1599 1600 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1601 { 1602 atomic_inc(bio_data_dir(bio) == READ ? 1603 &cache->stats.read_hit : &cache->stats.write_hit); 1604 } 1605 1606 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1607 { 1608 atomic_inc(bio_data_dir(bio) == READ ? 1609 &cache->stats.read_miss : &cache->stats.write_miss); 1610 } 1611 1612 /*----------------------------------------------------------------*/ 1613 1614 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1615 bool *commit_needed) 1616 { 1617 int r, data_dir; 1618 bool rb, background_queued; 1619 dm_cblock_t cblock; 1620 1621 *commit_needed = false; 1622 1623 rb = bio_detain_shared(cache, block, bio); 1624 if (!rb) { 1625 /* 1626 * An exclusive lock is held for this block, so we have to 1627 * wait. 
We set the commit_needed flag so the current 1628 * transaction will be committed asap, allowing this lock 1629 * to be dropped. 1630 */ 1631 *commit_needed = true; 1632 return DM_MAPIO_SUBMITTED; 1633 } 1634 1635 data_dir = bio_data_dir(bio); 1636 1637 if (optimisable_bio(cache, bio, block)) { 1638 struct policy_work *op = NULL; 1639 1640 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1641 if (unlikely(r && r != -ENOENT)) { 1642 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1643 cache_device_name(cache), r); 1644 bio_io_error(bio); 1645 return DM_MAPIO_SUBMITTED; 1646 } 1647 1648 if (r == -ENOENT && op) { 1649 bio_drop_shared_lock(cache, bio); 1650 BUG_ON(op->op != POLICY_PROMOTE); 1651 mg_start(cache, op, bio); 1652 return DM_MAPIO_SUBMITTED; 1653 } 1654 } else { 1655 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1656 if (unlikely(r && r != -ENOENT)) { 1657 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1658 cache_device_name(cache), r); 1659 bio_io_error(bio); 1660 return DM_MAPIO_SUBMITTED; 1661 } 1662 1663 if (background_queued) 1664 wake_migration_worker(cache); 1665 } 1666 1667 if (r == -ENOENT) { 1668 struct per_bio_data *pb = get_per_bio_data(bio); 1669 1670 /* 1671 * Miss. 1672 */ 1673 inc_miss_counter(cache, bio); 1674 if (pb->req_nr == 0) { 1675 accounted_begin(cache, bio); 1676 remap_to_origin_clear_discard(cache, bio, block); 1677 } else { 1678 /* 1679 * This is a duplicate writethrough io that is no 1680 * longer needed because the block has been demoted. 1681 */ 1682 bio_endio(bio); 1683 return DM_MAPIO_SUBMITTED; 1684 } 1685 } else { 1686 /* 1687 * Hit. 1688 */ 1689 inc_hit_counter(cache, bio); 1690 1691 /* 1692 * Passthrough always maps to the origin, invalidating any 1693 * cache blocks that are written to. 1694 */ 1695 if (passthrough_mode(cache)) { 1696 if (bio_data_dir(bio) == WRITE) { 1697 bio_drop_shared_lock(cache, bio); 1698 atomic_inc(&cache->stats.demotion); 1699 invalidate_start(cache, cblock, block, bio); 1700 } else 1701 remap_to_origin_clear_discard(cache, bio, block); 1702 } else { 1703 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1704 !is_dirty(cache, cblock)) { 1705 remap_to_origin_and_cache(cache, bio, block, cblock); 1706 accounted_begin(cache, bio); 1707 } else 1708 remap_to_cache_dirty(cache, bio, block, cblock); 1709 } 1710 } 1711 1712 /* 1713 * dm core turns FUA requests into a separate payload and FLUSH req. 1714 */ 1715 if (bio->bi_opf & REQ_FUA) { 1716 /* 1717 * issue_after_commit will call accounted_begin a second time. So 1718 * we call accounted_complete() to avoid double accounting. 1719 */ 1720 accounted_complete(cache, bio); 1721 issue_after_commit(&cache->committer, bio); 1722 *commit_needed = true; 1723 return DM_MAPIO_SUBMITTED; 1724 } 1725 1726 return DM_MAPIO_REMAPPED; 1727 } 1728 1729 static bool process_bio(struct cache *cache, struct bio *bio) 1730 { 1731 bool commit_needed; 1732 1733 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1734 dm_submit_bio_remap(bio, NULL); 1735 1736 return commit_needed; 1737 } 1738 1739 /* 1740 * A non-zero return indicates read_only or fail_io mode. 
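* (commit_op() converts such an error to a blk_status_t, so the batcher errors any bios that were waiting on the commit instead of issuing them)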
1741 */ 1742 static int commit(struct cache *cache, bool clean_shutdown) 1743 { 1744 int r; 1745 1746 if (get_cache_mode(cache) >= CM_READ_ONLY) 1747 return -EINVAL; 1748 1749 atomic_inc(&cache->stats.commit_count); 1750 r = dm_cache_commit(cache->cmd, clean_shutdown); 1751 if (r) 1752 metadata_operation_failed(cache, "dm_cache_commit", r); 1753 1754 return r; 1755 } 1756 1757 /* 1758 * Used by the batcher. 1759 */ 1760 static blk_status_t commit_op(void *context) 1761 { 1762 struct cache *cache = context; 1763 1764 if (dm_cache_changed_this_transaction(cache->cmd)) 1765 return errno_to_blk_status(commit(cache, false)); 1766 1767 return 0; 1768 } 1769 1770 /*----------------------------------------------------------------*/ 1771 1772 static bool process_flush_bio(struct cache *cache, struct bio *bio) 1773 { 1774 struct per_bio_data *pb = get_per_bio_data(bio); 1775 1776 if (!pb->req_nr) 1777 remap_to_origin(cache, bio); 1778 else 1779 remap_to_cache(cache, bio, 0); 1780 1781 issue_after_commit(&cache->committer, bio); 1782 return true; 1783 } 1784 1785 static bool process_discard_bio(struct cache *cache, struct bio *bio) 1786 { 1787 dm_dblock_t b, e; 1788 1789 /* 1790 * FIXME: do we need to lock the region? Or can we just assume the 1791 * user won't be so foolish as to issue discard concurrently with 1792 * other IO? 1793 */ 1794 calc_discard_block_range(cache, bio, &b, &e); 1795 while (b != e) { 1796 set_discard(cache, b); 1797 b = to_dblock(from_dblock(b) + 1); 1798 } 1799 1800 if (cache->features.discard_passdown) { 1801 remap_to_origin(cache, bio); 1802 dm_submit_bio_remap(bio, NULL); 1803 } else 1804 bio_endio(bio); 1805 1806 return false; 1807 } 1808 1809 static void process_deferred_bios(struct work_struct *ws) 1810 { 1811 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1812 1813 bool commit_needed = false; 1814 struct bio_list bios; 1815 struct bio *bio; 1816 1817 bio_list_init(&bios); 1818 1819 spin_lock_irq(&cache->lock); 1820 bio_list_merge(&bios, &cache->deferred_bios); 1821 bio_list_init(&cache->deferred_bios); 1822 spin_unlock_irq(&cache->lock); 1823 1824 while ((bio = bio_list_pop(&bios))) { 1825 if (bio->bi_opf & REQ_PREFLUSH) 1826 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1827 1828 else if (bio_op(bio) == REQ_OP_DISCARD) 1829 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1830 1831 else 1832 commit_needed = process_bio(cache, bio) || commit_needed; 1833 cond_resched(); 1834 } 1835 1836 if (commit_needed) 1837 schedule_commit(&cache->committer); 1838 } 1839 1840 /* 1841 *-------------------------------------------------------------- 1842 * Main worker loop 1843 *-------------------------------------------------------------- 1844 */ 1845 static void requeue_deferred_bios(struct cache *cache) 1846 { 1847 struct bio *bio; 1848 struct bio_list bios; 1849 1850 bio_list_init(&bios); 1851 bio_list_merge(&bios, &cache->deferred_bios); 1852 bio_list_init(&cache->deferred_bios); 1853 1854 while ((bio = bio_list_pop(&bios))) { 1855 bio->bi_status = BLK_STS_DM_REQUEUE; 1856 bio_endio(bio); 1857 cond_resched(); 1858 } 1859 } 1860 1861 /* 1862 * We want to commit periodically so that not too much 1863 * unwritten metadata builds up.
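* (the waker re-queues itself every COMMIT_PERIOD jiffies, i.e. HZ, so roughly once per second)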
1864 */ 1865 static void do_waker(struct work_struct *ws) 1866 { 1867 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1868 1869 policy_tick(cache->policy, true); 1870 wake_migration_worker(cache); 1871 schedule_commit(&cache->committer); 1872 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1873 } 1874 1875 static void check_migrations(struct work_struct *ws) 1876 { 1877 int r; 1878 struct policy_work *op; 1879 struct cache *cache = container_of(ws, struct cache, migration_worker); 1880 enum busy b; 1881 1882 for (;;) { 1883 b = spare_migration_bandwidth(cache); 1884 1885 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1886 if (r == -ENODATA) 1887 break; 1888 1889 if (r) { 1890 DMERR_LIMIT("%s: policy_background_work failed", 1891 cache_device_name(cache)); 1892 break; 1893 } 1894 1895 r = mg_start(cache, op, NULL); 1896 if (r) 1897 break; 1898 1899 cond_resched(); 1900 } 1901 } 1902 1903 /* 1904 *-------------------------------------------------------------- 1905 * Target methods 1906 *-------------------------------------------------------------- 1907 */ 1908 1909 /* 1910 * This function gets called on the error paths of the constructor, so we 1911 * have to cope with a partially initialised struct. 1912 */ 1913 static void __destroy(struct cache *cache) 1914 { 1915 mempool_exit(&cache->migration_pool); 1916 1917 if (cache->prison) 1918 dm_bio_prison_destroy_v2(cache->prison); 1919 1920 if (cache->wq) 1921 destroy_workqueue(cache->wq); 1922 1923 if (cache->dirty_bitset) 1924 free_bitset(cache->dirty_bitset); 1925 1926 if (cache->discard_bitset) 1927 free_bitset(cache->discard_bitset); 1928 1929 if (cache->copier) 1930 dm_kcopyd_client_destroy(cache->copier); 1931 1932 if (cache->cmd) 1933 dm_cache_metadata_close(cache->cmd); 1934 1935 if (cache->metadata_dev) 1936 dm_put_device(cache->ti, cache->metadata_dev); 1937 1938 if (cache->origin_dev) 1939 dm_put_device(cache->ti, cache->origin_dev); 1940 1941 if (cache->cache_dev) 1942 dm_put_device(cache->ti, cache->cache_dev); 1943 1944 if (cache->policy) 1945 dm_cache_policy_destroy(cache->policy); 1946 1947 bioset_exit(&cache->bs); 1948 1949 kfree(cache); 1950 } 1951 1952 static void destroy(struct cache *cache) 1953 { 1954 unsigned int i; 1955 1956 cancel_delayed_work_sync(&cache->waker); 1957 1958 for (i = 0; i < cache->nr_ctr_args ; i++) 1959 kfree(cache->ctr_args[i]); 1960 kfree(cache->ctr_args); 1961 1962 __destroy(cache); 1963 } 1964 1965 static void cache_dtr(struct dm_target *ti) 1966 { 1967 struct cache *cache = ti->private; 1968 1969 destroy(cache); 1970 } 1971 1972 static sector_t get_dev_size(struct dm_dev *dev) 1973 { 1974 return bdev_nr_sectors(dev->bdev); 1975 } 1976 1977 /*----------------------------------------------------------------*/ 1978 1979 /* 1980 * Construct a cache device mapping. 1981 * 1982 * cache <metadata dev> <cache dev> <origin dev> <block size> 1983 * <#feature args> [<feature arg>]* 1984 * <policy> <#policy args> [<policy arg>]* 1985 * 1986 * metadata dev : fast device holding the persistent metadata 1987 * cache dev : fast device holding cached data blocks 1988 * origin dev : slow device holding original data blocks 1989 * block size : cache unit size in sectors 1990 * 1991 * #feature args : number of feature arguments passed 1992 * feature args : writethrough. (The default is writeback.) 
1993 * 1994 * policy : the replacement policy to use 1995 * #policy args : an even number of policy arguments corresponding 1996 * to key/value pairs passed to the policy 1997 * policy args : key/value pairs passed to the policy 1998 * E.g. 'sequential_threshold 1024' 1999 * See cache-policies.txt for details. 2000 * 2001 * Optional feature arguments are: 2002 * writethrough : write through caching that prohibits cache block 2003 * content from being different from origin block content. 2004 * Without this argument, the default behaviour is to write 2005 * back cache block contents later for performance reasons, 2006 * so they may differ from the corresponding origin blocks. 2007 */ 2008 struct cache_args { 2009 struct dm_target *ti; 2010 2011 struct dm_dev *metadata_dev; 2012 2013 struct dm_dev *cache_dev; 2014 sector_t cache_sectors; 2015 2016 struct dm_dev *origin_dev; 2017 2018 uint32_t block_size; 2019 2020 const char *policy_name; 2021 int policy_argc; 2022 const char **policy_argv; 2023 2024 struct cache_features features; 2025 }; 2026 2027 static void destroy_cache_args(struct cache_args *ca) 2028 { 2029 if (ca->metadata_dev) 2030 dm_put_device(ca->ti, ca->metadata_dev); 2031 2032 if (ca->cache_dev) 2033 dm_put_device(ca->ti, ca->cache_dev); 2034 2035 if (ca->origin_dev) 2036 dm_put_device(ca->ti, ca->origin_dev); 2037 2038 kfree(ca); 2039 } 2040 2041 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2042 { 2043 if (!as->argc) { 2044 *error = "Insufficient args"; 2045 return false; 2046 } 2047 2048 return true; 2049 } 2050 2051 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2052 char **error) 2053 { 2054 int r; 2055 sector_t metadata_dev_size; 2056 2057 if (!at_least_one_arg(as, error)) 2058 return -EINVAL; 2059 2060 r = dm_get_device(ca->ti, dm_shift_arg(as), 2061 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev); 2062 if (r) { 2063 *error = "Error opening metadata device"; 2064 return r; 2065 } 2066 2067 metadata_dev_size = get_dev_size(ca->metadata_dev); 2068 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2069 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.", 2070 ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS); 2071 2072 return 0; 2073 } 2074 2075 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2076 char **error) 2077 { 2078 int r; 2079 2080 if (!at_least_one_arg(as, error)) 2081 return -EINVAL; 2082 2083 r = dm_get_device(ca->ti, dm_shift_arg(as), 2084 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev); 2085 if (r) { 2086 *error = "Error opening cache device"; 2087 return r; 2088 } 2089 ca->cache_sectors = get_dev_size(ca->cache_dev); 2090 2091 return 0; 2092 } 2093 2094 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2095 char **error) 2096 { 2097 sector_t origin_sectors; 2098 int r; 2099 2100 if (!at_least_one_arg(as, error)) 2101 return -EINVAL; 2102 2103 r = dm_get_device(ca->ti, dm_shift_arg(as), 2104 BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev); 2105 if (r) { 2106 *error = "Error opening origin device"; 2107 return r; 2108 } 2109 2110 origin_sectors = get_dev_size(ca->origin_dev); 2111 if (ca->ti->len > origin_sectors) { 2112 *error = "Device size larger than cached device"; 2113 return -EINVAL; 2114 } 2115 2116 return 0; 2117 } 2118 2119 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2120 char **error) 2121 { 2122 unsigned long block_size; 2123 2124 if (!at_least_one_arg(as, error)) 2125 
return -EINVAL; 2126 2127 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2128 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2129 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2130 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2131 *error = "Invalid data block size"; 2132 return -EINVAL; 2133 } 2134 2135 if (block_size > ca->cache_sectors) { 2136 *error = "Data block size is larger than the cache device"; 2137 return -EINVAL; 2138 } 2139 2140 ca->block_size = block_size; 2141 2142 return 0; 2143 } 2144 2145 static void init_features(struct cache_features *cf) 2146 { 2147 cf->mode = CM_WRITE; 2148 cf->io_mode = CM_IO_WRITEBACK; 2149 cf->metadata_version = 1; 2150 cf->discard_passdown = true; 2151 } 2152 2153 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2154 char **error) 2155 { 2156 static const struct dm_arg _args[] = { 2157 {0, 3, "Invalid number of cache feature arguments"}, 2158 }; 2159 2160 int r, mode_ctr = 0; 2161 unsigned int argc; 2162 const char *arg; 2163 struct cache_features *cf = &ca->features; 2164 2165 init_features(cf); 2166 2167 r = dm_read_arg_group(_args, as, &argc, error); 2168 if (r) 2169 return -EINVAL; 2170 2171 while (argc--) { 2172 arg = dm_shift_arg(as); 2173 2174 if (!strcasecmp(arg, "writeback")) { 2175 cf->io_mode = CM_IO_WRITEBACK; 2176 mode_ctr++; 2177 } 2178 2179 else if (!strcasecmp(arg, "writethrough")) { 2180 cf->io_mode = CM_IO_WRITETHROUGH; 2181 mode_ctr++; 2182 } 2183 2184 else if (!strcasecmp(arg, "passthrough")) { 2185 cf->io_mode = CM_IO_PASSTHROUGH; 2186 mode_ctr++; 2187 } 2188 2189 else if (!strcasecmp(arg, "metadata2")) 2190 cf->metadata_version = 2; 2191 2192 else if (!strcasecmp(arg, "no_discard_passdown")) 2193 cf->discard_passdown = false; 2194 2195 else { 2196 *error = "Unrecognised cache feature requested"; 2197 return -EINVAL; 2198 } 2199 } 2200 2201 if (mode_ctr > 1) { 2202 *error = "Duplicate cache io_mode features requested"; 2203 return -EINVAL; 2204 } 2205 2206 return 0; 2207 } 2208 2209 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2210 char **error) 2211 { 2212 static const struct dm_arg _args[] = { 2213 {0, 1024, "Invalid number of policy arguments"}, 2214 }; 2215 2216 int r; 2217 2218 if (!at_least_one_arg(as, error)) 2219 return -EINVAL; 2220 2221 ca->policy_name = dm_shift_arg(as); 2222 2223 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2224 if (r) 2225 return -EINVAL; 2226 2227 ca->policy_argv = (const char **)as->argv; 2228 dm_consume_args(as, ca->policy_argc); 2229 2230 return 0; 2231 } 2232 2233 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2234 char **error) 2235 { 2236 int r; 2237 struct dm_arg_set as; 2238 2239 as.argc = argc; 2240 as.argv = argv; 2241 2242 r = parse_metadata_dev(ca, &as, error); 2243 if (r) 2244 return r; 2245 2246 r = parse_cache_dev(ca, &as, error); 2247 if (r) 2248 return r; 2249 2250 r = parse_origin_dev(ca, &as, error); 2251 if (r) 2252 return r; 2253 2254 r = parse_block_size(ca, &as, error); 2255 if (r) 2256 return r; 2257 2258 r = parse_features(ca, &as, error); 2259 if (r) 2260 return r; 2261 2262 r = parse_policy(ca, &as, error); 2263 if (r) 2264 return r; 2265 2266 return 0; 2267 } 2268 2269 /*----------------------------------------------------------------*/ 2270 2271 static struct kmem_cache *migration_cache = NULL; 2272 2273 #define NOT_CORE_OPTION 1 2274 2275 static int process_config_option(struct cache *cache, const char *key, const char *value) 2276 { 2277 unsigned 
long tmp; 2278 2279 if (!strcasecmp(key, "migration_threshold")) { 2280 if (kstrtoul(value, 10, &tmp)) 2281 return -EINVAL; 2282 2283 cache->migration_threshold = tmp; 2284 return 0; 2285 } 2286 2287 return NOT_CORE_OPTION; 2288 } 2289 2290 static int set_config_value(struct cache *cache, const char *key, const char *value) 2291 { 2292 int r = process_config_option(cache, key, value); 2293 2294 if (r == NOT_CORE_OPTION) 2295 r = policy_set_config_value(cache->policy, key, value); 2296 2297 if (r) 2298 DMWARN("bad config value for %s: %s", key, value); 2299 2300 return r; 2301 } 2302 2303 static int set_config_values(struct cache *cache, int argc, const char **argv) 2304 { 2305 int r = 0; 2306 2307 if (argc & 1) { 2308 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2309 return -EINVAL; 2310 } 2311 2312 while (argc) { 2313 r = set_config_value(cache, argv[0], argv[1]); 2314 if (r) 2315 break; 2316 2317 argc -= 2; 2318 argv += 2; 2319 } 2320 2321 return r; 2322 } 2323 2324 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2325 char **error) 2326 { 2327 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2328 cache->cache_size, 2329 cache->origin_sectors, 2330 cache->sectors_per_block); 2331 if (IS_ERR(p)) { 2332 *error = "Error creating cache's policy"; 2333 return PTR_ERR(p); 2334 } 2335 cache->policy = p; 2336 BUG_ON(!cache->policy); 2337 2338 return 0; 2339 } 2340 2341 /* 2342 * We want the discard block size to be at least the size of the cache 2343 * block size and have no more than 2^14 discard blocks across the origin. 2344 */ 2345 #define MAX_DISCARD_BLOCKS (1 << 14) 2346 2347 static bool too_many_discard_blocks(sector_t discard_block_size, 2348 sector_t origin_size) 2349 { 2350 (void) sector_div(origin_size, discard_block_size); 2351 2352 return origin_size > MAX_DISCARD_BLOCKS; 2353 } 2354 2355 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2356 sector_t origin_size) 2357 { 2358 sector_t discard_block_size = cache_block_size; 2359 2360 if (origin_size) 2361 while (too_many_discard_blocks(discard_block_size, origin_size)) 2362 discard_block_size *= 2; 2363 2364 return discard_block_size; 2365 } 2366 2367 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2368 { 2369 dm_block_t nr_blocks = from_cblock(size); 2370 2371 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2372 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2373 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2374 "Please consider increasing the cache block size to reduce the overall cache block count.", 2375 (unsigned long long) nr_blocks); 2376 2377 cache->cache_size = size; 2378 } 2379 2380 #define DEFAULT_MIGRATION_THRESHOLD 2048 2381 2382 static int cache_create(struct cache_args *ca, struct cache **result) 2383 { 2384 int r = 0; 2385 char **error = &ca->ti->error; 2386 struct cache *cache; 2387 struct dm_target *ti = ca->ti; 2388 dm_block_t origin_blocks; 2389 struct dm_cache_metadata *cmd; 2390 bool may_format = ca->features.mode == CM_WRITE; 2391 2392 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2393 if (!cache) 2394 return -ENOMEM; 2395 2396 cache->ti = ca->ti; 2397 ti->private = cache; 2398 ti->accounts_remapped_io = true; 2399 ti->num_flush_bios = 2; 2400 ti->flush_supported = true; 2401 2402 ti->num_discard_bios = 1; 2403 ti->discards_supported = true; 2404 2405 ti->per_io_data_size = 
sizeof(struct per_bio_data); 2406 2407 cache->features = ca->features; 2408 if (writethrough_mode(cache)) { 2409 /* Create bioset for writethrough bios issued to origin */ 2410 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2411 if (r) 2412 goto bad; 2413 } 2414 2415 cache->metadata_dev = ca->metadata_dev; 2416 cache->origin_dev = ca->origin_dev; 2417 cache->cache_dev = ca->cache_dev; 2418 2419 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2420 2421 origin_blocks = cache->origin_sectors = ti->len; 2422 origin_blocks = block_div(origin_blocks, ca->block_size); 2423 cache->origin_blocks = to_oblock(origin_blocks); 2424 2425 cache->sectors_per_block = ca->block_size; 2426 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2427 r = -EINVAL; 2428 goto bad; 2429 } 2430 2431 if (ca->block_size & (ca->block_size - 1)) { 2432 dm_block_t cache_size = ca->cache_sectors; 2433 2434 cache->sectors_per_block_shift = -1; 2435 cache_size = block_div(cache_size, ca->block_size); 2436 set_cache_size(cache, to_cblock(cache_size)); 2437 } else { 2438 cache->sectors_per_block_shift = __ffs(ca->block_size); 2439 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2440 } 2441 2442 r = create_cache_policy(cache, ca, error); 2443 if (r) 2444 goto bad; 2445 2446 cache->policy_nr_args = ca->policy_argc; 2447 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2448 2449 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2450 if (r) { 2451 *error = "Error setting cache policy's config values"; 2452 goto bad; 2453 } 2454 2455 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2456 ca->block_size, may_format, 2457 dm_cache_policy_get_hint_size(cache->policy), 2458 ca->features.metadata_version); 2459 if (IS_ERR(cmd)) { 2460 *error = "Error creating metadata object"; 2461 r = PTR_ERR(cmd); 2462 goto bad; 2463 } 2464 cache->cmd = cmd; 2465 set_cache_mode(cache, CM_WRITE); 2466 if (get_cache_mode(cache) != CM_WRITE) { 2467 *error = "Unable to get write access to metadata, please check/repair metadata."; 2468 r = -EINVAL; 2469 goto bad; 2470 } 2471 2472 if (passthrough_mode(cache)) { 2473 bool all_clean; 2474 2475 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2476 if (r) { 2477 *error = "dm_cache_metadata_all_clean() failed"; 2478 goto bad; 2479 } 2480 2481 if (!all_clean) { 2482 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2483 r = -EINVAL; 2484 goto bad; 2485 } 2486 2487 policy_allow_migrations(cache->policy, false); 2488 } 2489 2490 spin_lock_init(&cache->lock); 2491 bio_list_init(&cache->deferred_bios); 2492 atomic_set(&cache->nr_allocated_migrations, 0); 2493 atomic_set(&cache->nr_io_migrations, 0); 2494 init_waitqueue_head(&cache->migration_wait); 2495 2496 r = -ENOMEM; 2497 atomic_set(&cache->nr_dirty, 0); 2498 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2499 if (!cache->dirty_bitset) { 2500 *error = "could not allocate dirty bitset"; 2501 goto bad; 2502 } 2503 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2504 2505 cache->discard_block_size = 2506 calculate_discard_block_size(cache->sectors_per_block, 2507 cache->origin_sectors); 2508 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2509 cache->discard_block_size)); 2510 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2511 if (!cache->discard_bitset) { 2512 *error = "could not allocate discard bitset"; 2513 goto bad; 2514 } 2515 
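/*
 * Illustrative sizing (assumed values, not taken from the code above): with
 * 512-sector cache blocks and a 1 TiB origin (2^31 sectors),
 * calculate_discard_block_size() doubles 512 up to 131072 sectors (64 MiB),
 * at which point the origin is covered by exactly 2^14 discard blocks.
 */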
clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2516 2517 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2518 if (IS_ERR(cache->copier)) { 2519 *error = "could not create kcopyd client"; 2520 r = PTR_ERR(cache->copier); 2521 goto bad; 2522 } 2523 2524 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2525 if (!cache->wq) { 2526 *error = "could not create workqueue for metadata object"; 2527 goto bad; 2528 } 2529 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2530 INIT_WORK(&cache->migration_worker, check_migrations); 2531 INIT_DELAYED_WORK(&cache->waker, do_waker); 2532 2533 cache->prison = dm_bio_prison_create_v2(cache->wq); 2534 if (!cache->prison) { 2535 *error = "could not create bio prison"; 2536 goto bad; 2537 } 2538 2539 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2540 migration_cache); 2541 if (r) { 2542 *error = "Error creating cache's migration mempool"; 2543 goto bad; 2544 } 2545 2546 cache->need_tick_bio = true; 2547 cache->sized = false; 2548 cache->invalidate = false; 2549 cache->commit_requested = false; 2550 cache->loaded_mappings = false; 2551 cache->loaded_discards = false; 2552 2553 load_stats(cache); 2554 2555 atomic_set(&cache->stats.demotion, 0); 2556 atomic_set(&cache->stats.promotion, 0); 2557 atomic_set(&cache->stats.copies_avoided, 0); 2558 atomic_set(&cache->stats.cache_cell_clash, 0); 2559 atomic_set(&cache->stats.commit_count, 0); 2560 atomic_set(&cache->stats.discard_count, 0); 2561 2562 spin_lock_init(&cache->invalidation_lock); 2563 INIT_LIST_HEAD(&cache->invalidation_requests); 2564 2565 batcher_init(&cache->committer, commit_op, cache, 2566 issue_op, cache, cache->wq); 2567 dm_iot_init(&cache->tracker); 2568 2569 init_rwsem(&cache->background_work_lock); 2570 prevent_background_work(cache); 2571 2572 *result = cache; 2573 return 0; 2574 bad: 2575 __destroy(cache); 2576 return r; 2577 } 2578 2579 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2580 { 2581 unsigned int i; 2582 const char **copy; 2583 2584 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2585 if (!copy) 2586 return -ENOMEM; 2587 for (i = 0; i < argc; i++) { 2588 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2589 if (!copy[i]) { 2590 while (i--) 2591 kfree(copy[i]); 2592 kfree(copy); 2593 return -ENOMEM; 2594 } 2595 } 2596 2597 cache->nr_ctr_args = argc; 2598 cache->ctr_args = copy; 2599 2600 return 0; 2601 } 2602 2603 static int cache_ctr(struct dm_target *ti, unsigned int argc, char **argv) 2604 { 2605 int r = -EINVAL; 2606 struct cache_args *ca; 2607 struct cache *cache = NULL; 2608 2609 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2610 if (!ca) { 2611 ti->error = "Error allocating memory for cache"; 2612 return -ENOMEM; 2613 } 2614 ca->ti = ti; 2615 2616 r = parse_cache_args(ca, argc, argv, &ti->error); 2617 if (r) 2618 goto out; 2619 2620 r = cache_create(ca, &cache); 2621 if (r) 2622 goto out; 2623 2624 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2625 if (r) { 2626 __destroy(cache); 2627 goto out; 2628 } 2629 2630 ti->private = cache; 2631 out: 2632 destroy_cache_args(ca); 2633 return r; 2634 } 2635 2636 /*----------------------------------------------------------------*/ 2637 2638 static int cache_map(struct dm_target *ti, struct bio *bio) 2639 { 2640 struct cache *cache = ti->private; 2641 2642 int r; 2643 bool commit_needed; 2644 dm_oblock_t block = get_bio_block(cache, bio); 2645 2646 init_per_bio_data(bio); 2647 if (unlikely(from_oblock(block) >= 
from_oblock(cache->origin_blocks))) { 2648 /* 2649 * This can only occur if the io goes to a partial block at 2650 * the end of the origin device. We don't cache these. 2651 * Just remap to the origin and carry on. 2652 */ 2653 remap_to_origin(cache, bio); 2654 accounted_begin(cache, bio); 2655 return DM_MAPIO_REMAPPED; 2656 } 2657 2658 if (discard_or_flush(bio)) { 2659 defer_bio(cache, bio); 2660 return DM_MAPIO_SUBMITTED; 2661 } 2662 2663 r = map_bio(cache, bio, block, &commit_needed); 2664 if (commit_needed) 2665 schedule_commit(&cache->committer); 2666 2667 return r; 2668 } 2669 2670 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2671 { 2672 struct cache *cache = ti->private; 2673 unsigned long flags; 2674 struct per_bio_data *pb = get_per_bio_data(bio); 2675 2676 if (pb->tick) { 2677 policy_tick(cache->policy, false); 2678 2679 spin_lock_irqsave(&cache->lock, flags); 2680 cache->need_tick_bio = true; 2681 spin_unlock_irqrestore(&cache->lock, flags); 2682 } 2683 2684 bio_drop_shared_lock(cache, bio); 2685 accounted_complete(cache, bio); 2686 2687 return DM_ENDIO_DONE; 2688 } 2689 2690 static int write_dirty_bitset(struct cache *cache) 2691 { 2692 int r; 2693 2694 if (get_cache_mode(cache) >= CM_READ_ONLY) 2695 return -EINVAL; 2696 2697 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2698 if (r) 2699 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2700 2701 return r; 2702 } 2703 2704 static int write_discard_bitset(struct cache *cache) 2705 { 2706 unsigned int i, r; 2707 2708 if (get_cache_mode(cache) >= CM_READ_ONLY) 2709 return -EINVAL; 2710 2711 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2712 cache->discard_nr_blocks); 2713 if (r) { 2714 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2715 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2716 return r; 2717 } 2718 2719 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2720 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2721 is_discarded(cache, to_dblock(i))); 2722 if (r) { 2723 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2724 return r; 2725 } 2726 } 2727 2728 return 0; 2729 } 2730 2731 static int write_hints(struct cache *cache) 2732 { 2733 int r; 2734 2735 if (get_cache_mode(cache) >= CM_READ_ONLY) 2736 return -EINVAL; 2737 2738 r = dm_cache_write_hints(cache->cmd, cache->policy); 2739 if (r) { 2740 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2741 return r; 2742 } 2743 2744 return 0; 2745 } 2746 2747 /* 2748 * returns true on success 2749 */ 2750 static bool sync_metadata(struct cache *cache) 2751 { 2752 int r1, r2, r3, r4; 2753 2754 r1 = write_dirty_bitset(cache); 2755 if (r1) 2756 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2757 2758 r2 = write_discard_bitset(cache); 2759 if (r2) 2760 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2761 2762 save_stats(cache); 2763 2764 r3 = write_hints(cache); 2765 if (r3) 2766 DMERR("%s: could not write hints", cache_device_name(cache)); 2767 2768 /* 2769 * If writing the above metadata failed, we still commit, but don't 2770 * set the clean shutdown flag. This will effectively force every 2771 * dirty bit to be set on reload. 
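 * (In other words, the next activation will conservatively treat every
 * cached block as dirty.)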
2772 */ 2773 r4 = commit(cache, !r1 && !r2 && !r3); 2774 if (r4) 2775 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2776 2777 return !r1 && !r2 && !r3 && !r4; 2778 } 2779 2780 static void cache_postsuspend(struct dm_target *ti) 2781 { 2782 struct cache *cache = ti->private; 2783 2784 prevent_background_work(cache); 2785 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2786 2787 cancel_delayed_work_sync(&cache->waker); 2788 drain_workqueue(cache->wq); 2789 WARN_ON(cache->tracker.in_flight); 2790 2791 /* 2792 * If it's a flush suspend there won't be any deferred bios, so this 2793 * call is harmless. 2794 */ 2795 requeue_deferred_bios(cache); 2796 2797 if (get_cache_mode(cache) == CM_WRITE) 2798 (void) sync_metadata(cache); 2799 } 2800 2801 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2802 bool dirty, uint32_t hint, bool hint_valid) 2803 { 2804 struct cache *cache = context; 2805 2806 if (dirty) { 2807 set_bit(from_cblock(cblock), cache->dirty_bitset); 2808 atomic_inc(&cache->nr_dirty); 2809 } else 2810 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2811 2812 return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2813 } 2814 2815 /* 2816 * The discard block size in the on disk metadata is not 2817 * necessarily the same as we're currently using. So we have to 2818 * be careful to only set the discarded attribute if we know it 2819 * covers a complete block of the new size. 2820 */ 2821 struct discard_load_info { 2822 struct cache *cache; 2823 2824 /* 2825 * These blocks are sized using the on disk dblock size, rather 2826 * than the current one. 2827 */ 2828 dm_block_t block_size; 2829 dm_block_t discard_begin, discard_end; 2830 }; 2831 2832 static void discard_load_info_init(struct cache *cache, 2833 struct discard_load_info *li) 2834 { 2835 li->cache = cache; 2836 li->discard_begin = li->discard_end = 0; 2837 } 2838 2839 static void set_discard_range(struct discard_load_info *li) 2840 { 2841 sector_t b, e; 2842 2843 if (li->discard_begin == li->discard_end) 2844 return; 2845 2846 /* 2847 * Convert to sectors. 2848 */ 2849 b = li->discard_begin * li->block_size; 2850 e = li->discard_end * li->block_size; 2851 2852 /* 2853 * Then convert back to the current dblock size. 2854 */ 2855 b = dm_sector_div_up(b, li->cache->discard_block_size); 2856 sector_div(e, li->cache->discard_block_size); 2857 2858 /* 2859 * The origin may have shrunk, so we need to check we're still in 2860 * bounds. 2861 */ 2862 if (e > from_dblock(li->cache->discard_nr_blocks)) 2863 e = from_dblock(li->cache->discard_nr_blocks); 2864 2865 for (; b < e; b++) 2866 set_discard(li->cache, to_dblock(b)); 2867 } 2868 2869 static int load_discard(void *context, sector_t discard_block_size, 2870 dm_dblock_t dblock, bool discard) 2871 { 2872 struct discard_load_info *li = context; 2873 2874 li->block_size = discard_block_size; 2875 2876 if (discard) { 2877 if (from_dblock(dblock) == li->discard_end) 2878 /* 2879 * We're already in a discard range, just extend it. 2880 */ 2881 li->discard_end = li->discard_end + 1ULL; 2882 2883 else { 2884 /* 2885 * Emit the old range and start a new one. 
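 * e.g. discarded dblocks 5, 6 and 7 followed by 10 emit the half-open
 * range [5, 8) here and then begin a new range at 10.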
2886 */ 2887 set_discard_range(li); 2888 li->discard_begin = from_dblock(dblock); 2889 li->discard_end = li->discard_begin + 1ULL; 2890 } 2891 } else { 2892 set_discard_range(li); 2893 li->discard_begin = li->discard_end = 0; 2894 } 2895 2896 return 0; 2897 } 2898 2899 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2900 { 2901 sector_t size = get_dev_size(cache->cache_dev); 2902 (void) sector_div(size, cache->sectors_per_block); 2903 return to_cblock(size); 2904 } 2905 2906 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2907 { 2908 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2909 DMERR("%s: unable to extend cache due to missing cache table reload", 2910 cache_device_name(cache)); 2911 return false; 2912 } 2913 2914 /* 2915 * We can't drop a dirty block when shrinking the cache. 2916 */ 2917 if (cache->loaded_mappings) { 2918 new_size = to_cblock(find_next_bit(cache->dirty_bitset, 2919 from_cblock(cache->cache_size), 2920 from_cblock(new_size))); 2921 if (new_size != cache->cache_size) { 2922 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 2923 cache_device_name(cache), 2924 (unsigned long long) from_cblock(new_size)); 2925 return false; 2926 } 2927 } 2928 2929 return true; 2930 } 2931 2932 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 2933 { 2934 int r; 2935 2936 r = dm_cache_resize(cache->cmd, new_size); 2937 if (r) { 2938 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 2939 metadata_operation_failed(cache, "dm_cache_resize", r); 2940 return r; 2941 } 2942 2943 set_cache_size(cache, new_size); 2944 2945 return 0; 2946 } 2947 2948 static int cache_preresume(struct dm_target *ti) 2949 { 2950 int r = 0; 2951 struct cache *cache = ti->private; 2952 dm_cblock_t csize = get_cache_dev_size(cache); 2953 2954 /* 2955 * Check to see if the cache has resized. 2956 */ 2957 if (!cache->sized || csize != cache->cache_size) { 2958 if (!can_resize(cache, csize)) 2959 return -EINVAL; 2960 2961 r = resize_cache_dev(cache, csize); 2962 if (r) 2963 return r; 2964 2965 cache->sized = true; 2966 } 2967 2968 if (!cache->loaded_mappings) { 2969 r = dm_cache_load_mappings(cache->cmd, cache->policy, 2970 load_mapping, cache); 2971 if (r) { 2972 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 2973 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 2974 return r; 2975 } 2976 2977 cache->loaded_mappings = true; 2978 } 2979 2980 if (!cache->loaded_discards) { 2981 struct discard_load_info li; 2982 2983 /* 2984 * The discard bitset could have been resized, or the 2985 * discard block size changed. To be safe we start by 2986 * setting every dblock to not discarded. 
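 * load_discard() below then re-derives the discarded ranges in terms of
 * the current discard block size.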
2987 */ 2988 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2989 2990 discard_load_info_init(cache, &li); 2991 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 2992 if (r) { 2993 DMERR("%s: could not load origin discards", cache_device_name(cache)); 2994 metadata_operation_failed(cache, "dm_cache_load_discards", r); 2995 return r; 2996 } 2997 set_discard_range(&li); 2998 2999 cache->loaded_discards = true; 3000 } 3001 3002 return r; 3003 } 3004 3005 static void cache_resume(struct dm_target *ti) 3006 { 3007 struct cache *cache = ti->private; 3008 3009 cache->need_tick_bio = true; 3010 allow_background_work(cache); 3011 do_waker(&cache->waker.work); 3012 } 3013 3014 static void emit_flags(struct cache *cache, char *result, 3015 unsigned int maxlen, ssize_t *sz_ptr) 3016 { 3017 ssize_t sz = *sz_ptr; 3018 struct cache_features *cf = &cache->features; 3019 unsigned int count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3020 3021 DMEMIT("%u ", count); 3022 3023 if (cf->metadata_version == 2) 3024 DMEMIT("metadata2 "); 3025 3026 if (writethrough_mode(cache)) 3027 DMEMIT("writethrough "); 3028 3029 else if (passthrough_mode(cache)) 3030 DMEMIT("passthrough "); 3031 3032 else if (writeback_mode(cache)) 3033 DMEMIT("writeback "); 3034 3035 else { 3036 DMEMIT("unknown "); 3037 DMERR("%s: internal error: unknown io mode: %d", 3038 cache_device_name(cache), (int) cf->io_mode); 3039 } 3040 3041 if (!cf->discard_passdown) 3042 DMEMIT("no_discard_passdown "); 3043 3044 *sz_ptr = sz; 3045 } 3046 3047 /* 3048 * Status format: 3049 * 3050 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3051 * <cache block size> <#used cache blocks>/<#total cache blocks> 3052 * <#read hits> <#read misses> <#write hits> <#write misses> 3053 * <#demotions> <#promotions> <#dirty> 3054 * <#features> <features>* 3055 * <#core args> <core args> 3056 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3057 */ 3058 static void cache_status(struct dm_target *ti, status_type_t type, 3059 unsigned int status_flags, char *result, unsigned int maxlen) 3060 { 3061 int r = 0; 3062 unsigned int i; 3063 ssize_t sz = 0; 3064 dm_block_t nr_free_blocks_metadata = 0; 3065 dm_block_t nr_blocks_metadata = 0; 3066 char buf[BDEVNAME_SIZE]; 3067 struct cache *cache = ti->private; 3068 dm_cblock_t residency; 3069 bool needs_check; 3070 3071 switch (type) { 3072 case STATUSTYPE_INFO: 3073 if (get_cache_mode(cache) == CM_FAIL) { 3074 DMEMIT("Fail"); 3075 break; 3076 } 3077 3078 /* Commit to ensure statistics aren't out-of-date */ 3079 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3080 (void) commit(cache, false); 3081 3082 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3083 if (r) { 3084 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3085 cache_device_name(cache), r); 3086 goto err; 3087 } 3088 3089 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3090 if (r) { 3091 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3092 cache_device_name(cache), r); 3093 goto err; 3094 } 3095 3096 residency = policy_residency(cache->policy); 3097 3098 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3099 (unsigned int)DM_CACHE_METADATA_BLOCK_SIZE, 3100 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3101 (unsigned long long)nr_blocks_metadata, 3102 (unsigned long long)cache->sectors_per_block, 3103 (unsigned long long) 
from_cblock(residency), 3104 (unsigned long long) from_cblock(cache->cache_size), 3105 (unsigned int) atomic_read(&cache->stats.read_hit), 3106 (unsigned int) atomic_read(&cache->stats.read_miss), 3107 (unsigned int) atomic_read(&cache->stats.write_hit), 3108 (unsigned int) atomic_read(&cache->stats.write_miss), 3109 (unsigned int) atomic_read(&cache->stats.demotion), 3110 (unsigned int) atomic_read(&cache->stats.promotion), 3111 (unsigned long) atomic_read(&cache->nr_dirty)); 3112 3113 emit_flags(cache, result, maxlen, &sz); 3114 3115 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3116 3117 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3118 if (sz < maxlen) { 3119 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3120 if (r) 3121 DMERR("%s: policy_emit_config_values returned %d", 3122 cache_device_name(cache), r); 3123 } 3124 3125 if (get_cache_mode(cache) == CM_READ_ONLY) 3126 DMEMIT("ro "); 3127 else 3128 DMEMIT("rw "); 3129 3130 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3131 3132 if (r || needs_check) 3133 DMEMIT("needs_check "); 3134 else 3135 DMEMIT("- "); 3136 3137 break; 3138 3139 case STATUSTYPE_TABLE: 3140 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3141 DMEMIT("%s ", buf); 3142 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3143 DMEMIT("%s ", buf); 3144 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3145 DMEMIT("%s", buf); 3146 3147 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3148 DMEMIT(" %s", cache->ctr_args[i]); 3149 if (cache->nr_ctr_args) 3150 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3151 break; 3152 3153 case STATUSTYPE_IMA: 3154 DMEMIT_TARGET_NAME_VERSION(ti->type); 3155 if (get_cache_mode(cache) == CM_FAIL) 3156 DMEMIT(",metadata_mode=fail"); 3157 else if (get_cache_mode(cache) == CM_READ_ONLY) 3158 DMEMIT(",metadata_mode=ro"); 3159 else 3160 DMEMIT(",metadata_mode=rw"); 3161 3162 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3163 DMEMIT(",cache_metadata_device=%s", buf); 3164 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3165 DMEMIT(",cache_device=%s", buf); 3166 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3167 DMEMIT(",cache_origin_device=%s", buf); 3168 DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n'); 3169 DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n'); 3170 DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n'); 3171 DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n'); 3172 DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y'); 3173 DMEMIT(";"); 3174 break; 3175 } 3176 3177 return; 3178 3179 err: 3180 DMEMIT("Error"); 3181 } 3182 3183 /* 3184 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3185 * the one-past-the-end value. 3186 */ 3187 struct cblock_range { 3188 dm_cblock_t begin; 3189 dm_cblock_t end; 3190 }; 3191 3192 /* 3193 * A cache block range can take two forms: 3194 * 3195 * i) A single cblock, eg. '3456' 3196 * ii) A begin and end cblock with a dash between, eg. 123-234 3197 */ 3198 static int parse_cblock_range(struct cache *cache, const char *str, 3199 struct cblock_range *result) 3200 { 3201 char dummy; 3202 uint64_t b, e; 3203 int r; 3204 3205 /* 3206 * Try and parse form (ii) first. 
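 * The trailing %c is deliberate: sscanf() returning exactly 2 means the
 * input was '<begin>-<end>' with nothing following it.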
3207 */ 3208 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3209 if (r < 0) 3210 return r; 3211 3212 if (r == 2) { 3213 result->begin = to_cblock(b); 3214 result->end = to_cblock(e); 3215 return 0; 3216 } 3217 3218 /* 3219 * That didn't work, try form (i). 3220 */ 3221 r = sscanf(str, "%llu%c", &b, &dummy); 3222 if (r < 0) 3223 return r; 3224 3225 if (r == 1) { 3226 result->begin = to_cblock(b); 3227 result->end = to_cblock(from_cblock(result->begin) + 1u); 3228 return 0; 3229 } 3230 3231 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3232 return -EINVAL; 3233 } 3234 3235 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3236 { 3237 uint64_t b = from_cblock(range->begin); 3238 uint64_t e = from_cblock(range->end); 3239 uint64_t n = from_cblock(cache->cache_size); 3240 3241 if (b >= n) { 3242 DMERR("%s: begin cblock out of range: %llu >= %llu", 3243 cache_device_name(cache), b, n); 3244 return -EINVAL; 3245 } 3246 3247 if (e > n) { 3248 DMERR("%s: end cblock out of range: %llu > %llu", 3249 cache_device_name(cache), e, n); 3250 return -EINVAL; 3251 } 3252 3253 if (b >= e) { 3254 DMERR("%s: invalid cblock range: %llu >= %llu", 3255 cache_device_name(cache), b, e); 3256 return -EINVAL; 3257 } 3258 3259 return 0; 3260 } 3261 3262 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3263 { 3264 return to_cblock(from_cblock(b) + 1); 3265 } 3266 3267 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3268 { 3269 int r = 0; 3270 3271 /* 3272 * We don't need to do any locking here because we know we're in 3273 * passthrough mode. There is potential for a race between an 3274 * invalidation triggered by an io and an invalidation message. This 3275 * is harmless, we needn't worry if the policy call fails. 3276 */ 3277 while (range->begin != range->end) { 3278 r = invalidate_cblock(cache, range->begin); 3279 if (r) 3280 return r; 3281 3282 range->begin = cblock_succ(range->begin); 3283 } 3284 3285 cache->commit_requested = true; 3286 return r; 3287 } 3288 3289 static int process_invalidate_cblocks_message(struct cache *cache, unsigned int count, 3290 const char **cblock_ranges) 3291 { 3292 int r = 0; 3293 unsigned int i; 3294 struct cblock_range range; 3295 3296 if (!passthrough_mode(cache)) { 3297 DMERR("%s: cache has to be in passthrough mode for invalidation", 3298 cache_device_name(cache)); 3299 return -EPERM; 3300 } 3301 3302 for (i = 0; i < count; i++) { 3303 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3304 if (r) 3305 break; 3306 3307 r = validate_cblock_range(cache, &range); 3308 if (r) 3309 break; 3310 3311 /* 3312 * Invalidate each cblock in this range. 3313 */ 3314 r = request_invalidation(cache, &range); 3315 if (r) 3316 break; 3317 } 3318 3319 return r; 3320 } 3321 3322 /* 3323 * Supports 3324 * "<key> <value>" 3325 * and 3326 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*" 3327 * 3328 * The key migration_threshold is supported by the cache target core.
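 *
 * For example (hypothetical dm device name):
 *   dmsetup message my-cache 0 migration_threshold 2048
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 128-256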
3329 */ 3330 static int cache_message(struct dm_target *ti, unsigned int argc, char **argv, 3331 char *result, unsigned int maxlen) 3332 { 3333 struct cache *cache = ti->private; 3334 3335 if (!argc) 3336 return -EINVAL; 3337 3338 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3339 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3340 cache_device_name(cache)); 3341 return -EOPNOTSUPP; 3342 } 3343 3344 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3345 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3346 3347 if (argc != 2) 3348 return -EINVAL; 3349 3350 return set_config_value(cache, argv[0], argv[1]); 3351 } 3352 3353 static int cache_iterate_devices(struct dm_target *ti, 3354 iterate_devices_callout_fn fn, void *data) 3355 { 3356 int r = 0; 3357 struct cache *cache = ti->private; 3358 3359 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3360 if (!r) 3361 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3362 3363 return r; 3364 } 3365 3366 /* 3367 * If discard_passdown was enabled verify that the origin device 3368 * supports discards. Disable discard_passdown if not. 3369 */ 3370 static void disable_passdown_if_not_supported(struct cache *cache) 3371 { 3372 struct block_device *origin_bdev = cache->origin_dev->bdev; 3373 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3374 const char *reason = NULL; 3375 3376 if (!cache->features.discard_passdown) 3377 return; 3378 3379 if (!bdev_max_discard_sectors(origin_bdev)) 3380 reason = "discard unsupported"; 3381 3382 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3383 reason = "max discard sectors smaller than a block"; 3384 3385 if (reason) { 3386 DMWARN("Origin device (%pg) %s: Disabling discard passdown.", 3387 origin_bdev, reason); 3388 cache->features.discard_passdown = false; 3389 } 3390 } 3391 3392 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3393 { 3394 struct block_device *origin_bdev = cache->origin_dev->bdev; 3395 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3396 3397 if (!cache->features.discard_passdown) { 3398 /* No passdown is done so setting own virtual limits */ 3399 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3400 cache->origin_sectors); 3401 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3402 return; 3403 } 3404 3405 /* 3406 * cache_iterate_devices() is stacking both origin and fast device limits 3407 * but discards aren't passed to fast device, so inherit origin's limits. 3408 */ 3409 limits->max_discard_sectors = origin_limits->max_discard_sectors; 3410 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3411 limits->discard_granularity = origin_limits->discard_granularity; 3412 limits->discard_alignment = origin_limits->discard_alignment; 3413 limits->discard_misaligned = origin_limits->discard_misaligned; 3414 } 3415 3416 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3417 { 3418 struct cache *cache = ti->private; 3419 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3420 3421 /* 3422 * If the system-determined stacked limits are compatible with the 3423 * cache's blocksize (io_opt is a factor) do not override them. 
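 * e.g. with 512-sector cache blocks a stacked io_opt of 2048 sectors (a
 * multiple) is left alone, whereas 768 sectors would be overridden.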
3424 */ 3425 if (io_opt_sectors < cache->sectors_per_block || 3426 do_div(io_opt_sectors, cache->sectors_per_block)) { 3427 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3428 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3429 } 3430 3431 disable_passdown_if_not_supported(cache); 3432 set_discard_limits(cache, limits); 3433 } 3434 3435 /*----------------------------------------------------------------*/ 3436 3437 static struct target_type cache_target = { 3438 .name = "cache", 3439 .version = {2, 2, 0}, 3440 .module = THIS_MODULE, 3441 .ctr = cache_ctr, 3442 .dtr = cache_dtr, 3443 .map = cache_map, 3444 .end_io = cache_end_io, 3445 .postsuspend = cache_postsuspend, 3446 .preresume = cache_preresume, 3447 .resume = cache_resume, 3448 .status = cache_status, 3449 .message = cache_message, 3450 .iterate_devices = cache_iterate_devices, 3451 .io_hints = cache_io_hints, 3452 }; 3453 3454 static int __init dm_cache_init(void) 3455 { 3456 int r; 3457 3458 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3459 if (!migration_cache) { 3460 r = -ENOMEM; 3461 goto err; 3462 } 3463 3464 btracker_work_cache = kmem_cache_create("dm_cache_bt_work", 3465 sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL); 3466 if (!btracker_work_cache) { 3467 r = -ENOMEM; 3468 goto err; 3469 } 3470 3471 r = dm_register_target(&cache_target); 3472 if (r) { 3473 goto err; 3474 } 3475 3476 return 0; 3477 3478 err: 3479 kmem_cache_destroy(migration_cache); 3480 kmem_cache_destroy(btracker_work_cache); 3481 return r; 3482 } 3483 3484 static void __exit dm_cache_exit(void) 3485 { 3486 dm_unregister_target(&cache_target); 3487 kmem_cache_destroy(migration_cache); 3488 kmem_cache_destroy(btracker_work_cache); 3489 } 3490 3491 module_init(dm_cache_init); 3492 module_exit(dm_cache_exit); 3493 3494 MODULE_DESCRIPTION(DM_NAME " cache target"); 3495 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3496 MODULE_LICENSE("GPL"); 3497