1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison-v2.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/rwsem.h> 19 #include <linux/slab.h> 20 #include <linux/vmalloc.h> 21 22 #define DM_MSG_PREFIX "cache" 23 24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27 /*----------------------------------------------------------------*/ 28 29 /* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40 /*----------------------------------------------------------------*/ 41 42 struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56 }; 57 58 static void iot_init(struct io_tracker *iot) 59 { 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64 } 65 66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67 { 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72 } 73 74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75 { 76 bool r; 77 78 spin_lock_irq(&iot->lock); 79 r = __iot_idle_for(iot, jifs); 80 spin_unlock_irq(&iot->lock); 81 82 return r; 83 } 84 85 static void iot_io_begin(struct io_tracker *iot, sector_t len) 86 { 87 spin_lock_irq(&iot->lock); 88 iot->in_flight += len; 89 spin_unlock_irq(&iot->lock); 90 } 91 92 static void __iot_io_end(struct io_tracker *iot, sector_t len) 93 { 94 if (!len) 95 return; 96 97 iot->in_flight -= len; 98 if (!iot->in_flight) 99 iot->idle_time = jiffies; 100 } 101 102 static void iot_io_end(struct io_tracker *iot, sector_t len) 103 { 104 unsigned long flags; 105 106 spin_lock_irqsave(&iot->lock, flags); 107 __iot_io_end(iot, len); 108 spin_unlock_irqrestore(&iot->lock, flags); 109 } 110 111 /*----------------------------------------------------------------*/ 112 113 /* 114 * Represents a chunk of future work. 'input' allows continuations to pass 115 * values between themselves, typically error values. 116 */ 117 struct continuation { 118 struct work_struct ws; 119 blk_status_t input; 120 }; 121 122 static inline void init_continuation(struct continuation *k, 123 void (*fn)(struct work_struct *)) 124 { 125 INIT_WORK(&k->ws, fn); 126 k->input = 0; 127 } 128 129 static inline void queue_continuation(struct workqueue_struct *wq, 130 struct continuation *k) 131 { 132 queue_work(wq, &k->ws); 133 } 134 135 /*----------------------------------------------------------------*/ 136 137 /* 138 * The batcher collects together pieces of work that need a particular 139 * operation to occur before they can proceed (typically a commit). 140 */ 141 struct batcher { 142 /* 143 * The operation that everyone is waiting for. 
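	 * For this target it is the metadata commit (see commit_op() below).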
144 */ 145 blk_status_t (*commit_op)(void *context); 146 void *commit_context; 147 148 /* 149 * This is how bios should be issued once the commit op is complete 150 * (accounted_request). 151 */ 152 void (*issue_op)(struct bio *bio, void *context); 153 void *issue_context; 154 155 /* 156 * Queued work gets put on here after commit. 157 */ 158 struct workqueue_struct *wq; 159 160 spinlock_t lock; 161 struct list_head work_items; 162 struct bio_list bios; 163 struct work_struct commit_work; 164 165 bool commit_scheduled; 166 }; 167 168 static void __commit(struct work_struct *_ws) 169 { 170 struct batcher *b = container_of(_ws, struct batcher, commit_work); 171 blk_status_t r; 172 struct list_head work_items; 173 struct work_struct *ws, *tmp; 174 struct continuation *k; 175 struct bio *bio; 176 struct bio_list bios; 177 178 INIT_LIST_HEAD(&work_items); 179 bio_list_init(&bios); 180 181 /* 182 * We have to grab these before the commit_op to avoid a race 183 * condition. 184 */ 185 spin_lock_irq(&b->lock); 186 list_splice_init(&b->work_items, &work_items); 187 bio_list_merge(&bios, &b->bios); 188 bio_list_init(&b->bios); 189 b->commit_scheduled = false; 190 spin_unlock_irq(&b->lock); 191 192 r = b->commit_op(b->commit_context); 193 194 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 195 k = container_of(ws, struct continuation, ws); 196 k->input = r; 197 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 198 queue_work(b->wq, ws); 199 } 200 201 while ((bio = bio_list_pop(&bios))) { 202 if (r) { 203 bio->bi_status = r; 204 bio_endio(bio); 205 } else 206 b->issue_op(bio, b->issue_context); 207 } 208 } 209 210 static void batcher_init(struct batcher *b, 211 blk_status_t (*commit_op)(void *), 212 void *commit_context, 213 void (*issue_op)(struct bio *bio, void *), 214 void *issue_context, 215 struct workqueue_struct *wq) 216 { 217 b->commit_op = commit_op; 218 b->commit_context = commit_context; 219 b->issue_op = issue_op; 220 b->issue_context = issue_context; 221 b->wq = wq; 222 223 spin_lock_init(&b->lock); 224 INIT_LIST_HEAD(&b->work_items); 225 bio_list_init(&b->bios); 226 INIT_WORK(&b->commit_work, __commit); 227 b->commit_scheduled = false; 228 } 229 230 static void async_commit(struct batcher *b) 231 { 232 queue_work(b->wq, &b->commit_work); 233 } 234 235 static void continue_after_commit(struct batcher *b, struct continuation *k) 236 { 237 bool commit_scheduled; 238 239 spin_lock_irq(&b->lock); 240 commit_scheduled = b->commit_scheduled; 241 list_add_tail(&k->ws.entry, &b->work_items); 242 spin_unlock_irq(&b->lock); 243 244 if (commit_scheduled) 245 async_commit(b); 246 } 247 248 /* 249 * Bios are errored if commit failed. 250 */ 251 static void issue_after_commit(struct batcher *b, struct bio *bio) 252 { 253 bool commit_scheduled; 254 255 spin_lock_irq(&b->lock); 256 commit_scheduled = b->commit_scheduled; 257 bio_list_add(&b->bios, bio); 258 spin_unlock_irq(&b->lock); 259 260 if (commit_scheduled) 261 async_commit(b); 262 } 263 264 /* 265 * Call this if some urgent work is waiting for the commit to complete. 266 */ 267 static void schedule_commit(struct batcher *b) 268 { 269 bool immediate; 270 271 spin_lock_irq(&b->lock); 272 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); 273 b->commit_scheduled = true; 274 spin_unlock_irq(&b->lock); 275 276 if (immediate) 277 async_commit(b); 278 } 279 280 /* 281 * There are a couple of places where we let a bio run, but want to do some 282 * work before calling its endio function. 
We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only. These blocks are marked
	 * dirty. If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin. Blocks are never
	 * dirty. Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots). Reads and writes always go to the
	 * origin. If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned metadata_version;
	bool discard_passdown:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	spinlock_t lock;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	int sectors_per_block_shift;
	sector_t sectors_per_block;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices. Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices. Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
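	 * They are throttled against migration_threshold in
	 * spare_migration_bandwidth().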
417 */ 418 atomic_t nr_io_migrations; 419 420 struct bio_list deferred_bios; 421 422 struct rw_semaphore quiesce_lock; 423 424 /* 425 * origin_blocks entries, discarded if set. 426 */ 427 dm_dblock_t discard_nr_blocks; 428 unsigned long *discard_bitset; 429 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 430 431 /* 432 * Rather than reconstructing the table line for the status we just 433 * save it and regurgitate. 434 */ 435 unsigned nr_ctr_args; 436 const char **ctr_args; 437 438 struct dm_kcopyd_client *copier; 439 struct work_struct deferred_bio_worker; 440 struct work_struct migration_worker; 441 struct workqueue_struct *wq; 442 struct delayed_work waker; 443 struct dm_bio_prison_v2 *prison; 444 445 /* 446 * cache_size entries, dirty if set 447 */ 448 unsigned long *dirty_bitset; 449 atomic_t nr_dirty; 450 451 unsigned policy_nr_args; 452 struct dm_cache_policy *policy; 453 454 /* 455 * Cache features such as write-through. 456 */ 457 struct cache_features features; 458 459 struct cache_stats stats; 460 461 bool need_tick_bio:1; 462 bool sized:1; 463 bool invalidate:1; 464 bool commit_requested:1; 465 bool loaded_mappings:1; 466 bool loaded_discards:1; 467 468 struct rw_semaphore background_work_lock; 469 470 struct batcher committer; 471 struct work_struct commit_ws; 472 473 struct io_tracker tracker; 474 475 mempool_t migration_pool; 476 477 struct bio_set bs; 478 }; 479 480 struct per_bio_data { 481 bool tick:1; 482 unsigned req_nr:2; 483 struct dm_bio_prison_cell_v2 *cell; 484 struct dm_hook_info hook_info; 485 sector_t len; 486 }; 487 488 struct dm_cache_migration { 489 struct continuation k; 490 struct cache *cache; 491 492 struct policy_work *op; 493 struct bio *overwrite_bio; 494 struct dm_bio_prison_cell_v2 *cell; 495 496 dm_cblock_t invalidate_cblock; 497 dm_oblock_t invalidate_oblock; 498 }; 499 500 /*----------------------------------------------------------------*/ 501 502 static bool writethrough_mode(struct cache *cache) 503 { 504 return cache->features.io_mode == CM_IO_WRITETHROUGH; 505 } 506 507 static bool writeback_mode(struct cache *cache) 508 { 509 return cache->features.io_mode == CM_IO_WRITEBACK; 510 } 511 512 static inline bool passthrough_mode(struct cache *cache) 513 { 514 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 515 } 516 517 /*----------------------------------------------------------------*/ 518 519 static void wake_deferred_bio_worker(struct cache *cache) 520 { 521 queue_work(cache->wq, &cache->deferred_bio_worker); 522 } 523 524 static void wake_migration_worker(struct cache *cache) 525 { 526 if (passthrough_mode(cache)) 527 return; 528 529 queue_work(cache->wq, &cache->migration_worker); 530 } 531 532 /*----------------------------------------------------------------*/ 533 534 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 535 { 536 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 537 } 538 539 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 540 { 541 dm_bio_prison_free_cell_v2(cache->prison, cell); 542 } 543 544 static struct dm_cache_migration *alloc_migration(struct cache *cache) 545 { 546 struct dm_cache_migration *mg; 547 548 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 549 550 memset(mg, 0, sizeof(*mg)); 551 552 mg->cache = cache; 553 atomic_inc(&cache->nr_allocated_migrations); 554 555 return mg; 556 } 557 558 static void free_migration(struct dm_cache_migration *mg) 559 { 560 struct cache *cache = mg->cache; 561 
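	/*
	 * Dropping the last allocated migration wakes any waiter on
	 * migration_wait (presumably the suspend path, which is outside
	 * this excerpt).
	 */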
562 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 563 wake_up(&cache->migration_wait); 564 565 mempool_free(mg, &cache->migration_pool); 566 } 567 568 /*----------------------------------------------------------------*/ 569 570 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 571 { 572 return to_oblock(from_oblock(b) + 1ull); 573 } 574 575 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 576 { 577 key->virtual = 0; 578 key->dev = 0; 579 key->block_begin = from_oblock(begin); 580 key->block_end = from_oblock(end); 581 } 582 583 /* 584 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 585 * level 1 which prevents *both* READs and WRITEs. 586 */ 587 #define WRITE_LOCK_LEVEL 0 588 #define READ_WRITE_LOCK_LEVEL 1 589 590 static unsigned lock_level(struct bio *bio) 591 { 592 return bio_data_dir(bio) == WRITE ? 593 WRITE_LOCK_LEVEL : 594 READ_WRITE_LOCK_LEVEL; 595 } 596 597 /*---------------------------------------------------------------- 598 * Per bio data 599 *--------------------------------------------------------------*/ 600 601 static struct per_bio_data *get_per_bio_data(struct bio *bio) 602 { 603 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 604 BUG_ON(!pb); 605 return pb; 606 } 607 608 static struct per_bio_data *init_per_bio_data(struct bio *bio) 609 { 610 struct per_bio_data *pb = get_per_bio_data(bio); 611 612 pb->tick = false; 613 pb->req_nr = dm_bio_get_target_bio_nr(bio); 614 pb->cell = NULL; 615 pb->len = 0; 616 617 return pb; 618 } 619 620 /*----------------------------------------------------------------*/ 621 622 static void defer_bio(struct cache *cache, struct bio *bio) 623 { 624 spin_lock_irq(&cache->lock); 625 bio_list_add(&cache->deferred_bios, bio); 626 spin_unlock_irq(&cache->lock); 627 628 wake_deferred_bio_worker(cache); 629 } 630 631 static void defer_bios(struct cache *cache, struct bio_list *bios) 632 { 633 spin_lock_irq(&cache->lock); 634 bio_list_merge(&cache->deferred_bios, bios); 635 bio_list_init(bios); 636 spin_unlock_irq(&cache->lock); 637 638 wake_deferred_bio_worker(cache); 639 } 640 641 /*----------------------------------------------------------------*/ 642 643 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 644 { 645 bool r; 646 struct per_bio_data *pb; 647 struct dm_cell_key_v2 key; 648 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 649 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 650 651 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 652 653 build_key(oblock, end, &key); 654 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 655 if (!r) { 656 /* 657 * Failed to get the lock. 
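		 * The bio stays queued on the cell and will be deferred
		 * back to us when the exclusive holder unlocks it (see
		 * mg_complete()).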
658 */ 659 free_prison_cell(cache, cell_prealloc); 660 return r; 661 } 662 663 if (cell != cell_prealloc) 664 free_prison_cell(cache, cell_prealloc); 665 666 pb = get_per_bio_data(bio); 667 pb->cell = cell; 668 669 return r; 670 } 671 672 /*----------------------------------------------------------------*/ 673 674 static bool is_dirty(struct cache *cache, dm_cblock_t b) 675 { 676 return test_bit(from_cblock(b), cache->dirty_bitset); 677 } 678 679 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 680 { 681 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 682 atomic_inc(&cache->nr_dirty); 683 policy_set_dirty(cache->policy, cblock); 684 } 685 } 686 687 /* 688 * These two are called when setting after migrations to force the policy 689 * and dirty bitset to be in sync. 690 */ 691 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 692 { 693 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 694 atomic_inc(&cache->nr_dirty); 695 policy_set_dirty(cache->policy, cblock); 696 } 697 698 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 699 { 700 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 701 if (atomic_dec_return(&cache->nr_dirty) == 0) 702 dm_table_event(cache->ti->table); 703 } 704 705 policy_clear_dirty(cache->policy, cblock); 706 } 707 708 /*----------------------------------------------------------------*/ 709 710 static bool block_size_is_power_of_two(struct cache *cache) 711 { 712 return cache->sectors_per_block_shift >= 0; 713 } 714 715 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 716 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 717 __always_inline 718 #endif 719 static dm_block_t block_div(dm_block_t b, uint32_t n) 720 { 721 do_div(b, n); 722 723 return b; 724 } 725 726 static dm_block_t oblocks_per_dblock(struct cache *cache) 727 { 728 dm_block_t oblocks = cache->discard_block_size; 729 730 if (block_size_is_power_of_two(cache)) 731 oblocks >>= cache->sectors_per_block_shift; 732 else 733 oblocks = block_div(oblocks, cache->sectors_per_block); 734 735 return oblocks; 736 } 737 738 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 739 { 740 return to_dblock(block_div(from_oblock(oblock), 741 oblocks_per_dblock(cache))); 742 } 743 744 static void set_discard(struct cache *cache, dm_dblock_t b) 745 { 746 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 747 atomic_inc(&cache->stats.discard_count); 748 749 spin_lock_irq(&cache->lock); 750 set_bit(from_dblock(b), cache->discard_bitset); 751 spin_unlock_irq(&cache->lock); 752 } 753 754 static void clear_discard(struct cache *cache, dm_dblock_t b) 755 { 756 spin_lock_irq(&cache->lock); 757 clear_bit(from_dblock(b), cache->discard_bitset); 758 spin_unlock_irq(&cache->lock); 759 } 760 761 static bool is_discarded(struct cache *cache, dm_dblock_t b) 762 { 763 int r; 764 spin_lock_irq(&cache->lock); 765 r = test_bit(from_dblock(b), cache->discard_bitset); 766 spin_unlock_irq(&cache->lock); 767 768 return r; 769 } 770 771 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 772 { 773 int r; 774 spin_lock_irq(&cache->lock); 775 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 776 cache->discard_bitset); 777 spin_unlock_irq(&cache->lock); 778 779 return r; 780 } 781 782 /*---------------------------------------------------------------- 783 * Remapping 784 *--------------------------------------------------------------*/ 785 static void 
remap_to_origin(struct cache *cache, struct bio *bio) 786 { 787 bio_set_dev(bio, cache->origin_dev->bdev); 788 } 789 790 static void remap_to_cache(struct cache *cache, struct bio *bio, 791 dm_cblock_t cblock) 792 { 793 sector_t bi_sector = bio->bi_iter.bi_sector; 794 sector_t block = from_cblock(cblock); 795 796 bio_set_dev(bio, cache->cache_dev->bdev); 797 if (!block_size_is_power_of_two(cache)) 798 bio->bi_iter.bi_sector = 799 (block * cache->sectors_per_block) + 800 sector_div(bi_sector, cache->sectors_per_block); 801 else 802 bio->bi_iter.bi_sector = 803 (block << cache->sectors_per_block_shift) | 804 (bi_sector & (cache->sectors_per_block - 1)); 805 } 806 807 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 808 { 809 struct per_bio_data *pb; 810 811 spin_lock_irq(&cache->lock); 812 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 813 bio_op(bio) != REQ_OP_DISCARD) { 814 pb = get_per_bio_data(bio); 815 pb->tick = true; 816 cache->need_tick_bio = false; 817 } 818 spin_unlock_irq(&cache->lock); 819 } 820 821 static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 822 dm_oblock_t oblock, bool bio_has_pbd) 823 { 824 if (bio_has_pbd) 825 check_if_tick_bio_needed(cache, bio); 826 remap_to_origin(cache, bio); 827 if (bio_data_dir(bio) == WRITE) 828 clear_discard(cache, oblock_to_dblock(cache, oblock)); 829 } 830 831 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 832 dm_oblock_t oblock) 833 { 834 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 835 __remap_to_origin_clear_discard(cache, bio, oblock, true); 836 } 837 838 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 839 dm_oblock_t oblock, dm_cblock_t cblock) 840 { 841 check_if_tick_bio_needed(cache, bio); 842 remap_to_cache(cache, bio, cblock); 843 if (bio_data_dir(bio) == WRITE) { 844 set_dirty(cache, cblock); 845 clear_discard(cache, oblock_to_dblock(cache, oblock)); 846 } 847 } 848 849 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 850 { 851 sector_t block_nr = bio->bi_iter.bi_sector; 852 853 if (!block_size_is_power_of_two(cache)) 854 (void) sector_div(block_nr, cache->sectors_per_block); 855 else 856 block_nr >>= cache->sectors_per_block_shift; 857 858 return to_oblock(block_nr); 859 } 860 861 static bool accountable_bio(struct cache *cache, struct bio *bio) 862 { 863 return bio_op(bio) != REQ_OP_DISCARD; 864 } 865 866 static void accounted_begin(struct cache *cache, struct bio *bio) 867 { 868 struct per_bio_data *pb; 869 870 if (accountable_bio(cache, bio)) { 871 pb = get_per_bio_data(bio); 872 pb->len = bio_sectors(bio); 873 iot_io_begin(&cache->tracker, pb->len); 874 } 875 } 876 877 static void accounted_complete(struct cache *cache, struct bio *bio) 878 { 879 struct per_bio_data *pb = get_per_bio_data(bio); 880 881 iot_io_end(&cache->tracker, pb->len); 882 } 883 884 static void accounted_request(struct cache *cache, struct bio *bio) 885 { 886 accounted_begin(cache, bio); 887 submit_bio_noacct(bio); 888 } 889 890 static void issue_op(struct bio *bio, void *context) 891 { 892 struct cache *cache = context; 893 accounted_request(cache, bio); 894 } 895 896 /* 897 * When running in writethrough mode we need to send writes to clean blocks 898 * to both the cache and origin devices. Clone the bio and send them in parallel. 
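 * The clone is chained to the original bio, so the original does not
 * complete until both submissions have finished.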
899 */ 900 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 901 dm_oblock_t oblock, dm_cblock_t cblock) 902 { 903 struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); 904 905 BUG_ON(!origin_bio); 906 907 bio_chain(origin_bio, bio); 908 /* 909 * Passing false to __remap_to_origin_clear_discard() skips 910 * all code that might use per_bio_data (since clone doesn't have it) 911 */ 912 __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); 913 submit_bio(origin_bio); 914 915 remap_to_cache(cache, bio, cblock); 916 } 917 918 /*---------------------------------------------------------------- 919 * Failure modes 920 *--------------------------------------------------------------*/ 921 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 922 { 923 return cache->features.mode; 924 } 925 926 static const char *cache_device_name(struct cache *cache) 927 { 928 return dm_table_device_name(cache->ti->table); 929 } 930 931 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 932 { 933 const char *descs[] = { 934 "write", 935 "read-only", 936 "fail" 937 }; 938 939 dm_table_event(cache->ti->table); 940 DMINFO("%s: switching cache to %s mode", 941 cache_device_name(cache), descs[(int)mode]); 942 } 943 944 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 945 { 946 bool needs_check; 947 enum cache_metadata_mode old_mode = get_cache_mode(cache); 948 949 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 950 DMERR("%s: unable to read needs_check flag, setting failure mode.", 951 cache_device_name(cache)); 952 new_mode = CM_FAIL; 953 } 954 955 if (new_mode == CM_WRITE && needs_check) { 956 DMERR("%s: unable to switch cache to write mode until repaired.", 957 cache_device_name(cache)); 958 if (old_mode != new_mode) 959 new_mode = old_mode; 960 else 961 new_mode = CM_READ_ONLY; 962 } 963 964 /* Never move out of fail mode */ 965 if (old_mode == CM_FAIL) 966 new_mode = CM_FAIL; 967 968 switch (new_mode) { 969 case CM_FAIL: 970 case CM_READ_ONLY: 971 dm_cache_metadata_set_read_only(cache->cmd); 972 break; 973 974 case CM_WRITE: 975 dm_cache_metadata_set_read_write(cache->cmd); 976 break; 977 } 978 979 cache->features.mode = new_mode; 980 981 if (new_mode != old_mode) 982 notify_mode_switch(cache, new_mode); 983 } 984 985 static void abort_transaction(struct cache *cache) 986 { 987 const char *dev_name = cache_device_name(cache); 988 989 if (get_cache_mode(cache) >= CM_READ_ONLY) 990 return; 991 992 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 993 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 994 set_cache_mode(cache, CM_FAIL); 995 } 996 997 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 998 if (dm_cache_metadata_abort(cache->cmd)) { 999 DMERR("%s: failed to abort metadata transaction", dev_name); 1000 set_cache_mode(cache, CM_FAIL); 1001 } 1002 } 1003 1004 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1005 { 1006 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1007 cache_device_name(cache), op, r); 1008 abort_transaction(cache); 1009 set_cache_mode(cache, CM_READ_ONLY); 1010 } 1011 1012 /*----------------------------------------------------------------*/ 1013 1014 static void load_stats(struct cache *cache) 1015 { 1016 struct dm_cache_statistics stats; 1017 1018 dm_cache_metadata_get_stats(cache->cmd, &stats); 1019 atomic_set(&cache->stats.read_hit, stats.read_hits); 1020 
atomic_set(&cache->stats.read_miss, stats.read_misses); 1021 atomic_set(&cache->stats.write_hit, stats.write_hits); 1022 atomic_set(&cache->stats.write_miss, stats.write_misses); 1023 } 1024 1025 static void save_stats(struct cache *cache) 1026 { 1027 struct dm_cache_statistics stats; 1028 1029 if (get_cache_mode(cache) >= CM_READ_ONLY) 1030 return; 1031 1032 stats.read_hits = atomic_read(&cache->stats.read_hit); 1033 stats.read_misses = atomic_read(&cache->stats.read_miss); 1034 stats.write_hits = atomic_read(&cache->stats.write_hit); 1035 stats.write_misses = atomic_read(&cache->stats.write_miss); 1036 1037 dm_cache_metadata_set_stats(cache->cmd, &stats); 1038 } 1039 1040 static void update_stats(struct cache_stats *stats, enum policy_operation op) 1041 { 1042 switch (op) { 1043 case POLICY_PROMOTE: 1044 atomic_inc(&stats->promotion); 1045 break; 1046 1047 case POLICY_DEMOTE: 1048 atomic_inc(&stats->demotion); 1049 break; 1050 1051 case POLICY_WRITEBACK: 1052 atomic_inc(&stats->writeback); 1053 break; 1054 } 1055 } 1056 1057 /*---------------------------------------------------------------- 1058 * Migration processing 1059 * 1060 * Migration covers moving data from the origin device to the cache, or 1061 * vice versa. 1062 *--------------------------------------------------------------*/ 1063 1064 static void inc_io_migrations(struct cache *cache) 1065 { 1066 atomic_inc(&cache->nr_io_migrations); 1067 } 1068 1069 static void dec_io_migrations(struct cache *cache) 1070 { 1071 atomic_dec(&cache->nr_io_migrations); 1072 } 1073 1074 static bool discard_or_flush(struct bio *bio) 1075 { 1076 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1077 } 1078 1079 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1080 dm_dblock_t *b, dm_dblock_t *e) 1081 { 1082 sector_t sb = bio->bi_iter.bi_sector; 1083 sector_t se = bio_end_sector(bio); 1084 1085 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1086 1087 if (se - sb < cache->discard_block_size) 1088 *e = *b; 1089 else 1090 *e = to_dblock(block_div(se, cache->discard_block_size)); 1091 } 1092 1093 /*----------------------------------------------------------------*/ 1094 1095 static void prevent_background_work(struct cache *cache) 1096 { 1097 lockdep_off(); 1098 down_write(&cache->background_work_lock); 1099 lockdep_on(); 1100 } 1101 1102 static void allow_background_work(struct cache *cache) 1103 { 1104 lockdep_off(); 1105 up_write(&cache->background_work_lock); 1106 lockdep_on(); 1107 } 1108 1109 static bool background_work_begin(struct cache *cache) 1110 { 1111 bool r; 1112 1113 lockdep_off(); 1114 r = down_read_trylock(&cache->background_work_lock); 1115 lockdep_on(); 1116 1117 return r; 1118 } 1119 1120 static void background_work_end(struct cache *cache) 1121 { 1122 lockdep_off(); 1123 up_read(&cache->background_work_lock); 1124 lockdep_on(); 1125 } 1126 1127 /*----------------------------------------------------------------*/ 1128 1129 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1130 { 1131 return (bio_data_dir(bio) == WRITE) && 1132 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1133 } 1134 1135 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1136 { 1137 return writeback_mode(cache) && 1138 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1139 } 1140 1141 static void quiesce(struct dm_cache_migration *mg, 1142 void (*continuation)(struct work_struct *)) 1143 { 1144 
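	/*
	 * Stash the continuation in the migration, then hand it to the
	 * prison so it can be queued once existing holders of the cell
	 * have dropped their locks.
	 */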
init_continuation(&mg->k, continuation); 1145 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1146 } 1147 1148 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1149 { 1150 struct continuation *k = container_of(ws, struct continuation, ws); 1151 return container_of(k, struct dm_cache_migration, k); 1152 } 1153 1154 static void copy_complete(int read_err, unsigned long write_err, void *context) 1155 { 1156 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1157 1158 if (read_err || write_err) 1159 mg->k.input = BLK_STS_IOERR; 1160 1161 queue_continuation(mg->cache->wq, &mg->k); 1162 } 1163 1164 static void copy(struct dm_cache_migration *mg, bool promote) 1165 { 1166 struct dm_io_region o_region, c_region; 1167 struct cache *cache = mg->cache; 1168 1169 o_region.bdev = cache->origin_dev->bdev; 1170 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1171 o_region.count = cache->sectors_per_block; 1172 1173 c_region.bdev = cache->cache_dev->bdev; 1174 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1175 c_region.count = cache->sectors_per_block; 1176 1177 if (promote) 1178 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1179 else 1180 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1181 } 1182 1183 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1184 { 1185 struct per_bio_data *pb = get_per_bio_data(bio); 1186 1187 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1188 free_prison_cell(cache, pb->cell); 1189 pb->cell = NULL; 1190 } 1191 1192 static void overwrite_endio(struct bio *bio) 1193 { 1194 struct dm_cache_migration *mg = bio->bi_private; 1195 struct cache *cache = mg->cache; 1196 struct per_bio_data *pb = get_per_bio_data(bio); 1197 1198 dm_unhook_bio(&pb->hook_info, bio); 1199 1200 if (bio->bi_status) 1201 mg->k.input = bio->bi_status; 1202 1203 queue_continuation(cache->wq, &mg->k); 1204 } 1205 1206 static void overwrite(struct dm_cache_migration *mg, 1207 void (*continuation)(struct work_struct *)) 1208 { 1209 struct bio *bio = mg->overwrite_bio; 1210 struct per_bio_data *pb = get_per_bio_data(bio); 1211 1212 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1213 1214 /* 1215 * The overwrite bio is part of the copy operation, as such it does 1216 * not set/clear discard or dirty flags. 
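	 * mg_complete() forces the dirty/discard state back into sync
	 * once the outcome of the migration is known.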
1217 */ 1218 if (mg->op->op == POLICY_PROMOTE) 1219 remap_to_cache(mg->cache, bio, mg->op->cblock); 1220 else 1221 remap_to_origin(mg->cache, bio); 1222 1223 init_continuation(&mg->k, continuation); 1224 accounted_request(mg->cache, bio); 1225 } 1226 1227 /* 1228 * Migration steps: 1229 * 1230 * 1) exclusive lock preventing WRITEs 1231 * 2) quiesce 1232 * 3) copy or issue overwrite bio 1233 * 4) upgrade to exclusive lock preventing READs and WRITEs 1234 * 5) quiesce 1235 * 6) update metadata and commit 1236 * 7) unlock 1237 */ 1238 static void mg_complete(struct dm_cache_migration *mg, bool success) 1239 { 1240 struct bio_list bios; 1241 struct cache *cache = mg->cache; 1242 struct policy_work *op = mg->op; 1243 dm_cblock_t cblock = op->cblock; 1244 1245 if (success) 1246 update_stats(&cache->stats, op->op); 1247 1248 switch (op->op) { 1249 case POLICY_PROMOTE: 1250 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1251 policy_complete_background_work(cache->policy, op, success); 1252 1253 if (mg->overwrite_bio) { 1254 if (success) 1255 force_set_dirty(cache, cblock); 1256 else if (mg->k.input) 1257 mg->overwrite_bio->bi_status = mg->k.input; 1258 else 1259 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1260 bio_endio(mg->overwrite_bio); 1261 } else { 1262 if (success) 1263 force_clear_dirty(cache, cblock); 1264 dec_io_migrations(cache); 1265 } 1266 break; 1267 1268 case POLICY_DEMOTE: 1269 /* 1270 * We clear dirty here to update the nr_dirty counter. 1271 */ 1272 if (success) 1273 force_clear_dirty(cache, cblock); 1274 policy_complete_background_work(cache->policy, op, success); 1275 dec_io_migrations(cache); 1276 break; 1277 1278 case POLICY_WRITEBACK: 1279 if (success) 1280 force_clear_dirty(cache, cblock); 1281 policy_complete_background_work(cache->policy, op, success); 1282 dec_io_migrations(cache); 1283 break; 1284 } 1285 1286 bio_list_init(&bios); 1287 if (mg->cell) { 1288 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1289 free_prison_cell(cache, mg->cell); 1290 } 1291 1292 free_migration(mg); 1293 defer_bios(cache, &bios); 1294 wake_migration_worker(cache); 1295 1296 background_work_end(cache); 1297 } 1298 1299 static void mg_success(struct work_struct *ws) 1300 { 1301 struct dm_cache_migration *mg = ws_to_mg(ws); 1302 mg_complete(mg, mg->k.input == 0); 1303 } 1304 1305 static void mg_update_metadata(struct work_struct *ws) 1306 { 1307 int r; 1308 struct dm_cache_migration *mg = ws_to_mg(ws); 1309 struct cache *cache = mg->cache; 1310 struct policy_work *op = mg->op; 1311 1312 switch (op->op) { 1313 case POLICY_PROMOTE: 1314 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1315 if (r) { 1316 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1317 cache_device_name(cache)); 1318 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1319 1320 mg_complete(mg, false); 1321 return; 1322 } 1323 mg_complete(mg, true); 1324 break; 1325 1326 case POLICY_DEMOTE: 1327 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1328 if (r) { 1329 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1330 cache_device_name(cache)); 1331 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1332 1333 mg_complete(mg, false); 1334 return; 1335 } 1336 1337 /* 1338 * It would be nice if we only had to commit when a REQ_FLUSH 1339 * comes through. 
But there's one scenario that we have to
		 * look out for:
		 *
		 * - vblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * roll back to having the data for vblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_full_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	bool is_policy_promote = (op->op == POLICY_PROMOTE);

	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
	    is_discarded_oblock(cache, op->oblock)) {
		mg_upgrade_lock(ws);
		return;
	}

	init_continuation(&mg->k, mg_upgrade_lock);
	copy(mg, is_policy_promote);
}

static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable. So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fall back to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
			mg->overwrite_bio = NULL;
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
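		 * (mg_update_metadata_after_copy() still bails out via
		 * k.input if the overwrite IO failed.)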
1453 */ 1454 overwrite(mg, mg_update_metadata_after_copy); 1455 1456 } else 1457 mg_full_copy(ws); 1458 } 1459 1460 static int mg_lock_writes(struct dm_cache_migration *mg) 1461 { 1462 int r; 1463 struct dm_cell_key_v2 key; 1464 struct cache *cache = mg->cache; 1465 struct dm_bio_prison_cell_v2 *prealloc; 1466 1467 prealloc = alloc_prison_cell(cache); 1468 1469 /* 1470 * Prevent writes to the block, but allow reads to continue. 1471 * Unless we're using an overwrite bio, in which case we lock 1472 * everything. 1473 */ 1474 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1475 r = dm_cell_lock_v2(cache->prison, &key, 1476 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1477 prealloc, &mg->cell); 1478 if (r < 0) { 1479 free_prison_cell(cache, prealloc); 1480 mg_complete(mg, false); 1481 return r; 1482 } 1483 1484 if (mg->cell != prealloc) 1485 free_prison_cell(cache, prealloc); 1486 1487 if (r == 0) 1488 mg_copy(&mg->k.ws); 1489 else 1490 quiesce(mg, mg_copy); 1491 1492 return 0; 1493 } 1494 1495 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1496 { 1497 struct dm_cache_migration *mg; 1498 1499 if (!background_work_begin(cache)) { 1500 policy_complete_background_work(cache->policy, op, false); 1501 return -EPERM; 1502 } 1503 1504 mg = alloc_migration(cache); 1505 1506 mg->op = op; 1507 mg->overwrite_bio = bio; 1508 1509 if (!bio) 1510 inc_io_migrations(cache); 1511 1512 return mg_lock_writes(mg); 1513 } 1514 1515 /*---------------------------------------------------------------- 1516 * invalidation processing 1517 *--------------------------------------------------------------*/ 1518 1519 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1520 { 1521 struct bio_list bios; 1522 struct cache *cache = mg->cache; 1523 1524 bio_list_init(&bios); 1525 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1526 free_prison_cell(cache, mg->cell); 1527 1528 if (!success && mg->overwrite_bio) 1529 bio_io_error(mg->overwrite_bio); 1530 1531 free_migration(mg); 1532 defer_bios(cache, &bios); 1533 1534 background_work_end(cache); 1535 } 1536 1537 static void invalidate_completed(struct work_struct *ws) 1538 { 1539 struct dm_cache_migration *mg = ws_to_mg(ws); 1540 invalidate_complete(mg, !mg->k.input); 1541 } 1542 1543 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1544 { 1545 int r = policy_invalidate_mapping(cache->policy, cblock); 1546 if (!r) { 1547 r = dm_cache_remove_mapping(cache->cmd, cblock); 1548 if (r) { 1549 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1550 cache_device_name(cache)); 1551 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1552 } 1553 1554 } else if (r == -ENODATA) { 1555 /* 1556 * Harmless, already unmapped. 
1557 */ 1558 r = 0; 1559 1560 } else 1561 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1562 1563 return r; 1564 } 1565 1566 static void invalidate_remove(struct work_struct *ws) 1567 { 1568 int r; 1569 struct dm_cache_migration *mg = ws_to_mg(ws); 1570 struct cache *cache = mg->cache; 1571 1572 r = invalidate_cblock(cache, mg->invalidate_cblock); 1573 if (r) { 1574 invalidate_complete(mg, false); 1575 return; 1576 } 1577 1578 init_continuation(&mg->k, invalidate_completed); 1579 continue_after_commit(&cache->committer, &mg->k); 1580 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1581 mg->overwrite_bio = NULL; 1582 schedule_commit(&cache->committer); 1583 } 1584 1585 static int invalidate_lock(struct dm_cache_migration *mg) 1586 { 1587 int r; 1588 struct dm_cell_key_v2 key; 1589 struct cache *cache = mg->cache; 1590 struct dm_bio_prison_cell_v2 *prealloc; 1591 1592 prealloc = alloc_prison_cell(cache); 1593 1594 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1595 r = dm_cell_lock_v2(cache->prison, &key, 1596 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1597 if (r < 0) { 1598 free_prison_cell(cache, prealloc); 1599 invalidate_complete(mg, false); 1600 return r; 1601 } 1602 1603 if (mg->cell != prealloc) 1604 free_prison_cell(cache, prealloc); 1605 1606 if (r) 1607 quiesce(mg, invalidate_remove); 1608 1609 else { 1610 /* 1611 * We can't call invalidate_remove() directly here because we 1612 * might still be in request context. 1613 */ 1614 init_continuation(&mg->k, invalidate_remove); 1615 queue_work(cache->wq, &mg->k.ws); 1616 } 1617 1618 return 0; 1619 } 1620 1621 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1622 dm_oblock_t oblock, struct bio *bio) 1623 { 1624 struct dm_cache_migration *mg; 1625 1626 if (!background_work_begin(cache)) 1627 return -EPERM; 1628 1629 mg = alloc_migration(cache); 1630 1631 mg->overwrite_bio = bio; 1632 mg->invalidate_cblock = cblock; 1633 mg->invalidate_oblock = oblock; 1634 1635 return invalidate_lock(mg); 1636 } 1637 1638 /*---------------------------------------------------------------- 1639 * bio processing 1640 *--------------------------------------------------------------*/ 1641 1642 enum busy { 1643 IDLE, 1644 BUSY 1645 }; 1646 1647 static enum busy spare_migration_bandwidth(struct cache *cache) 1648 { 1649 bool idle = iot_idle_for(&cache->tracker, HZ); 1650 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1651 cache->sectors_per_block; 1652 1653 if (idle && current_volume <= cache->migration_threshold) 1654 return IDLE; 1655 else 1656 return BUSY; 1657 } 1658 1659 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1660 { 1661 atomic_inc(bio_data_dir(bio) == READ ? 1662 &cache->stats.read_hit : &cache->stats.write_hit); 1663 } 1664 1665 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1666 { 1667 atomic_inc(bio_data_dir(bio) == READ ? 1668 &cache->stats.read_miss : &cache->stats.write_miss); 1669 } 1670 1671 /*----------------------------------------------------------------*/ 1672 1673 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1674 bool *commit_needed) 1675 { 1676 int r, data_dir; 1677 bool rb, background_queued; 1678 dm_cblock_t cblock; 1679 1680 *commit_needed = false; 1681 1682 rb = bio_detain_shared(cache, block, bio); 1683 if (!rb) { 1684 /* 1685 * An exclusive lock is held for this block, so we have to 1686 * wait. 
We set the commit_needed flag so the current 1687 * transaction will be committed asap, allowing this lock 1688 * to be dropped. 1689 */ 1690 *commit_needed = true; 1691 return DM_MAPIO_SUBMITTED; 1692 } 1693 1694 data_dir = bio_data_dir(bio); 1695 1696 if (optimisable_bio(cache, bio, block)) { 1697 struct policy_work *op = NULL; 1698 1699 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1700 if (unlikely(r && r != -ENOENT)) { 1701 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1702 cache_device_name(cache), r); 1703 bio_io_error(bio); 1704 return DM_MAPIO_SUBMITTED; 1705 } 1706 1707 if (r == -ENOENT && op) { 1708 bio_drop_shared_lock(cache, bio); 1709 BUG_ON(op->op != POLICY_PROMOTE); 1710 mg_start(cache, op, bio); 1711 return DM_MAPIO_SUBMITTED; 1712 } 1713 } else { 1714 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1715 if (unlikely(r && r != -ENOENT)) { 1716 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1717 cache_device_name(cache), r); 1718 bio_io_error(bio); 1719 return DM_MAPIO_SUBMITTED; 1720 } 1721 1722 if (background_queued) 1723 wake_migration_worker(cache); 1724 } 1725 1726 if (r == -ENOENT) { 1727 struct per_bio_data *pb = get_per_bio_data(bio); 1728 1729 /* 1730 * Miss. 1731 */ 1732 inc_miss_counter(cache, bio); 1733 if (pb->req_nr == 0) { 1734 accounted_begin(cache, bio); 1735 remap_to_origin_clear_discard(cache, bio, block); 1736 } else { 1737 /* 1738 * This is a duplicate writethrough io that is no 1739 * longer needed because the block has been demoted. 1740 */ 1741 bio_endio(bio); 1742 return DM_MAPIO_SUBMITTED; 1743 } 1744 } else { 1745 /* 1746 * Hit. 1747 */ 1748 inc_hit_counter(cache, bio); 1749 1750 /* 1751 * Passthrough always maps to the origin, invalidating any 1752 * cache blocks that are written to. 1753 */ 1754 if (passthrough_mode(cache)) { 1755 if (bio_data_dir(bio) == WRITE) { 1756 bio_drop_shared_lock(cache, bio); 1757 atomic_inc(&cache->stats.demotion); 1758 invalidate_start(cache, cblock, block, bio); 1759 } else 1760 remap_to_origin_clear_discard(cache, bio, block); 1761 } else { 1762 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1763 !is_dirty(cache, cblock)) { 1764 remap_to_origin_and_cache(cache, bio, block, cblock); 1765 accounted_begin(cache, bio); 1766 } else 1767 remap_to_cache_dirty(cache, bio, block, cblock); 1768 } 1769 } 1770 1771 /* 1772 * dm core turns FUA requests into a separate payload and FLUSH req. 1773 */ 1774 if (bio->bi_opf & REQ_FUA) { 1775 /* 1776 * issue_after_commit will call accounted_begin a second time. So 1777 * we call accounted_complete() to avoid double accounting. 1778 */ 1779 accounted_complete(cache, bio); 1780 issue_after_commit(&cache->committer, bio); 1781 *commit_needed = true; 1782 return DM_MAPIO_SUBMITTED; 1783 } 1784 1785 return DM_MAPIO_REMAPPED; 1786 } 1787 1788 static bool process_bio(struct cache *cache, struct bio *bio) 1789 { 1790 bool commit_needed; 1791 1792 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1793 submit_bio_noacct(bio); 1794 1795 return commit_needed; 1796 } 1797 1798 /* 1799 * A non-zero return indicates read_only or fail_io mode. 
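 * (the commit is skipped when the cache is already in CM_READ_ONLY or
 * CM_FAIL mode)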
 */
static int commit(struct cache *cache, bool clean_shutdown)
{
	int r;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return -EINVAL;

	atomic_inc(&cache->stats.commit_count);
	r = dm_cache_commit(cache->cmd, clean_shutdown);
	if (r)
		metadata_operation_failed(cache, "dm_cache_commit", r);

	return r;
}

/*
 * Used by the batcher.
 */
static blk_status_t commit_op(void *context)
{
	struct cache *cache = context;

	if (dm_cache_changed_this_transaction(cache->cmd))
		return errno_to_blk_status(commit(cache, false));

	return 0;
}

/*----------------------------------------------------------------*/

static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue_after_commit(&cache->committer, bio);
	return true;
}

static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_dblock_t b, e;

	// FIXME: do we need to lock the region? Or can we just assume the
	// user won't be so foolish as to issue discard concurrently with
	// other IO?
	calc_discard_block_range(cache, bio, &b, &e);
	while (b != e) {
		set_discard(cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	if (cache->features.discard_passdown) {
		remap_to_origin(cache, bio);
		submit_bio_noacct(bio);
	} else
		bio_endio(bio);

	return false;
}

static void process_deferred_bios(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);

	bool commit_needed = false;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irq(&cache->lock);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irq(&cache->lock);

	while ((bio = bio_list_pop(&bios))) {
		if (bio->bi_opf & REQ_PREFLUSH)
			commit_needed = process_flush_bio(cache, bio) || commit_needed;

		else if (bio_op(bio) == REQ_OP_DISCARD)
			commit_needed = process_discard_bio(cache, bio) || commit_needed;

		else
			commit_needed = process_bio(cache, bio) || commit_needed;
	}

	if (commit_needed)
		schedule_commit(&cache->committer);
}

/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/

static void requeue_deferred_bios(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);

	while ((bio = bio_list_pop(&bios))) {
		bio->bi_status = BLK_STS_DM_REQUEUE;
		bio_endio(bio);
	}
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
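 * do_waker() rearms itself every COMMIT_PERIOD (HZ, ie roughly once
 * a second).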
1918 */ 1919 static void do_waker(struct work_struct *ws) 1920 { 1921 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1922 1923 policy_tick(cache->policy, true); 1924 wake_migration_worker(cache); 1925 schedule_commit(&cache->committer); 1926 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1927 } 1928 1929 static void check_migrations(struct work_struct *ws) 1930 { 1931 int r; 1932 struct policy_work *op; 1933 struct cache *cache = container_of(ws, struct cache, migration_worker); 1934 enum busy b; 1935 1936 for (;;) { 1937 b = spare_migration_bandwidth(cache); 1938 1939 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1940 if (r == -ENODATA) 1941 break; 1942 1943 if (r) { 1944 DMERR_LIMIT("%s: policy_background_work failed", 1945 cache_device_name(cache)); 1946 break; 1947 } 1948 1949 r = mg_start(cache, op, NULL); 1950 if (r) 1951 break; 1952 } 1953 } 1954 1955 /*---------------------------------------------------------------- 1956 * Target methods 1957 *--------------------------------------------------------------*/ 1958 1959 /* 1960 * This function gets called on the error paths of the constructor, so we 1961 * have to cope with a partially initialised struct. 1962 */ 1963 static void destroy(struct cache *cache) 1964 { 1965 unsigned i; 1966 1967 mempool_exit(&cache->migration_pool); 1968 1969 if (cache->prison) 1970 dm_bio_prison_destroy_v2(cache->prison); 1971 1972 if (cache->wq) 1973 destroy_workqueue(cache->wq); 1974 1975 if (cache->dirty_bitset) 1976 free_bitset(cache->dirty_bitset); 1977 1978 if (cache->discard_bitset) 1979 free_bitset(cache->discard_bitset); 1980 1981 if (cache->copier) 1982 dm_kcopyd_client_destroy(cache->copier); 1983 1984 if (cache->cmd) 1985 dm_cache_metadata_close(cache->cmd); 1986 1987 if (cache->metadata_dev) 1988 dm_put_device(cache->ti, cache->metadata_dev); 1989 1990 if (cache->origin_dev) 1991 dm_put_device(cache->ti, cache->origin_dev); 1992 1993 if (cache->cache_dev) 1994 dm_put_device(cache->ti, cache->cache_dev); 1995 1996 if (cache->policy) 1997 dm_cache_policy_destroy(cache->policy); 1998 1999 for (i = 0; i < cache->nr_ctr_args ; i++) 2000 kfree(cache->ctr_args[i]); 2001 kfree(cache->ctr_args); 2002 2003 bioset_exit(&cache->bs); 2004 2005 kfree(cache); 2006 } 2007 2008 static void cache_dtr(struct dm_target *ti) 2009 { 2010 struct cache *cache = ti->private; 2011 2012 destroy(cache); 2013 } 2014 2015 static sector_t get_dev_size(struct dm_dev *dev) 2016 { 2017 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2018 } 2019 2020 /*----------------------------------------------------------------*/ 2021 2022 /* 2023 * Construct a cache device mapping. 2024 * 2025 * cache <metadata dev> <cache dev> <origin dev> <block size> 2026 * <#feature args> [<feature arg>]* 2027 * <policy> <#policy args> [<policy arg>]* 2028 * 2029 * metadata dev : fast device holding the persistent metadata 2030 * cache dev : fast device holding cached data blocks 2031 * origin dev : slow device holding original data blocks 2032 * block size : cache unit size in sectors 2033 * 2034 * #feature args : number of feature arguments passed 2035 * feature args : writethrough. (The default is writeback.) 2036 * 2037 * policy : the replacement policy to use 2038 * #policy args : an even number of policy arguments corresponding 2039 * to key/value pairs passed to the policy 2040 * policy args : key/value pairs passed to the policy 2041 * E.g. 'sequential_threshold 1024' 2042 * See cache-policies.txt for details. 
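 *
 * An illustrative table line (device names and policy here are examples
 * only):
 *   cache /dev/fast-meta /dev/fast /dev/slow 512 1 writethrough default 0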
2043 * 2044 * Optional feature arguments are: 2045 * writethrough : write through caching that prohibits cache block 2046 * content from being different from origin block content. 2047 * Without this argument, the default behaviour is to write 2048 * back cache block contents later for performance reasons, 2049 * so they may differ from the corresponding origin blocks. 2050 */ 2051 struct cache_args { 2052 struct dm_target *ti; 2053 2054 struct dm_dev *metadata_dev; 2055 2056 struct dm_dev *cache_dev; 2057 sector_t cache_sectors; 2058 2059 struct dm_dev *origin_dev; 2060 sector_t origin_sectors; 2061 2062 uint32_t block_size; 2063 2064 const char *policy_name; 2065 int policy_argc; 2066 const char **policy_argv; 2067 2068 struct cache_features features; 2069 }; 2070 2071 static void destroy_cache_args(struct cache_args *ca) 2072 { 2073 if (ca->metadata_dev) 2074 dm_put_device(ca->ti, ca->metadata_dev); 2075 2076 if (ca->cache_dev) 2077 dm_put_device(ca->ti, ca->cache_dev); 2078 2079 if (ca->origin_dev) 2080 dm_put_device(ca->ti, ca->origin_dev); 2081 2082 kfree(ca); 2083 } 2084 2085 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2086 { 2087 if (!as->argc) { 2088 *error = "Insufficient args"; 2089 return false; 2090 } 2091 2092 return true; 2093 } 2094 2095 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2096 char **error) 2097 { 2098 int r; 2099 sector_t metadata_dev_size; 2100 char b[BDEVNAME_SIZE]; 2101 2102 if (!at_least_one_arg(as, error)) 2103 return -EINVAL; 2104 2105 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2106 &ca->metadata_dev); 2107 if (r) { 2108 *error = "Error opening metadata device"; 2109 return r; 2110 } 2111 2112 metadata_dev_size = get_dev_size(ca->metadata_dev); 2113 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2114 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2115 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2116 2117 return 0; 2118 } 2119 2120 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2121 char **error) 2122 { 2123 int r; 2124 2125 if (!at_least_one_arg(as, error)) 2126 return -EINVAL; 2127 2128 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2129 &ca->cache_dev); 2130 if (r) { 2131 *error = "Error opening cache device"; 2132 return r; 2133 } 2134 ca->cache_sectors = get_dev_size(ca->cache_dev); 2135 2136 return 0; 2137 } 2138 2139 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2140 char **error) 2141 { 2142 int r; 2143 2144 if (!at_least_one_arg(as, error)) 2145 return -EINVAL; 2146 2147 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2148 &ca->origin_dev); 2149 if (r) { 2150 *error = "Error opening origin device"; 2151 return r; 2152 } 2153 2154 ca->origin_sectors = get_dev_size(ca->origin_dev); 2155 if (ca->ti->len > ca->origin_sectors) { 2156 *error = "Device size larger than cached device"; 2157 return -EINVAL; 2158 } 2159 2160 return 0; 2161 } 2162 2163 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2164 char **error) 2165 { 2166 unsigned long block_size; 2167 2168 if (!at_least_one_arg(as, error)) 2169 return -EINVAL; 2170 2171 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2172 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2173 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2174 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2175 *error = "Invalid data 
block size"; 2176 return -EINVAL; 2177 } 2178 2179 if (block_size > ca->cache_sectors) { 2180 *error = "Data block size is larger than the cache device"; 2181 return -EINVAL; 2182 } 2183 2184 ca->block_size = block_size; 2185 2186 return 0; 2187 } 2188 2189 static void init_features(struct cache_features *cf) 2190 { 2191 cf->mode = CM_WRITE; 2192 cf->io_mode = CM_IO_WRITEBACK; 2193 cf->metadata_version = 1; 2194 cf->discard_passdown = true; 2195 } 2196 2197 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2198 char **error) 2199 { 2200 static const struct dm_arg _args[] = { 2201 {0, 3, "Invalid number of cache feature arguments"}, 2202 }; 2203 2204 int r, mode_ctr = 0; 2205 unsigned argc; 2206 const char *arg; 2207 struct cache_features *cf = &ca->features; 2208 2209 init_features(cf); 2210 2211 r = dm_read_arg_group(_args, as, &argc, error); 2212 if (r) 2213 return -EINVAL; 2214 2215 while (argc--) { 2216 arg = dm_shift_arg(as); 2217 2218 if (!strcasecmp(arg, "writeback")) { 2219 cf->io_mode = CM_IO_WRITEBACK; 2220 mode_ctr++; 2221 } 2222 2223 else if (!strcasecmp(arg, "writethrough")) { 2224 cf->io_mode = CM_IO_WRITETHROUGH; 2225 mode_ctr++; 2226 } 2227 2228 else if (!strcasecmp(arg, "passthrough")) { 2229 cf->io_mode = CM_IO_PASSTHROUGH; 2230 mode_ctr++; 2231 } 2232 2233 else if (!strcasecmp(arg, "metadata2")) 2234 cf->metadata_version = 2; 2235 2236 else if (!strcasecmp(arg, "no_discard_passdown")) 2237 cf->discard_passdown = false; 2238 2239 else { 2240 *error = "Unrecognised cache feature requested"; 2241 return -EINVAL; 2242 } 2243 } 2244 2245 if (mode_ctr > 1) { 2246 *error = "Duplicate cache io_mode features requested"; 2247 return -EINVAL; 2248 } 2249 2250 return 0; 2251 } 2252 2253 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2254 char **error) 2255 { 2256 static const struct dm_arg _args[] = { 2257 {0, 1024, "Invalid number of policy arguments"}, 2258 }; 2259 2260 int r; 2261 2262 if (!at_least_one_arg(as, error)) 2263 return -EINVAL; 2264 2265 ca->policy_name = dm_shift_arg(as); 2266 2267 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2268 if (r) 2269 return -EINVAL; 2270 2271 ca->policy_argv = (const char **)as->argv; 2272 dm_consume_args(as, ca->policy_argc); 2273 2274 return 0; 2275 } 2276 2277 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2278 char **error) 2279 { 2280 int r; 2281 struct dm_arg_set as; 2282 2283 as.argc = argc; 2284 as.argv = argv; 2285 2286 r = parse_metadata_dev(ca, &as, error); 2287 if (r) 2288 return r; 2289 2290 r = parse_cache_dev(ca, &as, error); 2291 if (r) 2292 return r; 2293 2294 r = parse_origin_dev(ca, &as, error); 2295 if (r) 2296 return r; 2297 2298 r = parse_block_size(ca, &as, error); 2299 if (r) 2300 return r; 2301 2302 r = parse_features(ca, &as, error); 2303 if (r) 2304 return r; 2305 2306 r = parse_policy(ca, &as, error); 2307 if (r) 2308 return r; 2309 2310 return 0; 2311 } 2312 2313 /*----------------------------------------------------------------*/ 2314 2315 static struct kmem_cache *migration_cache; 2316 2317 #define NOT_CORE_OPTION 1 2318 2319 static int process_config_option(struct cache *cache, const char *key, const char *value) 2320 { 2321 unsigned long tmp; 2322 2323 if (!strcasecmp(key, "migration_threshold")) { 2324 if (kstrtoul(value, 10, &tmp)) 2325 return -EINVAL; 2326 2327 cache->migration_threshold = tmp; 2328 return 0; 2329 } 2330 2331 return NOT_CORE_OPTION; 2332 } 2333 2334 static int set_config_value(struct cache *cache, const 
char *key, const char *value) 2335 { 2336 int r = process_config_option(cache, key, value); 2337 2338 if (r == NOT_CORE_OPTION) 2339 r = policy_set_config_value(cache->policy, key, value); 2340 2341 if (r) 2342 DMWARN("bad config value for %s: %s", key, value); 2343 2344 return r; 2345 } 2346 2347 static int set_config_values(struct cache *cache, int argc, const char **argv) 2348 { 2349 int r = 0; 2350 2351 if (argc & 1) { 2352 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2353 return -EINVAL; 2354 } 2355 2356 while (argc) { 2357 r = set_config_value(cache, argv[0], argv[1]); 2358 if (r) 2359 break; 2360 2361 argc -= 2; 2362 argv += 2; 2363 } 2364 2365 return r; 2366 } 2367 2368 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2369 char **error) 2370 { 2371 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2372 cache->cache_size, 2373 cache->origin_sectors, 2374 cache->sectors_per_block); 2375 if (IS_ERR(p)) { 2376 *error = "Error creating cache's policy"; 2377 return PTR_ERR(p); 2378 } 2379 cache->policy = p; 2380 BUG_ON(!cache->policy); 2381 2382 return 0; 2383 } 2384 2385 /* 2386 * We want the discard block size to be at least the size of the cache 2387 * block size and have no more than 2^14 discard blocks across the origin. 2388 */ 2389 #define MAX_DISCARD_BLOCKS (1 << 14) 2390 2391 static bool too_many_discard_blocks(sector_t discard_block_size, 2392 sector_t origin_size) 2393 { 2394 (void) sector_div(origin_size, discard_block_size); 2395 2396 return origin_size > MAX_DISCARD_BLOCKS; 2397 } 2398 2399 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2400 sector_t origin_size) 2401 { 2402 sector_t discard_block_size = cache_block_size; 2403 2404 if (origin_size) 2405 while (too_many_discard_blocks(discard_block_size, origin_size)) 2406 discard_block_size *= 2; 2407 2408 return discard_block_size; 2409 } 2410 2411 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2412 { 2413 dm_block_t nr_blocks = from_cblock(size); 2414 2415 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2416 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2417 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2418 "Please consider increasing the cache block size to reduce the overall cache block count.", 2419 (unsigned long long) nr_blocks); 2420 2421 cache->cache_size = size; 2422 } 2423 2424 #define DEFAULT_MIGRATION_THRESHOLD 2048 2425 2426 static int cache_create(struct cache_args *ca, struct cache **result) 2427 { 2428 int r = 0; 2429 char **error = &ca->ti->error; 2430 struct cache *cache; 2431 struct dm_target *ti = ca->ti; 2432 dm_block_t origin_blocks; 2433 struct dm_cache_metadata *cmd; 2434 bool may_format = ca->features.mode == CM_WRITE; 2435 2436 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2437 if (!cache) 2438 return -ENOMEM; 2439 2440 cache->ti = ca->ti; 2441 ti->private = cache; 2442 ti->num_flush_bios = 2; 2443 ti->flush_supported = true; 2444 2445 ti->num_discard_bios = 1; 2446 ti->discards_supported = true; 2447 2448 ti->per_io_data_size = sizeof(struct per_bio_data); 2449 2450 cache->features = ca->features; 2451 if (writethrough_mode(cache)) { 2452 /* Create bioset for writethrough bios issued to origin */ 2453 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2454 if (r) 2455 goto bad; 2456 } 2457 2458 cache->metadata_dev = ca->metadata_dev; 2459 cache->origin_dev = 
ca->origin_dev; 2460 cache->cache_dev = ca->cache_dev; 2461 2462 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2463 2464 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2465 origin_blocks = block_div(origin_blocks, ca->block_size); 2466 cache->origin_blocks = to_oblock(origin_blocks); 2467 2468 cache->sectors_per_block = ca->block_size; 2469 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2470 r = -EINVAL; 2471 goto bad; 2472 } 2473 2474 if (ca->block_size & (ca->block_size - 1)) { 2475 dm_block_t cache_size = ca->cache_sectors; 2476 2477 cache->sectors_per_block_shift = -1; 2478 cache_size = block_div(cache_size, ca->block_size); 2479 set_cache_size(cache, to_cblock(cache_size)); 2480 } else { 2481 cache->sectors_per_block_shift = __ffs(ca->block_size); 2482 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2483 } 2484 2485 r = create_cache_policy(cache, ca, error); 2486 if (r) 2487 goto bad; 2488 2489 cache->policy_nr_args = ca->policy_argc; 2490 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2491 2492 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2493 if (r) { 2494 *error = "Error setting cache policy's config values"; 2495 goto bad; 2496 } 2497 2498 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2499 ca->block_size, may_format, 2500 dm_cache_policy_get_hint_size(cache->policy), 2501 ca->features.metadata_version); 2502 if (IS_ERR(cmd)) { 2503 *error = "Error creating metadata object"; 2504 r = PTR_ERR(cmd); 2505 goto bad; 2506 } 2507 cache->cmd = cmd; 2508 set_cache_mode(cache, CM_WRITE); 2509 if (get_cache_mode(cache) != CM_WRITE) { 2510 *error = "Unable to get write access to metadata, please check/repair metadata."; 2511 r = -EINVAL; 2512 goto bad; 2513 } 2514 2515 if (passthrough_mode(cache)) { 2516 bool all_clean; 2517 2518 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2519 if (r) { 2520 *error = "dm_cache_metadata_all_clean() failed"; 2521 goto bad; 2522 } 2523 2524 if (!all_clean) { 2525 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2526 r = -EINVAL; 2527 goto bad; 2528 } 2529 2530 policy_allow_migrations(cache->policy, false); 2531 } 2532 2533 spin_lock_init(&cache->lock); 2534 bio_list_init(&cache->deferred_bios); 2535 atomic_set(&cache->nr_allocated_migrations, 0); 2536 atomic_set(&cache->nr_io_migrations, 0); 2537 init_waitqueue_head(&cache->migration_wait); 2538 2539 r = -ENOMEM; 2540 atomic_set(&cache->nr_dirty, 0); 2541 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2542 if (!cache->dirty_bitset) { 2543 *error = "could not allocate dirty bitset"; 2544 goto bad; 2545 } 2546 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2547 2548 cache->discard_block_size = 2549 calculate_discard_block_size(cache->sectors_per_block, 2550 cache->origin_sectors); 2551 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2552 cache->discard_block_size)); 2553 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2554 if (!cache->discard_bitset) { 2555 *error = "could not allocate discard bitset"; 2556 goto bad; 2557 } 2558 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2559 2560 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2561 if (IS_ERR(cache->copier)) { 2562 *error = "could not create kcopyd client"; 2563 r = PTR_ERR(cache->copier); 2564 goto bad; 2565 } 2566 2567 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, 
WQ_MEM_RECLAIM, 0); 2568 if (!cache->wq) { 2569 *error = "could not create workqueue for metadata object"; 2570 goto bad; 2571 } 2572 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2573 INIT_WORK(&cache->migration_worker, check_migrations); 2574 INIT_DELAYED_WORK(&cache->waker, do_waker); 2575 2576 cache->prison = dm_bio_prison_create_v2(cache->wq); 2577 if (!cache->prison) { 2578 *error = "could not create bio prison"; 2579 goto bad; 2580 } 2581 2582 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2583 migration_cache); 2584 if (r) { 2585 *error = "Error creating cache's migration mempool"; 2586 goto bad; 2587 } 2588 2589 cache->need_tick_bio = true; 2590 cache->sized = false; 2591 cache->invalidate = false; 2592 cache->commit_requested = false; 2593 cache->loaded_mappings = false; 2594 cache->loaded_discards = false; 2595 2596 load_stats(cache); 2597 2598 atomic_set(&cache->stats.demotion, 0); 2599 atomic_set(&cache->stats.promotion, 0); 2600 atomic_set(&cache->stats.copies_avoided, 0); 2601 atomic_set(&cache->stats.cache_cell_clash, 0); 2602 atomic_set(&cache->stats.commit_count, 0); 2603 atomic_set(&cache->stats.discard_count, 0); 2604 2605 spin_lock_init(&cache->invalidation_lock); 2606 INIT_LIST_HEAD(&cache->invalidation_requests); 2607 2608 batcher_init(&cache->committer, commit_op, cache, 2609 issue_op, cache, cache->wq); 2610 iot_init(&cache->tracker); 2611 2612 init_rwsem(&cache->background_work_lock); 2613 prevent_background_work(cache); 2614 2615 *result = cache; 2616 return 0; 2617 bad: 2618 destroy(cache); 2619 return r; 2620 } 2621 2622 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2623 { 2624 unsigned i; 2625 const char **copy; 2626 2627 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2628 if (!copy) 2629 return -ENOMEM; 2630 for (i = 0; i < argc; i++) { 2631 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2632 if (!copy[i]) { 2633 while (i--) 2634 kfree(copy[i]); 2635 kfree(copy); 2636 return -ENOMEM; 2637 } 2638 } 2639 2640 cache->nr_ctr_args = argc; 2641 cache->ctr_args = copy; 2642 2643 return 0; 2644 } 2645 2646 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2647 { 2648 int r = -EINVAL; 2649 struct cache_args *ca; 2650 struct cache *cache = NULL; 2651 2652 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2653 if (!ca) { 2654 ti->error = "Error allocating memory for cache"; 2655 return -ENOMEM; 2656 } 2657 ca->ti = ti; 2658 2659 r = parse_cache_args(ca, argc, argv, &ti->error); 2660 if (r) 2661 goto out; 2662 2663 r = cache_create(ca, &cache); 2664 if (r) 2665 goto out; 2666 2667 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2668 if (r) { 2669 destroy(cache); 2670 goto out; 2671 } 2672 2673 ti->private = cache; 2674 out: 2675 destroy_cache_args(ca); 2676 return r; 2677 } 2678 2679 /*----------------------------------------------------------------*/ 2680 2681 static int cache_map(struct dm_target *ti, struct bio *bio) 2682 { 2683 struct cache *cache = ti->private; 2684 2685 int r; 2686 bool commit_needed; 2687 dm_oblock_t block = get_bio_block(cache, bio); 2688 2689 init_per_bio_data(bio); 2690 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2691 /* 2692 * This can only occur if the io goes to a partial block at 2693 * the end of the origin device. We don't cache these. 2694 * Just remap to the origin and carry on. 
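		 * For example, with 128-sector cache blocks and a
		 * 1000-sector origin, origin_blocks is 7 (block_div rounds
		 * down), so a bio to sector 900 maps to block 7, takes this
		 * branch and is passed straight through to the origin.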
2695 */ 2696 remap_to_origin(cache, bio); 2697 accounted_begin(cache, bio); 2698 return DM_MAPIO_REMAPPED; 2699 } 2700 2701 if (discard_or_flush(bio)) { 2702 defer_bio(cache, bio); 2703 return DM_MAPIO_SUBMITTED; 2704 } 2705 2706 r = map_bio(cache, bio, block, &commit_needed); 2707 if (commit_needed) 2708 schedule_commit(&cache->committer); 2709 2710 return r; 2711 } 2712 2713 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2714 { 2715 struct cache *cache = ti->private; 2716 unsigned long flags; 2717 struct per_bio_data *pb = get_per_bio_data(bio); 2718 2719 if (pb->tick) { 2720 policy_tick(cache->policy, false); 2721 2722 spin_lock_irqsave(&cache->lock, flags); 2723 cache->need_tick_bio = true; 2724 spin_unlock_irqrestore(&cache->lock, flags); 2725 } 2726 2727 bio_drop_shared_lock(cache, bio); 2728 accounted_complete(cache, bio); 2729 2730 return DM_ENDIO_DONE; 2731 } 2732 2733 static int write_dirty_bitset(struct cache *cache) 2734 { 2735 int r; 2736 2737 if (get_cache_mode(cache) >= CM_READ_ONLY) 2738 return -EINVAL; 2739 2740 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2741 if (r) 2742 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2743 2744 return r; 2745 } 2746 2747 static int write_discard_bitset(struct cache *cache) 2748 { 2749 unsigned i, r; 2750 2751 if (get_cache_mode(cache) >= CM_READ_ONLY) 2752 return -EINVAL; 2753 2754 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2755 cache->discard_nr_blocks); 2756 if (r) { 2757 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2758 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2759 return r; 2760 } 2761 2762 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2763 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2764 is_discarded(cache, to_dblock(i))); 2765 if (r) { 2766 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2767 return r; 2768 } 2769 } 2770 2771 return 0; 2772 } 2773 2774 static int write_hints(struct cache *cache) 2775 { 2776 int r; 2777 2778 if (get_cache_mode(cache) >= CM_READ_ONLY) 2779 return -EINVAL; 2780 2781 r = dm_cache_write_hints(cache->cmd, cache->policy); 2782 if (r) { 2783 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2784 return r; 2785 } 2786 2787 return 0; 2788 } 2789 2790 /* 2791 * returns true on success 2792 */ 2793 static bool sync_metadata(struct cache *cache) 2794 { 2795 int r1, r2, r3, r4; 2796 2797 r1 = write_dirty_bitset(cache); 2798 if (r1) 2799 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2800 2801 r2 = write_discard_bitset(cache); 2802 if (r2) 2803 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2804 2805 save_stats(cache); 2806 2807 r3 = write_hints(cache); 2808 if (r3) 2809 DMERR("%s: could not write hints", cache_device_name(cache)); 2810 2811 /* 2812 * If writing the above metadata failed, we still commit, but don't 2813 * set the clean shutdown flag. This will effectively force every 2814 * dirty bit to be set on reload. 
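	 * That is the safe direction to fail in: a block is never wrongly
	 * treated as clean, at the cost of some unnecessary writeback after
	 * the next activation.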
2815 */ 2816 r4 = commit(cache, !r1 && !r2 && !r3); 2817 if (r4) 2818 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2819 2820 return !r1 && !r2 && !r3 && !r4; 2821 } 2822 2823 static void cache_postsuspend(struct dm_target *ti) 2824 { 2825 struct cache *cache = ti->private; 2826 2827 prevent_background_work(cache); 2828 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2829 2830 cancel_delayed_work_sync(&cache->waker); 2831 drain_workqueue(cache->wq); 2832 WARN_ON(cache->tracker.in_flight); 2833 2834 /* 2835 * If it's a flush suspend there won't be any deferred bios, so this 2836 * call is harmless. 2837 */ 2838 requeue_deferred_bios(cache); 2839 2840 if (get_cache_mode(cache) == CM_WRITE) 2841 (void) sync_metadata(cache); 2842 } 2843 2844 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2845 bool dirty, uint32_t hint, bool hint_valid) 2846 { 2847 int r; 2848 struct cache *cache = context; 2849 2850 if (dirty) { 2851 set_bit(from_cblock(cblock), cache->dirty_bitset); 2852 atomic_inc(&cache->nr_dirty); 2853 } else 2854 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2855 2856 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2857 if (r) 2858 return r; 2859 2860 return 0; 2861 } 2862 2863 /* 2864 * The discard block size in the on disk metadata is not 2865 * neccessarily the same as we're currently using. So we have to 2866 * be careful to only set the discarded attribute if we know it 2867 * covers a complete block of the new size. 2868 */ 2869 struct discard_load_info { 2870 struct cache *cache; 2871 2872 /* 2873 * These blocks are sized using the on disk dblock size, rather 2874 * than the current one. 2875 */ 2876 dm_block_t block_size; 2877 dm_block_t discard_begin, discard_end; 2878 }; 2879 2880 static void discard_load_info_init(struct cache *cache, 2881 struct discard_load_info *li) 2882 { 2883 li->cache = cache; 2884 li->discard_begin = li->discard_end = 0; 2885 } 2886 2887 static void set_discard_range(struct discard_load_info *li) 2888 { 2889 sector_t b, e; 2890 2891 if (li->discard_begin == li->discard_end) 2892 return; 2893 2894 /* 2895 * Convert to sectors. 2896 */ 2897 b = li->discard_begin * li->block_size; 2898 e = li->discard_end * li->block_size; 2899 2900 /* 2901 * Then convert back to the current dblock size. 2902 */ 2903 b = dm_sector_div_up(b, li->cache->discard_block_size); 2904 sector_div(e, li->cache->discard_block_size); 2905 2906 /* 2907 * The origin may have shrunk, so we need to check we're still in 2908 * bounds. 2909 */ 2910 if (e > from_dblock(li->cache->discard_nr_blocks)) 2911 e = from_dblock(li->cache->discard_nr_blocks); 2912 2913 for (; b < e; b++) 2914 set_discard(li->cache, to_dblock(b)); 2915 } 2916 2917 static int load_discard(void *context, sector_t discard_block_size, 2918 dm_dblock_t dblock, bool discard) 2919 { 2920 struct discard_load_info *li = context; 2921 2922 li->block_size = discard_block_size; 2923 2924 if (discard) { 2925 if (from_dblock(dblock) == li->discard_end) 2926 /* 2927 * We're already in a discard range, just extend it. 2928 */ 2929 li->discard_end = li->discard_end + 1ULL; 2930 2931 else { 2932 /* 2933 * Emit the old range and start a new one. 
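			 * e.g. discarded dblocks 4, 5 and 6 grow a single
			 * range; [4, 7) is emitted as soon as the run is
			 * broken, and any still-open range is flushed by the
			 * final set_discard_range() call in cache_preresume().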
			 */
			set_discard_range(li);
			li->discard_begin = from_dblock(dblock);
			li->discard_end = li->discard_begin + 1ULL;
		}
	} else {
		set_discard_range(li);
		li->discard_begin = li->discard_end = 0;
	}

	return 0;
}

static dm_cblock_t get_cache_dev_size(struct cache *cache)
{
	sector_t size = get_dev_size(cache->cache_dev);
	(void) sector_div(size, cache->sectors_per_block);
	return to_cblock(size);
}

static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
	if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
		if (cache->sized) {
			DMERR("%s: unable to extend cache due to missing cache table reload",
			      cache_device_name(cache));
			return false;
		}
	}

	/*
	 * We can't drop a dirty block when shrinking the cache.
	 */
	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
		if (is_dirty(cache, new_size)) {
			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
			      cache_device_name(cache),
			      (unsigned long long) from_cblock(new_size));
			return false;
		}
		new_size = to_cblock(from_cblock(new_size) + 1);
	}

	return true;
}

static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
{
	int r;

	r = dm_cache_resize(cache->cmd, new_size);
	if (r) {
		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
		metadata_operation_failed(cache, "dm_cache_resize", r);
		return r;
	}

	set_cache_size(cache, new_size);

	return 0;
}

static int cache_preresume(struct dm_target *ti)
{
	int r = 0;
	struct cache *cache = ti->private;
	dm_cblock_t csize = get_cache_dev_size(cache);

	/*
	 * Check to see if the cache has resized.
	 */
	if (!cache->sized) {
		r = resize_cache_dev(cache, csize);
		if (r)
			return r;

		cache->sized = true;

	} else if (csize != cache->cache_size) {
		if (!can_resize(cache, csize))
			return -EINVAL;

		r = resize_cache_dev(cache, csize);
		if (r)
			return r;
	}

	if (!cache->loaded_mappings) {
		r = dm_cache_load_mappings(cache->cmd, cache->policy,
					   load_mapping, cache);
		if (r) {
			DMERR("%s: could not load cache mappings", cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
			return r;
		}

		cache->loaded_mappings = true;
	}

	if (!cache->loaded_discards) {
		struct discard_load_info li;

		/*
		 * The discard bitset could have been resized, or the
		 * discard block size changed. To be safe we start by
		 * setting every dblock to not discarded.
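		 * Discard state is only a hint, so clearing it cannot lose
		 * data; at worst some blocks are treated as holding valid
		 * data when they do not.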
3040 */ 3041 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3042 3043 discard_load_info_init(cache, &li); 3044 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3045 if (r) { 3046 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3047 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3048 return r; 3049 } 3050 set_discard_range(&li); 3051 3052 cache->loaded_discards = true; 3053 } 3054 3055 return r; 3056 } 3057 3058 static void cache_resume(struct dm_target *ti) 3059 { 3060 struct cache *cache = ti->private; 3061 3062 cache->need_tick_bio = true; 3063 allow_background_work(cache); 3064 do_waker(&cache->waker.work); 3065 } 3066 3067 static void emit_flags(struct cache *cache, char *result, 3068 unsigned maxlen, ssize_t *sz_ptr) 3069 { 3070 ssize_t sz = *sz_ptr; 3071 struct cache_features *cf = &cache->features; 3072 unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3073 3074 DMEMIT("%u ", count); 3075 3076 if (cf->metadata_version == 2) 3077 DMEMIT("metadata2 "); 3078 3079 if (writethrough_mode(cache)) 3080 DMEMIT("writethrough "); 3081 3082 else if (passthrough_mode(cache)) 3083 DMEMIT("passthrough "); 3084 3085 else if (writeback_mode(cache)) 3086 DMEMIT("writeback "); 3087 3088 else { 3089 DMEMIT("unknown "); 3090 DMERR("%s: internal error: unknown io mode: %d", 3091 cache_device_name(cache), (int) cf->io_mode); 3092 } 3093 3094 if (!cf->discard_passdown) 3095 DMEMIT("no_discard_passdown "); 3096 3097 *sz_ptr = sz; 3098 } 3099 3100 /* 3101 * Status format: 3102 * 3103 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3104 * <cache block size> <#used cache blocks>/<#total cache blocks> 3105 * <#read hits> <#read misses> <#write hits> <#write misses> 3106 * <#demotions> <#promotions> <#dirty> 3107 * <#features> <features>* 3108 * <#core args> <core args> 3109 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3110 */ 3111 static void cache_status(struct dm_target *ti, status_type_t type, 3112 unsigned status_flags, char *result, unsigned maxlen) 3113 { 3114 int r = 0; 3115 unsigned i; 3116 ssize_t sz = 0; 3117 dm_block_t nr_free_blocks_metadata = 0; 3118 dm_block_t nr_blocks_metadata = 0; 3119 char buf[BDEVNAME_SIZE]; 3120 struct cache *cache = ti->private; 3121 dm_cblock_t residency; 3122 bool needs_check; 3123 3124 switch (type) { 3125 case STATUSTYPE_INFO: 3126 if (get_cache_mode(cache) == CM_FAIL) { 3127 DMEMIT("Fail"); 3128 break; 3129 } 3130 3131 /* Commit to ensure statistics aren't out-of-date */ 3132 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3133 (void) commit(cache, false); 3134 3135 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3136 if (r) { 3137 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3138 cache_device_name(cache), r); 3139 goto err; 3140 } 3141 3142 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3143 if (r) { 3144 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3145 cache_device_name(cache), r); 3146 goto err; 3147 } 3148 3149 residency = policy_residency(cache->policy); 3150 3151 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3152 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3153 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3154 (unsigned long long)nr_blocks_metadata, 3155 (unsigned long long)cache->sectors_per_block, 3156 (unsigned long long) from_cblock(residency), 3157 (unsigned 
long long) from_cblock(cache->cache_size), 3158 (unsigned) atomic_read(&cache->stats.read_hit), 3159 (unsigned) atomic_read(&cache->stats.read_miss), 3160 (unsigned) atomic_read(&cache->stats.write_hit), 3161 (unsigned) atomic_read(&cache->stats.write_miss), 3162 (unsigned) atomic_read(&cache->stats.demotion), 3163 (unsigned) atomic_read(&cache->stats.promotion), 3164 (unsigned long) atomic_read(&cache->nr_dirty)); 3165 3166 emit_flags(cache, result, maxlen, &sz); 3167 3168 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3169 3170 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3171 if (sz < maxlen) { 3172 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3173 if (r) 3174 DMERR("%s: policy_emit_config_values returned %d", 3175 cache_device_name(cache), r); 3176 } 3177 3178 if (get_cache_mode(cache) == CM_READ_ONLY) 3179 DMEMIT("ro "); 3180 else 3181 DMEMIT("rw "); 3182 3183 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3184 3185 if (r || needs_check) 3186 DMEMIT("needs_check "); 3187 else 3188 DMEMIT("- "); 3189 3190 break; 3191 3192 case STATUSTYPE_TABLE: 3193 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3194 DMEMIT("%s ", buf); 3195 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3196 DMEMIT("%s ", buf); 3197 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3198 DMEMIT("%s", buf); 3199 3200 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3201 DMEMIT(" %s", cache->ctr_args[i]); 3202 if (cache->nr_ctr_args) 3203 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3204 } 3205 3206 return; 3207 3208 err: 3209 DMEMIT("Error"); 3210 } 3211 3212 /* 3213 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3214 * the one-past-the-end value. 3215 */ 3216 struct cblock_range { 3217 dm_cblock_t begin; 3218 dm_cblock_t end; 3219 }; 3220 3221 /* 3222 * A cache block range can take two forms: 3223 * 3224 * i) A single cblock, eg. '3456' 3225 * ii) A begin and end cblock with a dash between, eg. 123-234 3226 */ 3227 static int parse_cblock_range(struct cache *cache, const char *str, 3228 struct cblock_range *result) 3229 { 3230 char dummy; 3231 uint64_t b, e; 3232 int r; 3233 3234 /* 3235 * Try and parse form (ii) first. 3236 */ 3237 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3238 if (r < 0) 3239 return r; 3240 3241 if (r == 2) { 3242 result->begin = to_cblock(b); 3243 result->end = to_cblock(e); 3244 return 0; 3245 } 3246 3247 /* 3248 * That didn't work, try form (i). 
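	 * A single cblock 'n' is treated as the half-open range [n, n + 1).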
	 */
	r = sscanf(str, "%llu%c", &b, &dummy);
	if (r < 0)
		return r;

	if (r == 1) {
		result->begin = to_cblock(b);
		result->end = to_cblock(from_cblock(result->begin) + 1u);
		return 0;
	}

	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
	return -EINVAL;
}

static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
{
	uint64_t b = from_cblock(range->begin);
	uint64_t e = from_cblock(range->end);
	uint64_t n = from_cblock(cache->cache_size);

	if (b >= n) {
		DMERR("%s: begin cblock out of range: %llu >= %llu",
		      cache_device_name(cache), b, n);
		return -EINVAL;
	}

	if (e > n) {
		DMERR("%s: end cblock out of range: %llu > %llu",
		      cache_device_name(cache), e, n);
		return -EINVAL;
	}

	if (b >= e) {
		DMERR("%s: invalid cblock range: %llu >= %llu",
		      cache_device_name(cache), b, e);
		return -EINVAL;
	}

	return 0;
}

static inline dm_cblock_t cblock_succ(dm_cblock_t b)
{
	return to_cblock(from_cblock(b) + 1);
}

static int request_invalidation(struct cache *cache, struct cblock_range *range)
{
	int r = 0;

	/*
	 * We don't need to do any locking here because we know we're in
	 * passthrough mode. There is potential for a race between an
	 * invalidation triggered by an io and an invalidation message. This
	 * is harmless, we needn't worry if the policy call fails.
	 */
	while (range->begin != range->end) {
		r = invalidate_cblock(cache, range->begin);
		if (r)
			return r;

		range->begin = cblock_succ(range->begin);
	}

	cache->commit_requested = true;
	return r;
}

static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
					      const char **cblock_ranges)
{
	int r = 0;
	unsigned i;
	struct cblock_range range;

	if (!passthrough_mode(cache)) {
		DMERR("%s: cache has to be in passthrough mode for invalidation",
		      cache_device_name(cache));
		return -EPERM;
	}

	for (i = 0; i < count; i++) {
		r = parse_cblock_range(cache, cblock_ranges[i], &range);
		if (r)
			break;

		r = validate_cblock_range(cache, &range);
		if (r)
			break;

		/*
		 * Pass begin and end origin blocks to the worker and wake it.
		 */
		r = request_invalidation(cache, &range);
		if (r)
			break;
	}

	return r;
}

/*
 * Supports
 *	"<key> <value>"
 * and
 *	"invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
 *
 * The key migration_threshold is supported by the cache target core.
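 *
 * Illustrative usage from userspace (the device name 'my_cache' is just an
 * example):
 *
 *	dmsetup message my_cache 0 migration_threshold 4096
 *	dmsetup message my_cache 0 invalidate_cblocks 3456 7000-8000
 *
 * invalidate_cblocks is only accepted while the cache is in passthrough
 * mode.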
3358 */ 3359 static int cache_message(struct dm_target *ti, unsigned argc, char **argv, 3360 char *result, unsigned maxlen) 3361 { 3362 struct cache *cache = ti->private; 3363 3364 if (!argc) 3365 return -EINVAL; 3366 3367 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3368 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3369 cache_device_name(cache)); 3370 return -EOPNOTSUPP; 3371 } 3372 3373 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3374 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3375 3376 if (argc != 2) 3377 return -EINVAL; 3378 3379 return set_config_value(cache, argv[0], argv[1]); 3380 } 3381 3382 static int cache_iterate_devices(struct dm_target *ti, 3383 iterate_devices_callout_fn fn, void *data) 3384 { 3385 int r = 0; 3386 struct cache *cache = ti->private; 3387 3388 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3389 if (!r) 3390 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3391 3392 return r; 3393 } 3394 3395 static bool origin_dev_supports_discard(struct block_device *origin_bdev) 3396 { 3397 struct request_queue *q = bdev_get_queue(origin_bdev); 3398 3399 return q && blk_queue_discard(q); 3400 } 3401 3402 /* 3403 * If discard_passdown was enabled verify that the origin device 3404 * supports discards. Disable discard_passdown if not. 3405 */ 3406 static void disable_passdown_if_not_supported(struct cache *cache) 3407 { 3408 struct block_device *origin_bdev = cache->origin_dev->bdev; 3409 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3410 const char *reason = NULL; 3411 char buf[BDEVNAME_SIZE]; 3412 3413 if (!cache->features.discard_passdown) 3414 return; 3415 3416 if (!origin_dev_supports_discard(origin_bdev)) 3417 reason = "discard unsupported"; 3418 3419 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3420 reason = "max discard sectors smaller than a block"; 3421 3422 if (reason) { 3423 DMWARN("Origin device (%s) %s: Disabling discard passdown.", 3424 bdevname(origin_bdev, buf), reason); 3425 cache->features.discard_passdown = false; 3426 } 3427 } 3428 3429 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3430 { 3431 struct block_device *origin_bdev = cache->origin_dev->bdev; 3432 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3433 3434 if (!cache->features.discard_passdown) { 3435 /* No passdown is done so setting own virtual limits */ 3436 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3437 cache->origin_sectors); 3438 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3439 return; 3440 } 3441 3442 /* 3443 * cache_iterate_devices() is stacking both origin and fast device limits 3444 * but discards aren't passed to fast device, so inherit origin's limits. 
3445 */ 3446 limits->max_discard_sectors = origin_limits->max_discard_sectors; 3447 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3448 limits->discard_granularity = origin_limits->discard_granularity; 3449 limits->discard_alignment = origin_limits->discard_alignment; 3450 limits->discard_misaligned = origin_limits->discard_misaligned; 3451 } 3452 3453 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3454 { 3455 struct cache *cache = ti->private; 3456 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3457 3458 /* 3459 * If the system-determined stacked limits are compatible with the 3460 * cache's blocksize (io_opt is a factor) do not override them. 3461 */ 3462 if (io_opt_sectors < cache->sectors_per_block || 3463 do_div(io_opt_sectors, cache->sectors_per_block)) { 3464 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3465 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3466 } 3467 3468 disable_passdown_if_not_supported(cache); 3469 set_discard_limits(cache, limits); 3470 } 3471 3472 /*----------------------------------------------------------------*/ 3473 3474 static struct target_type cache_target = { 3475 .name = "cache", 3476 .version = {2, 2, 0}, 3477 .module = THIS_MODULE, 3478 .ctr = cache_ctr, 3479 .dtr = cache_dtr, 3480 .map = cache_map, 3481 .end_io = cache_end_io, 3482 .postsuspend = cache_postsuspend, 3483 .preresume = cache_preresume, 3484 .resume = cache_resume, 3485 .status = cache_status, 3486 .message = cache_message, 3487 .iterate_devices = cache_iterate_devices, 3488 .io_hints = cache_io_hints, 3489 }; 3490 3491 static int __init dm_cache_init(void) 3492 { 3493 int r; 3494 3495 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3496 if (!migration_cache) 3497 return -ENOMEM; 3498 3499 r = dm_register_target(&cache_target); 3500 if (r) { 3501 DMERR("cache target registration failed: %d", r); 3502 kmem_cache_destroy(migration_cache); 3503 return r; 3504 } 3505 3506 return 0; 3507 } 3508 3509 static void __exit dm_cache_exit(void) 3510 { 3511 dm_unregister_target(&cache_target); 3512 kmem_cache_destroy(migration_cache); 3513 } 3514 3515 module_init(dm_cache_init); 3516 module_exit(dm_cache_exit); 3517 3518 MODULE_DESCRIPTION(DM_NAME " cache target"); 3519 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3520 MODULE_LICENSE("GPL"); 3521
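
/*
 * Illustrative example only (device paths and sizes are made up): a table
 * for this target with 512-sector (256 KiB) cache blocks, the writethrough
 * feature and the default policy with no tuning arguments:
 *
 *	dmsetup create cached --table \
 *	  "0 41943040 cache /dev/sdb1 /dev/sdb2 /dev/sdc 512 1 writethrough default 0"
 */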