1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison-v2.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/rwsem.h> 19 #include <linux/slab.h> 20 #include <linux/vmalloc.h> 21 22 #define DM_MSG_PREFIX "cache" 23 24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27 /*----------------------------------------------------------------*/ 28 29 /* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40 /*----------------------------------------------------------------*/ 41 42 struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56 }; 57 58 static void iot_init(struct io_tracker *iot) 59 { 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64 } 65 66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67 { 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72 } 73 74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75 { 76 bool r; 77 unsigned long flags; 78 79 spin_lock_irqsave(&iot->lock, flags); 80 r = __iot_idle_for(iot, jifs); 81 spin_unlock_irqrestore(&iot->lock, flags); 82 83 return r; 84 } 85 86 static void iot_io_begin(struct io_tracker *iot, sector_t len) 87 { 88 unsigned long flags; 89 90 spin_lock_irqsave(&iot->lock, flags); 91 iot->in_flight += len; 92 spin_unlock_irqrestore(&iot->lock, flags); 93 } 94 95 static void __iot_io_end(struct io_tracker *iot, sector_t len) 96 { 97 if (!len) 98 return; 99 100 iot->in_flight -= len; 101 if (!iot->in_flight) 102 iot->idle_time = jiffies; 103 } 104 105 static void iot_io_end(struct io_tracker *iot, sector_t len) 106 { 107 unsigned long flags; 108 109 spin_lock_irqsave(&iot->lock, flags); 110 __iot_io_end(iot, len); 111 spin_unlock_irqrestore(&iot->lock, flags); 112 } 113 114 /*----------------------------------------------------------------*/ 115 116 /* 117 * Represents a chunk of future work. 'input' allows continuations to pass 118 * values between themselves, typically error values. 119 */ 120 struct continuation { 121 struct work_struct ws; 122 blk_status_t input; 123 }; 124 125 static inline void init_continuation(struct continuation *k, 126 void (*fn)(struct work_struct *)) 127 { 128 INIT_WORK(&k->ws, fn); 129 k->input = 0; 130 } 131 132 static inline void queue_continuation(struct workqueue_struct *wq, 133 struct continuation *k) 134 { 135 queue_work(wq, &k->ws); 136 } 137 138 /*----------------------------------------------------------------*/ 139 140 /* 141 * The batcher collects together pieces of work that need a particular 142 * operation to occur before they can proceed (typically a commit). 
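 *
 * Rough usage sketch (as wired up later in this file, not a spec): work is
 * queued with continue_after_commit() or issue_after_commit(), then
 * schedule_commit() is called; __commit() runs commit_op and either
 * requeues the continuations and issues the bios, or errors the bios if
 * the commit failed.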
143 */ 144 struct batcher { 145 /* 146 * The operation that everyone is waiting for. 147 */ 148 blk_status_t (*commit_op)(void *context); 149 void *commit_context; 150 151 /* 152 * This is how bios should be issued once the commit op is complete 153 * (accounted_request). 154 */ 155 void (*issue_op)(struct bio *bio, void *context); 156 void *issue_context; 157 158 /* 159 * Queued work gets put on here after commit. 160 */ 161 struct workqueue_struct *wq; 162 163 spinlock_t lock; 164 struct list_head work_items; 165 struct bio_list bios; 166 struct work_struct commit_work; 167 168 bool commit_scheduled; 169 }; 170 171 static void __commit(struct work_struct *_ws) 172 { 173 struct batcher *b = container_of(_ws, struct batcher, commit_work); 174 blk_status_t r; 175 unsigned long flags; 176 struct list_head work_items; 177 struct work_struct *ws, *tmp; 178 struct continuation *k; 179 struct bio *bio; 180 struct bio_list bios; 181 182 INIT_LIST_HEAD(&work_items); 183 bio_list_init(&bios); 184 185 /* 186 * We have to grab these before the commit_op to avoid a race 187 * condition. 188 */ 189 spin_lock_irqsave(&b->lock, flags); 190 list_splice_init(&b->work_items, &work_items); 191 bio_list_merge(&bios, &b->bios); 192 bio_list_init(&b->bios); 193 b->commit_scheduled = false; 194 spin_unlock_irqrestore(&b->lock, flags); 195 196 r = b->commit_op(b->commit_context); 197 198 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 199 k = container_of(ws, struct continuation, ws); 200 k->input = r; 201 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 202 queue_work(b->wq, ws); 203 } 204 205 while ((bio = bio_list_pop(&bios))) { 206 if (r) { 207 bio->bi_status = r; 208 bio_endio(bio); 209 } else 210 b->issue_op(bio, b->issue_context); 211 } 212 } 213 214 static void batcher_init(struct batcher *b, 215 blk_status_t (*commit_op)(void *), 216 void *commit_context, 217 void (*issue_op)(struct bio *bio, void *), 218 void *issue_context, 219 struct workqueue_struct *wq) 220 { 221 b->commit_op = commit_op; 222 b->commit_context = commit_context; 223 b->issue_op = issue_op; 224 b->issue_context = issue_context; 225 b->wq = wq; 226 227 spin_lock_init(&b->lock); 228 INIT_LIST_HEAD(&b->work_items); 229 bio_list_init(&b->bios); 230 INIT_WORK(&b->commit_work, __commit); 231 b->commit_scheduled = false; 232 } 233 234 static void async_commit(struct batcher *b) 235 { 236 queue_work(b->wq, &b->commit_work); 237 } 238 239 static void continue_after_commit(struct batcher *b, struct continuation *k) 240 { 241 unsigned long flags; 242 bool commit_scheduled; 243 244 spin_lock_irqsave(&b->lock, flags); 245 commit_scheduled = b->commit_scheduled; 246 list_add_tail(&k->ws.entry, &b->work_items); 247 spin_unlock_irqrestore(&b->lock, flags); 248 249 if (commit_scheduled) 250 async_commit(b); 251 } 252 253 /* 254 * Bios are errored if commit failed. 255 */ 256 static void issue_after_commit(struct batcher *b, struct bio *bio) 257 { 258 unsigned long flags; 259 bool commit_scheduled; 260 261 spin_lock_irqsave(&b->lock, flags); 262 commit_scheduled = b->commit_scheduled; 263 bio_list_add(&b->bios, bio); 264 spin_unlock_irqrestore(&b->lock, flags); 265 266 if (commit_scheduled) 267 async_commit(b); 268 } 269 270 /* 271 * Call this if some urgent work is waiting for the commit to complete. 
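 *
 * For example, process_deferred_bios() calls this once it has queued bios
 * via issue_after_commit(), and do_waker() calls it periodically.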
 */
static void schedule_commit(struct batcher *b)
{
	bool immediate;
	unsigned long flags;

	spin_lock_irqsave(&b->lock, flags);
	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
	b->commit_scheduled = true;
	spin_unlock_irqrestore(&b->lock, flags);

	if (immediate)
		async_commit(b);
}

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function.  We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned metadata_version;
	bool discard_passdown:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	spinlock_t lock;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	int sectors_per_block_shift;
	sector_t sectors_per_block;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
408 */ 409 dm_cblock_t cache_size; 410 411 /* 412 * Invalidation fields. 413 */ 414 spinlock_t invalidation_lock; 415 struct list_head invalidation_requests; 416 417 sector_t migration_threshold; 418 wait_queue_head_t migration_wait; 419 atomic_t nr_allocated_migrations; 420 421 /* 422 * The number of in flight migrations that are performing 423 * background io. eg, promotion, writeback. 424 */ 425 atomic_t nr_io_migrations; 426 427 struct bio_list deferred_bios; 428 429 struct rw_semaphore quiesce_lock; 430 431 struct dm_target_callbacks callbacks; 432 433 /* 434 * origin_blocks entries, discarded if set. 435 */ 436 dm_dblock_t discard_nr_blocks; 437 unsigned long *discard_bitset; 438 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 439 440 /* 441 * Rather than reconstructing the table line for the status we just 442 * save it and regurgitate. 443 */ 444 unsigned nr_ctr_args; 445 const char **ctr_args; 446 447 struct dm_kcopyd_client *copier; 448 struct work_struct deferred_bio_worker; 449 struct work_struct migration_worker; 450 struct workqueue_struct *wq; 451 struct delayed_work waker; 452 struct dm_bio_prison_v2 *prison; 453 454 /* 455 * cache_size entries, dirty if set 456 */ 457 unsigned long *dirty_bitset; 458 atomic_t nr_dirty; 459 460 unsigned policy_nr_args; 461 struct dm_cache_policy *policy; 462 463 /* 464 * Cache features such as write-through. 465 */ 466 struct cache_features features; 467 468 struct cache_stats stats; 469 470 bool need_tick_bio:1; 471 bool sized:1; 472 bool invalidate:1; 473 bool commit_requested:1; 474 bool loaded_mappings:1; 475 bool loaded_discards:1; 476 477 struct rw_semaphore background_work_lock; 478 479 struct batcher committer; 480 struct work_struct commit_ws; 481 482 struct io_tracker tracker; 483 484 mempool_t migration_pool; 485 486 struct bio_set bs; 487 }; 488 489 struct per_bio_data { 490 bool tick:1; 491 unsigned req_nr:2; 492 struct dm_bio_prison_cell_v2 *cell; 493 struct dm_hook_info hook_info; 494 sector_t len; 495 }; 496 497 struct dm_cache_migration { 498 struct continuation k; 499 struct cache *cache; 500 501 struct policy_work *op; 502 struct bio *overwrite_bio; 503 struct dm_bio_prison_cell_v2 *cell; 504 505 dm_cblock_t invalidate_cblock; 506 dm_oblock_t invalidate_oblock; 507 }; 508 509 /*----------------------------------------------------------------*/ 510 511 static bool writethrough_mode(struct cache *cache) 512 { 513 return cache->features.io_mode == CM_IO_WRITETHROUGH; 514 } 515 516 static bool writeback_mode(struct cache *cache) 517 { 518 return cache->features.io_mode == CM_IO_WRITEBACK; 519 } 520 521 static inline bool passthrough_mode(struct cache *cache) 522 { 523 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 524 } 525 526 /*----------------------------------------------------------------*/ 527 528 static void wake_deferred_bio_worker(struct cache *cache) 529 { 530 queue_work(cache->wq, &cache->deferred_bio_worker); 531 } 532 533 static void wake_migration_worker(struct cache *cache) 534 { 535 if (passthrough_mode(cache)) 536 return; 537 538 queue_work(cache->wq, &cache->migration_worker); 539 } 540 541 /*----------------------------------------------------------------*/ 542 543 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 544 { 545 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); 546 } 547 548 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 549 { 550 dm_bio_prison_free_cell_v2(cache->prison, cell); 
551 } 552 553 static struct dm_cache_migration *alloc_migration(struct cache *cache) 554 { 555 struct dm_cache_migration *mg; 556 557 mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); 558 559 memset(mg, 0, sizeof(*mg)); 560 561 mg->cache = cache; 562 atomic_inc(&cache->nr_allocated_migrations); 563 564 return mg; 565 } 566 567 static void free_migration(struct dm_cache_migration *mg) 568 { 569 struct cache *cache = mg->cache; 570 571 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 572 wake_up(&cache->migration_wait); 573 574 mempool_free(mg, &cache->migration_pool); 575 } 576 577 /*----------------------------------------------------------------*/ 578 579 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 580 { 581 return to_oblock(from_oblock(b) + 1ull); 582 } 583 584 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 585 { 586 key->virtual = 0; 587 key->dev = 0; 588 key->block_begin = from_oblock(begin); 589 key->block_end = from_oblock(end); 590 } 591 592 /* 593 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 594 * level 1 which prevents *both* READs and WRITEs. 595 */ 596 #define WRITE_LOCK_LEVEL 0 597 #define READ_WRITE_LOCK_LEVEL 1 598 599 static unsigned lock_level(struct bio *bio) 600 { 601 return bio_data_dir(bio) == WRITE ? 602 WRITE_LOCK_LEVEL : 603 READ_WRITE_LOCK_LEVEL; 604 } 605 606 /*---------------------------------------------------------------- 607 * Per bio data 608 *--------------------------------------------------------------*/ 609 610 static struct per_bio_data *get_per_bio_data(struct bio *bio) 611 { 612 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 613 BUG_ON(!pb); 614 return pb; 615 } 616 617 static struct per_bio_data *init_per_bio_data(struct bio *bio) 618 { 619 struct per_bio_data *pb = get_per_bio_data(bio); 620 621 pb->tick = false; 622 pb->req_nr = dm_bio_get_target_bio_nr(bio); 623 pb->cell = NULL; 624 pb->len = 0; 625 626 return pb; 627 } 628 629 /*----------------------------------------------------------------*/ 630 631 static void defer_bio(struct cache *cache, struct bio *bio) 632 { 633 unsigned long flags; 634 635 spin_lock_irqsave(&cache->lock, flags); 636 bio_list_add(&cache->deferred_bios, bio); 637 spin_unlock_irqrestore(&cache->lock, flags); 638 639 wake_deferred_bio_worker(cache); 640 } 641 642 static void defer_bios(struct cache *cache, struct bio_list *bios) 643 { 644 unsigned long flags; 645 646 spin_lock_irqsave(&cache->lock, flags); 647 bio_list_merge(&cache->deferred_bios, bios); 648 bio_list_init(bios); 649 spin_unlock_irqrestore(&cache->lock, flags); 650 651 wake_deferred_bio_worker(cache); 652 } 653 654 /*----------------------------------------------------------------*/ 655 656 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 657 { 658 bool r; 659 struct per_bio_data *pb; 660 struct dm_cell_key_v2 key; 661 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 662 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 663 664 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 665 666 build_key(oblock, end, &key); 667 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 668 if (!r) { 669 /* 670 * Failed to get the lock. 
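		 * The bio is now held by the prison (queued on the cell)
		 * and will be deferred again when the exclusive holder
		 * unlocks (see mg_complete()), so all we do here is free
		 * the unused preallocated cell.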
671 */ 672 free_prison_cell(cache, cell_prealloc); 673 return r; 674 } 675 676 if (cell != cell_prealloc) 677 free_prison_cell(cache, cell_prealloc); 678 679 pb = get_per_bio_data(bio); 680 pb->cell = cell; 681 682 return r; 683 } 684 685 /*----------------------------------------------------------------*/ 686 687 static bool is_dirty(struct cache *cache, dm_cblock_t b) 688 { 689 return test_bit(from_cblock(b), cache->dirty_bitset); 690 } 691 692 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 693 { 694 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 695 atomic_inc(&cache->nr_dirty); 696 policy_set_dirty(cache->policy, cblock); 697 } 698 } 699 700 /* 701 * These two are called when setting after migrations to force the policy 702 * and dirty bitset to be in sync. 703 */ 704 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 705 { 706 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 707 atomic_inc(&cache->nr_dirty); 708 policy_set_dirty(cache->policy, cblock); 709 } 710 711 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 712 { 713 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 714 if (atomic_dec_return(&cache->nr_dirty) == 0) 715 dm_table_event(cache->ti->table); 716 } 717 718 policy_clear_dirty(cache->policy, cblock); 719 } 720 721 /*----------------------------------------------------------------*/ 722 723 static bool block_size_is_power_of_two(struct cache *cache) 724 { 725 return cache->sectors_per_block_shift >= 0; 726 } 727 728 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 729 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 730 __always_inline 731 #endif 732 static dm_block_t block_div(dm_block_t b, uint32_t n) 733 { 734 do_div(b, n); 735 736 return b; 737 } 738 739 static dm_block_t oblocks_per_dblock(struct cache *cache) 740 { 741 dm_block_t oblocks = cache->discard_block_size; 742 743 if (block_size_is_power_of_two(cache)) 744 oblocks >>= cache->sectors_per_block_shift; 745 else 746 oblocks = block_div(oblocks, cache->sectors_per_block); 747 748 return oblocks; 749 } 750 751 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 752 { 753 return to_dblock(block_div(from_oblock(oblock), 754 oblocks_per_dblock(cache))); 755 } 756 757 static void set_discard(struct cache *cache, dm_dblock_t b) 758 { 759 unsigned long flags; 760 761 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 762 atomic_inc(&cache->stats.discard_count); 763 764 spin_lock_irqsave(&cache->lock, flags); 765 set_bit(from_dblock(b), cache->discard_bitset); 766 spin_unlock_irqrestore(&cache->lock, flags); 767 } 768 769 static void clear_discard(struct cache *cache, dm_dblock_t b) 770 { 771 unsigned long flags; 772 773 spin_lock_irqsave(&cache->lock, flags); 774 clear_bit(from_dblock(b), cache->discard_bitset); 775 spin_unlock_irqrestore(&cache->lock, flags); 776 } 777 778 static bool is_discarded(struct cache *cache, dm_dblock_t b) 779 { 780 int r; 781 unsigned long flags; 782 783 spin_lock_irqsave(&cache->lock, flags); 784 r = test_bit(from_dblock(b), cache->discard_bitset); 785 spin_unlock_irqrestore(&cache->lock, flags); 786 787 return r; 788 } 789 790 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 791 { 792 int r; 793 unsigned long flags; 794 795 spin_lock_irqsave(&cache->lock, flags); 796 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 797 cache->discard_bitset); 798 
spin_unlock_irqrestore(&cache->lock, flags); 799 800 return r; 801 } 802 803 /*---------------------------------------------------------------- 804 * Remapping 805 *--------------------------------------------------------------*/ 806 static void remap_to_origin(struct cache *cache, struct bio *bio) 807 { 808 bio_set_dev(bio, cache->origin_dev->bdev); 809 } 810 811 static void remap_to_cache(struct cache *cache, struct bio *bio, 812 dm_cblock_t cblock) 813 { 814 sector_t bi_sector = bio->bi_iter.bi_sector; 815 sector_t block = from_cblock(cblock); 816 817 bio_set_dev(bio, cache->cache_dev->bdev); 818 if (!block_size_is_power_of_two(cache)) 819 bio->bi_iter.bi_sector = 820 (block * cache->sectors_per_block) + 821 sector_div(bi_sector, cache->sectors_per_block); 822 else 823 bio->bi_iter.bi_sector = 824 (block << cache->sectors_per_block_shift) | 825 (bi_sector & (cache->sectors_per_block - 1)); 826 } 827 828 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 829 { 830 unsigned long flags; 831 struct per_bio_data *pb; 832 833 spin_lock_irqsave(&cache->lock, flags); 834 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 835 bio_op(bio) != REQ_OP_DISCARD) { 836 pb = get_per_bio_data(bio); 837 pb->tick = true; 838 cache->need_tick_bio = false; 839 } 840 spin_unlock_irqrestore(&cache->lock, flags); 841 } 842 843 static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 844 dm_oblock_t oblock, bool bio_has_pbd) 845 { 846 if (bio_has_pbd) 847 check_if_tick_bio_needed(cache, bio); 848 remap_to_origin(cache, bio); 849 if (bio_data_dir(bio) == WRITE) 850 clear_discard(cache, oblock_to_dblock(cache, oblock)); 851 } 852 853 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 854 dm_oblock_t oblock) 855 { 856 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 857 __remap_to_origin_clear_discard(cache, bio, oblock, true); 858 } 859 860 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 861 dm_oblock_t oblock, dm_cblock_t cblock) 862 { 863 check_if_tick_bio_needed(cache, bio); 864 remap_to_cache(cache, bio, cblock); 865 if (bio_data_dir(bio) == WRITE) { 866 set_dirty(cache, cblock); 867 clear_discard(cache, oblock_to_dblock(cache, oblock)); 868 } 869 } 870 871 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 872 { 873 sector_t block_nr = bio->bi_iter.bi_sector; 874 875 if (!block_size_is_power_of_two(cache)) 876 (void) sector_div(block_nr, cache->sectors_per_block); 877 else 878 block_nr >>= cache->sectors_per_block_shift; 879 880 return to_oblock(block_nr); 881 } 882 883 static bool accountable_bio(struct cache *cache, struct bio *bio) 884 { 885 return bio_op(bio) != REQ_OP_DISCARD; 886 } 887 888 static void accounted_begin(struct cache *cache, struct bio *bio) 889 { 890 struct per_bio_data *pb; 891 892 if (accountable_bio(cache, bio)) { 893 pb = get_per_bio_data(bio); 894 pb->len = bio_sectors(bio); 895 iot_io_begin(&cache->tracker, pb->len); 896 } 897 } 898 899 static void accounted_complete(struct cache *cache, struct bio *bio) 900 { 901 struct per_bio_data *pb = get_per_bio_data(bio); 902 903 iot_io_end(&cache->tracker, pb->len); 904 } 905 906 static void accounted_request(struct cache *cache, struct bio *bio) 907 { 908 accounted_begin(cache, bio); 909 generic_make_request(bio); 910 } 911 912 static void issue_op(struct bio *bio, void *context) 913 { 914 struct cache *cache = context; 915 accounted_request(cache, bio); 916 } 917 918 /* 919 * When 
running in writethrough mode we need to send writes to clean blocks 920 * to both the cache and origin devices. Clone the bio and send them in parallel. 921 */ 922 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 923 dm_oblock_t oblock, dm_cblock_t cblock) 924 { 925 struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); 926 927 BUG_ON(!origin_bio); 928 929 bio_chain(origin_bio, bio); 930 /* 931 * Passing false to __remap_to_origin_clear_discard() skips 932 * all code that might use per_bio_data (since clone doesn't have it) 933 */ 934 __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); 935 submit_bio(origin_bio); 936 937 remap_to_cache(cache, bio, cblock); 938 } 939 940 /*---------------------------------------------------------------- 941 * Failure modes 942 *--------------------------------------------------------------*/ 943 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 944 { 945 return cache->features.mode; 946 } 947 948 static const char *cache_device_name(struct cache *cache) 949 { 950 return dm_device_name(dm_table_get_md(cache->ti->table)); 951 } 952 953 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 954 { 955 const char *descs[] = { 956 "write", 957 "read-only", 958 "fail" 959 }; 960 961 dm_table_event(cache->ti->table); 962 DMINFO("%s: switching cache to %s mode", 963 cache_device_name(cache), descs[(int)mode]); 964 } 965 966 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 967 { 968 bool needs_check; 969 enum cache_metadata_mode old_mode = get_cache_mode(cache); 970 971 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 972 DMERR("%s: unable to read needs_check flag, setting failure mode.", 973 cache_device_name(cache)); 974 new_mode = CM_FAIL; 975 } 976 977 if (new_mode == CM_WRITE && needs_check) { 978 DMERR("%s: unable to switch cache to write mode until repaired.", 979 cache_device_name(cache)); 980 if (old_mode != new_mode) 981 new_mode = old_mode; 982 else 983 new_mode = CM_READ_ONLY; 984 } 985 986 /* Never move out of fail mode */ 987 if (old_mode == CM_FAIL) 988 new_mode = CM_FAIL; 989 990 switch (new_mode) { 991 case CM_FAIL: 992 case CM_READ_ONLY: 993 dm_cache_metadata_set_read_only(cache->cmd); 994 break; 995 996 case CM_WRITE: 997 dm_cache_metadata_set_read_write(cache->cmd); 998 break; 999 } 1000 1001 cache->features.mode = new_mode; 1002 1003 if (new_mode != old_mode) 1004 notify_mode_switch(cache, new_mode); 1005 } 1006 1007 static void abort_transaction(struct cache *cache) 1008 { 1009 const char *dev_name = cache_device_name(cache); 1010 1011 if (get_cache_mode(cache) >= CM_READ_ONLY) 1012 return; 1013 1014 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1015 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1016 set_cache_mode(cache, CM_FAIL); 1017 } 1018 1019 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1020 if (dm_cache_metadata_abort(cache->cmd)) { 1021 DMERR("%s: failed to abort metadata transaction", dev_name); 1022 set_cache_mode(cache, CM_FAIL); 1023 } 1024 } 1025 1026 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1027 { 1028 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1029 cache_device_name(cache), op, r); 1030 abort_transaction(cache); 1031 set_cache_mode(cache, CM_READ_ONLY); 1032 } 1033 1034 /*----------------------------------------------------------------*/ 1035 1036 static void 
load_stats(struct cache *cache) 1037 { 1038 struct dm_cache_statistics stats; 1039 1040 dm_cache_metadata_get_stats(cache->cmd, &stats); 1041 atomic_set(&cache->stats.read_hit, stats.read_hits); 1042 atomic_set(&cache->stats.read_miss, stats.read_misses); 1043 atomic_set(&cache->stats.write_hit, stats.write_hits); 1044 atomic_set(&cache->stats.write_miss, stats.write_misses); 1045 } 1046 1047 static void save_stats(struct cache *cache) 1048 { 1049 struct dm_cache_statistics stats; 1050 1051 if (get_cache_mode(cache) >= CM_READ_ONLY) 1052 return; 1053 1054 stats.read_hits = atomic_read(&cache->stats.read_hit); 1055 stats.read_misses = atomic_read(&cache->stats.read_miss); 1056 stats.write_hits = atomic_read(&cache->stats.write_hit); 1057 stats.write_misses = atomic_read(&cache->stats.write_miss); 1058 1059 dm_cache_metadata_set_stats(cache->cmd, &stats); 1060 } 1061 1062 static void update_stats(struct cache_stats *stats, enum policy_operation op) 1063 { 1064 switch (op) { 1065 case POLICY_PROMOTE: 1066 atomic_inc(&stats->promotion); 1067 break; 1068 1069 case POLICY_DEMOTE: 1070 atomic_inc(&stats->demotion); 1071 break; 1072 1073 case POLICY_WRITEBACK: 1074 atomic_inc(&stats->writeback); 1075 break; 1076 } 1077 } 1078 1079 /*---------------------------------------------------------------- 1080 * Migration processing 1081 * 1082 * Migration covers moving data from the origin device to the cache, or 1083 * vice versa. 1084 *--------------------------------------------------------------*/ 1085 1086 static void inc_io_migrations(struct cache *cache) 1087 { 1088 atomic_inc(&cache->nr_io_migrations); 1089 } 1090 1091 static void dec_io_migrations(struct cache *cache) 1092 { 1093 atomic_dec(&cache->nr_io_migrations); 1094 } 1095 1096 static bool discard_or_flush(struct bio *bio) 1097 { 1098 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1099 } 1100 1101 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1102 dm_dblock_t *b, dm_dblock_t *e) 1103 { 1104 sector_t sb = bio->bi_iter.bi_sector; 1105 sector_t se = bio_end_sector(bio); 1106 1107 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1108 1109 if (se - sb < cache->discard_block_size) 1110 *e = *b; 1111 else 1112 *e = to_dblock(block_div(se, cache->discard_block_size)); 1113 } 1114 1115 /*----------------------------------------------------------------*/ 1116 1117 static void prevent_background_work(struct cache *cache) 1118 { 1119 lockdep_off(); 1120 down_write(&cache->background_work_lock); 1121 lockdep_on(); 1122 } 1123 1124 static void allow_background_work(struct cache *cache) 1125 { 1126 lockdep_off(); 1127 up_write(&cache->background_work_lock); 1128 lockdep_on(); 1129 } 1130 1131 static bool background_work_begin(struct cache *cache) 1132 { 1133 bool r; 1134 1135 lockdep_off(); 1136 r = down_read_trylock(&cache->background_work_lock); 1137 lockdep_on(); 1138 1139 return r; 1140 } 1141 1142 static void background_work_end(struct cache *cache) 1143 { 1144 lockdep_off(); 1145 up_read(&cache->background_work_lock); 1146 lockdep_on(); 1147 } 1148 1149 /*----------------------------------------------------------------*/ 1150 1151 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1152 { 1153 return (bio_data_dir(bio) == WRITE) && 1154 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1155 } 1156 1157 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1158 { 1159 return writeback_mode(cache) && 1160 
(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1161 } 1162 1163 static void quiesce(struct dm_cache_migration *mg, 1164 void (*continuation)(struct work_struct *)) 1165 { 1166 init_continuation(&mg->k, continuation); 1167 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1168 } 1169 1170 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1171 { 1172 struct continuation *k = container_of(ws, struct continuation, ws); 1173 return container_of(k, struct dm_cache_migration, k); 1174 } 1175 1176 static void copy_complete(int read_err, unsigned long write_err, void *context) 1177 { 1178 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1179 1180 if (read_err || write_err) 1181 mg->k.input = BLK_STS_IOERR; 1182 1183 queue_continuation(mg->cache->wq, &mg->k); 1184 } 1185 1186 static void copy(struct dm_cache_migration *mg, bool promote) 1187 { 1188 struct dm_io_region o_region, c_region; 1189 struct cache *cache = mg->cache; 1190 1191 o_region.bdev = cache->origin_dev->bdev; 1192 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1193 o_region.count = cache->sectors_per_block; 1194 1195 c_region.bdev = cache->cache_dev->bdev; 1196 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1197 c_region.count = cache->sectors_per_block; 1198 1199 if (promote) 1200 dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1201 else 1202 dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1203 } 1204 1205 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1206 { 1207 struct per_bio_data *pb = get_per_bio_data(bio); 1208 1209 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1210 free_prison_cell(cache, pb->cell); 1211 pb->cell = NULL; 1212 } 1213 1214 static void overwrite_endio(struct bio *bio) 1215 { 1216 struct dm_cache_migration *mg = bio->bi_private; 1217 struct cache *cache = mg->cache; 1218 struct per_bio_data *pb = get_per_bio_data(bio); 1219 1220 dm_unhook_bio(&pb->hook_info, bio); 1221 1222 if (bio->bi_status) 1223 mg->k.input = bio->bi_status; 1224 1225 queue_continuation(cache->wq, &mg->k); 1226 } 1227 1228 static void overwrite(struct dm_cache_migration *mg, 1229 void (*continuation)(struct work_struct *)) 1230 { 1231 struct bio *bio = mg->overwrite_bio; 1232 struct per_bio_data *pb = get_per_bio_data(bio); 1233 1234 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1235 1236 /* 1237 * The overwrite bio is part of the copy operation, as such it does 1238 * not set/clear discard or dirty flags. 
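	 * mg_complete() does that bookkeeping once the migration as a
	 * whole has succeeded or failed.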
1239 */ 1240 if (mg->op->op == POLICY_PROMOTE) 1241 remap_to_cache(mg->cache, bio, mg->op->cblock); 1242 else 1243 remap_to_origin(mg->cache, bio); 1244 1245 init_continuation(&mg->k, continuation); 1246 accounted_request(mg->cache, bio); 1247 } 1248 1249 /* 1250 * Migration steps: 1251 * 1252 * 1) exclusive lock preventing WRITEs 1253 * 2) quiesce 1254 * 3) copy or issue overwrite bio 1255 * 4) upgrade to exclusive lock preventing READs and WRITEs 1256 * 5) quiesce 1257 * 6) update metadata and commit 1258 * 7) unlock 1259 */ 1260 static void mg_complete(struct dm_cache_migration *mg, bool success) 1261 { 1262 struct bio_list bios; 1263 struct cache *cache = mg->cache; 1264 struct policy_work *op = mg->op; 1265 dm_cblock_t cblock = op->cblock; 1266 1267 if (success) 1268 update_stats(&cache->stats, op->op); 1269 1270 switch (op->op) { 1271 case POLICY_PROMOTE: 1272 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1273 policy_complete_background_work(cache->policy, op, success); 1274 1275 if (mg->overwrite_bio) { 1276 if (success) 1277 force_set_dirty(cache, cblock); 1278 else if (mg->k.input) 1279 mg->overwrite_bio->bi_status = mg->k.input; 1280 else 1281 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1282 bio_endio(mg->overwrite_bio); 1283 } else { 1284 if (success) 1285 force_clear_dirty(cache, cblock); 1286 dec_io_migrations(cache); 1287 } 1288 break; 1289 1290 case POLICY_DEMOTE: 1291 /* 1292 * We clear dirty here to update the nr_dirty counter. 1293 */ 1294 if (success) 1295 force_clear_dirty(cache, cblock); 1296 policy_complete_background_work(cache->policy, op, success); 1297 dec_io_migrations(cache); 1298 break; 1299 1300 case POLICY_WRITEBACK: 1301 if (success) 1302 force_clear_dirty(cache, cblock); 1303 policy_complete_background_work(cache->policy, op, success); 1304 dec_io_migrations(cache); 1305 break; 1306 } 1307 1308 bio_list_init(&bios); 1309 if (mg->cell) { 1310 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1311 free_prison_cell(cache, mg->cell); 1312 } 1313 1314 free_migration(mg); 1315 defer_bios(cache, &bios); 1316 wake_migration_worker(cache); 1317 1318 background_work_end(cache); 1319 } 1320 1321 static void mg_success(struct work_struct *ws) 1322 { 1323 struct dm_cache_migration *mg = ws_to_mg(ws); 1324 mg_complete(mg, mg->k.input == 0); 1325 } 1326 1327 static void mg_update_metadata(struct work_struct *ws) 1328 { 1329 int r; 1330 struct dm_cache_migration *mg = ws_to_mg(ws); 1331 struct cache *cache = mg->cache; 1332 struct policy_work *op = mg->op; 1333 1334 switch (op->op) { 1335 case POLICY_PROMOTE: 1336 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1337 if (r) { 1338 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1339 cache_device_name(cache)); 1340 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1341 1342 mg_complete(mg, false); 1343 return; 1344 } 1345 mg_complete(mg, true); 1346 break; 1347 1348 case POLICY_DEMOTE: 1349 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1350 if (r) { 1351 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1352 cache_device_name(cache)); 1353 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1354 1355 mg_complete(mg, false); 1356 return; 1357 } 1358 1359 /* 1360 * It would be nice if we only had to commit when a REQ_FLUSH 1361 * comes through. 
But there's one scenario that we have to
		 * look out for:
		 *
		 * - oblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * rollback to having the data for oblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_full_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	bool is_policy_promote = (op->op == POLICY_PROMOTE);

	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
	    is_discarded_oblock(cache, op->oblock)) {
		mg_upgrade_lock(ws);
		return;
	}

	init_continuation(&mg->k, mg_upgrade_lock);
	copy(mg, is_policy_promote);
}

static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable.  So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fall back to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
			mg->overwrite_bio = NULL;
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
1475 */ 1476 overwrite(mg, mg_update_metadata_after_copy); 1477 1478 } else 1479 mg_full_copy(ws); 1480 } 1481 1482 static int mg_lock_writes(struct dm_cache_migration *mg) 1483 { 1484 int r; 1485 struct dm_cell_key_v2 key; 1486 struct cache *cache = mg->cache; 1487 struct dm_bio_prison_cell_v2 *prealloc; 1488 1489 prealloc = alloc_prison_cell(cache); 1490 1491 /* 1492 * Prevent writes to the block, but allow reads to continue. 1493 * Unless we're using an overwrite bio, in which case we lock 1494 * everything. 1495 */ 1496 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1497 r = dm_cell_lock_v2(cache->prison, &key, 1498 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1499 prealloc, &mg->cell); 1500 if (r < 0) { 1501 free_prison_cell(cache, prealloc); 1502 mg_complete(mg, false); 1503 return r; 1504 } 1505 1506 if (mg->cell != prealloc) 1507 free_prison_cell(cache, prealloc); 1508 1509 if (r == 0) 1510 mg_copy(&mg->k.ws); 1511 else 1512 quiesce(mg, mg_copy); 1513 1514 return 0; 1515 } 1516 1517 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1518 { 1519 struct dm_cache_migration *mg; 1520 1521 if (!background_work_begin(cache)) { 1522 policy_complete_background_work(cache->policy, op, false); 1523 return -EPERM; 1524 } 1525 1526 mg = alloc_migration(cache); 1527 1528 mg->op = op; 1529 mg->overwrite_bio = bio; 1530 1531 if (!bio) 1532 inc_io_migrations(cache); 1533 1534 return mg_lock_writes(mg); 1535 } 1536 1537 /*---------------------------------------------------------------- 1538 * invalidation processing 1539 *--------------------------------------------------------------*/ 1540 1541 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1542 { 1543 struct bio_list bios; 1544 struct cache *cache = mg->cache; 1545 1546 bio_list_init(&bios); 1547 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1548 free_prison_cell(cache, mg->cell); 1549 1550 if (!success && mg->overwrite_bio) 1551 bio_io_error(mg->overwrite_bio); 1552 1553 free_migration(mg); 1554 defer_bios(cache, &bios); 1555 1556 background_work_end(cache); 1557 } 1558 1559 static void invalidate_completed(struct work_struct *ws) 1560 { 1561 struct dm_cache_migration *mg = ws_to_mg(ws); 1562 invalidate_complete(mg, !mg->k.input); 1563 } 1564 1565 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1566 { 1567 int r = policy_invalidate_mapping(cache->policy, cblock); 1568 if (!r) { 1569 r = dm_cache_remove_mapping(cache->cmd, cblock); 1570 if (r) { 1571 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1572 cache_device_name(cache)); 1573 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1574 } 1575 1576 } else if (r == -ENODATA) { 1577 /* 1578 * Harmless, already unmapped. 
1579 */ 1580 r = 0; 1581 1582 } else 1583 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1584 1585 return r; 1586 } 1587 1588 static void invalidate_remove(struct work_struct *ws) 1589 { 1590 int r; 1591 struct dm_cache_migration *mg = ws_to_mg(ws); 1592 struct cache *cache = mg->cache; 1593 1594 r = invalidate_cblock(cache, mg->invalidate_cblock); 1595 if (r) { 1596 invalidate_complete(mg, false); 1597 return; 1598 } 1599 1600 init_continuation(&mg->k, invalidate_completed); 1601 continue_after_commit(&cache->committer, &mg->k); 1602 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1603 mg->overwrite_bio = NULL; 1604 schedule_commit(&cache->committer); 1605 } 1606 1607 static int invalidate_lock(struct dm_cache_migration *mg) 1608 { 1609 int r; 1610 struct dm_cell_key_v2 key; 1611 struct cache *cache = mg->cache; 1612 struct dm_bio_prison_cell_v2 *prealloc; 1613 1614 prealloc = alloc_prison_cell(cache); 1615 1616 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1617 r = dm_cell_lock_v2(cache->prison, &key, 1618 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1619 if (r < 0) { 1620 free_prison_cell(cache, prealloc); 1621 invalidate_complete(mg, false); 1622 return r; 1623 } 1624 1625 if (mg->cell != prealloc) 1626 free_prison_cell(cache, prealloc); 1627 1628 if (r) 1629 quiesce(mg, invalidate_remove); 1630 1631 else { 1632 /* 1633 * We can't call invalidate_remove() directly here because we 1634 * might still be in request context. 1635 */ 1636 init_continuation(&mg->k, invalidate_remove); 1637 queue_work(cache->wq, &mg->k.ws); 1638 } 1639 1640 return 0; 1641 } 1642 1643 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1644 dm_oblock_t oblock, struct bio *bio) 1645 { 1646 struct dm_cache_migration *mg; 1647 1648 if (!background_work_begin(cache)) 1649 return -EPERM; 1650 1651 mg = alloc_migration(cache); 1652 1653 mg->overwrite_bio = bio; 1654 mg->invalidate_cblock = cblock; 1655 mg->invalidate_oblock = oblock; 1656 1657 return invalidate_lock(mg); 1658 } 1659 1660 /*---------------------------------------------------------------- 1661 * bio processing 1662 *--------------------------------------------------------------*/ 1663 1664 enum busy { 1665 IDLE, 1666 BUSY 1667 }; 1668 1669 static enum busy spare_migration_bandwidth(struct cache *cache) 1670 { 1671 bool idle = iot_idle_for(&cache->tracker, HZ); 1672 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1673 cache->sectors_per_block; 1674 1675 if (idle && current_volume <= cache->migration_threshold) 1676 return IDLE; 1677 else 1678 return BUSY; 1679 } 1680 1681 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1682 { 1683 atomic_inc(bio_data_dir(bio) == READ ? 1684 &cache->stats.read_hit : &cache->stats.write_hit); 1685 } 1686 1687 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1688 { 1689 atomic_inc(bio_data_dir(bio) == READ ? 1690 &cache->stats.read_miss : &cache->stats.write_miss); 1691 } 1692 1693 /*----------------------------------------------------------------*/ 1694 1695 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1696 bool *commit_needed) 1697 { 1698 int r, data_dir; 1699 bool rb, background_queued; 1700 dm_cblock_t cblock; 1701 1702 *commit_needed = false; 1703 1704 rb = bio_detain_shared(cache, block, bio); 1705 if (!rb) { 1706 /* 1707 * An exclusive lock is held for this block, so we have to 1708 * wait. 
We set the commit_needed flag so the current 1709 * transaction will be committed asap, allowing this lock 1710 * to be dropped. 1711 */ 1712 *commit_needed = true; 1713 return DM_MAPIO_SUBMITTED; 1714 } 1715 1716 data_dir = bio_data_dir(bio); 1717 1718 if (optimisable_bio(cache, bio, block)) { 1719 struct policy_work *op = NULL; 1720 1721 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1722 if (unlikely(r && r != -ENOENT)) { 1723 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1724 cache_device_name(cache), r); 1725 bio_io_error(bio); 1726 return DM_MAPIO_SUBMITTED; 1727 } 1728 1729 if (r == -ENOENT && op) { 1730 bio_drop_shared_lock(cache, bio); 1731 BUG_ON(op->op != POLICY_PROMOTE); 1732 mg_start(cache, op, bio); 1733 return DM_MAPIO_SUBMITTED; 1734 } 1735 } else { 1736 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1737 if (unlikely(r && r != -ENOENT)) { 1738 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1739 cache_device_name(cache), r); 1740 bio_io_error(bio); 1741 return DM_MAPIO_SUBMITTED; 1742 } 1743 1744 if (background_queued) 1745 wake_migration_worker(cache); 1746 } 1747 1748 if (r == -ENOENT) { 1749 struct per_bio_data *pb = get_per_bio_data(bio); 1750 1751 /* 1752 * Miss. 1753 */ 1754 inc_miss_counter(cache, bio); 1755 if (pb->req_nr == 0) { 1756 accounted_begin(cache, bio); 1757 remap_to_origin_clear_discard(cache, bio, block); 1758 } else { 1759 /* 1760 * This is a duplicate writethrough io that is no 1761 * longer needed because the block has been demoted. 1762 */ 1763 bio_endio(bio); 1764 return DM_MAPIO_SUBMITTED; 1765 } 1766 } else { 1767 /* 1768 * Hit. 1769 */ 1770 inc_hit_counter(cache, bio); 1771 1772 /* 1773 * Passthrough always maps to the origin, invalidating any 1774 * cache blocks that are written to. 1775 */ 1776 if (passthrough_mode(cache)) { 1777 if (bio_data_dir(bio) == WRITE) { 1778 bio_drop_shared_lock(cache, bio); 1779 atomic_inc(&cache->stats.demotion); 1780 invalidate_start(cache, cblock, block, bio); 1781 } else 1782 remap_to_origin_clear_discard(cache, bio, block); 1783 } else { 1784 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1785 !is_dirty(cache, cblock)) { 1786 remap_to_origin_and_cache(cache, bio, block, cblock); 1787 accounted_begin(cache, bio); 1788 } else 1789 remap_to_cache_dirty(cache, bio, block, cblock); 1790 } 1791 } 1792 1793 /* 1794 * dm core turns FUA requests into a separate payload and FLUSH req. 1795 */ 1796 if (bio->bi_opf & REQ_FUA) { 1797 /* 1798 * issue_after_commit will call accounted_begin a second time. So 1799 * we call accounted_complete() to avoid double accounting. 1800 */ 1801 accounted_complete(cache, bio); 1802 issue_after_commit(&cache->committer, bio); 1803 *commit_needed = true; 1804 return DM_MAPIO_SUBMITTED; 1805 } 1806 1807 return DM_MAPIO_REMAPPED; 1808 } 1809 1810 static bool process_bio(struct cache *cache, struct bio *bio) 1811 { 1812 bool commit_needed; 1813 1814 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1815 generic_make_request(bio); 1816 1817 return commit_needed; 1818 } 1819 1820 /* 1821 * A non-zero return indicates read_only or fail_io mode. 
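 *
 * commit_op() below turns that errno into a blk_status_t for the batcher,
 * so bios queued behind the commit see the failure in their bi_status.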
 */
static int commit(struct cache *cache, bool clean_shutdown)
{
	int r;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return -EINVAL;

	atomic_inc(&cache->stats.commit_count);
	r = dm_cache_commit(cache->cmd, clean_shutdown);
	if (r)
		metadata_operation_failed(cache, "dm_cache_commit", r);

	return r;
}

/*
 * Used by the batcher.
 */
static blk_status_t commit_op(void *context)
{
	struct cache *cache = context;

	if (dm_cache_changed_this_transaction(cache->cmd))
		return errno_to_blk_status(commit(cache, false));

	return 0;
}

/*----------------------------------------------------------------*/

static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue_after_commit(&cache->committer, bio);
	return true;
}

static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_dblock_t b, e;

	// FIXME: do we need to lock the region?  Or can we just assume the
	// user won't be so foolish as to issue discard concurrently with
	// other IO?
	calc_discard_block_range(cache, bio, &b, &e);
	while (b != e) {
		set_discard(cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	if (cache->features.discard_passdown) {
		remap_to_origin(cache, bio);
		generic_make_request(bio);
	} else
		bio_endio(bio);

	return false;
}

static void process_deferred_bios(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);

	unsigned long flags;
	bool commit_needed = false;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while ((bio = bio_list_pop(&bios))) {
		if (bio->bi_opf & REQ_PREFLUSH)
			commit_needed = process_flush_bio(cache, bio) || commit_needed;

		else if (bio_op(bio) == REQ_OP_DISCARD)
			commit_needed = process_discard_bio(cache, bio) || commit_needed;

		else
			commit_needed = process_bio(cache, bio) || commit_needed;
	}

	if (commit_needed)
		schedule_commit(&cache->committer);
}

/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/

static void requeue_deferred_bios(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);

	while ((bio = bio_list_pop(&bios))) {
		bio->bi_status = BLK_STS_DM_REQUEUE;
		bio_endio(bio);
	}
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
1941 */ 1942 static void do_waker(struct work_struct *ws) 1943 { 1944 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1945 1946 policy_tick(cache->policy, true); 1947 wake_migration_worker(cache); 1948 schedule_commit(&cache->committer); 1949 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1950 } 1951 1952 static void check_migrations(struct work_struct *ws) 1953 { 1954 int r; 1955 struct policy_work *op; 1956 struct cache *cache = container_of(ws, struct cache, migration_worker); 1957 enum busy b; 1958 1959 for (;;) { 1960 b = spare_migration_bandwidth(cache); 1961 1962 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1963 if (r == -ENODATA) 1964 break; 1965 1966 if (r) { 1967 DMERR_LIMIT("%s: policy_background_work failed", 1968 cache_device_name(cache)); 1969 break; 1970 } 1971 1972 r = mg_start(cache, op, NULL); 1973 if (r) 1974 break; 1975 } 1976 } 1977 1978 /*---------------------------------------------------------------- 1979 * Target methods 1980 *--------------------------------------------------------------*/ 1981 1982 /* 1983 * This function gets called on the error paths of the constructor, so we 1984 * have to cope with a partially initialised struct. 1985 */ 1986 static void destroy(struct cache *cache) 1987 { 1988 unsigned i; 1989 1990 mempool_exit(&cache->migration_pool); 1991 1992 if (cache->prison) 1993 dm_bio_prison_destroy_v2(cache->prison); 1994 1995 if (cache->wq) 1996 destroy_workqueue(cache->wq); 1997 1998 if (cache->dirty_bitset) 1999 free_bitset(cache->dirty_bitset); 2000 2001 if (cache->discard_bitset) 2002 free_bitset(cache->discard_bitset); 2003 2004 if (cache->copier) 2005 dm_kcopyd_client_destroy(cache->copier); 2006 2007 if (cache->cmd) 2008 dm_cache_metadata_close(cache->cmd); 2009 2010 if (cache->metadata_dev) 2011 dm_put_device(cache->ti, cache->metadata_dev); 2012 2013 if (cache->origin_dev) 2014 dm_put_device(cache->ti, cache->origin_dev); 2015 2016 if (cache->cache_dev) 2017 dm_put_device(cache->ti, cache->cache_dev); 2018 2019 if (cache->policy) 2020 dm_cache_policy_destroy(cache->policy); 2021 2022 for (i = 0; i < cache->nr_ctr_args ; i++) 2023 kfree(cache->ctr_args[i]); 2024 kfree(cache->ctr_args); 2025 2026 bioset_exit(&cache->bs); 2027 2028 kfree(cache); 2029 } 2030 2031 static void cache_dtr(struct dm_target *ti) 2032 { 2033 struct cache *cache = ti->private; 2034 2035 destroy(cache); 2036 } 2037 2038 static sector_t get_dev_size(struct dm_dev *dev) 2039 { 2040 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2041 } 2042 2043 /*----------------------------------------------------------------*/ 2044 2045 /* 2046 * Construct a cache device mapping. 2047 * 2048 * cache <metadata dev> <cache dev> <origin dev> <block size> 2049 * <#feature args> [<feature arg>]* 2050 * <policy> <#policy args> [<policy arg>]* 2051 * 2052 * metadata dev : fast device holding the persistent metadata 2053 * cache dev : fast device holding cached data blocks 2054 * origin dev : slow device holding original data blocks 2055 * block size : cache unit size in sectors 2056 * 2057 * #feature args : number of feature arguments passed 2058 * feature args : writethrough. (The default is writeback.) 2059 * 2060 * policy : the replacement policy to use 2061 * #policy args : an even number of policy arguments corresponding 2062 * to key/value pairs passed to the policy 2063 * policy args : key/value pairs passed to the policy 2064 * E.g. 'sequential_threshold 1024' 2065 * See cache-policies.txt for details. 
2066 * 2067 * Optional feature arguments are: 2068 * writethrough : write through caching that prohibits cache block 2069 * content from being different from origin block content. 2070 * Without this argument, the default behaviour is to write 2071 * back cache block contents later for performance reasons, 2072 * so they may differ from the corresponding origin blocks. 2073 */ 2074 struct cache_args { 2075 struct dm_target *ti; 2076 2077 struct dm_dev *metadata_dev; 2078 2079 struct dm_dev *cache_dev; 2080 sector_t cache_sectors; 2081 2082 struct dm_dev *origin_dev; 2083 sector_t origin_sectors; 2084 2085 uint32_t block_size; 2086 2087 const char *policy_name; 2088 int policy_argc; 2089 const char **policy_argv; 2090 2091 struct cache_features features; 2092 }; 2093 2094 static void destroy_cache_args(struct cache_args *ca) 2095 { 2096 if (ca->metadata_dev) 2097 dm_put_device(ca->ti, ca->metadata_dev); 2098 2099 if (ca->cache_dev) 2100 dm_put_device(ca->ti, ca->cache_dev); 2101 2102 if (ca->origin_dev) 2103 dm_put_device(ca->ti, ca->origin_dev); 2104 2105 kfree(ca); 2106 } 2107 2108 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2109 { 2110 if (!as->argc) { 2111 *error = "Insufficient args"; 2112 return false; 2113 } 2114 2115 return true; 2116 } 2117 2118 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2119 char **error) 2120 { 2121 int r; 2122 sector_t metadata_dev_size; 2123 char b[BDEVNAME_SIZE]; 2124 2125 if (!at_least_one_arg(as, error)) 2126 return -EINVAL; 2127 2128 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2129 &ca->metadata_dev); 2130 if (r) { 2131 *error = "Error opening metadata device"; 2132 return r; 2133 } 2134 2135 metadata_dev_size = get_dev_size(ca->metadata_dev); 2136 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2137 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2138 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2139 2140 return 0; 2141 } 2142 2143 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2144 char **error) 2145 { 2146 int r; 2147 2148 if (!at_least_one_arg(as, error)) 2149 return -EINVAL; 2150 2151 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2152 &ca->cache_dev); 2153 if (r) { 2154 *error = "Error opening cache device"; 2155 return r; 2156 } 2157 ca->cache_sectors = get_dev_size(ca->cache_dev); 2158 2159 return 0; 2160 } 2161 2162 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2163 char **error) 2164 { 2165 int r; 2166 2167 if (!at_least_one_arg(as, error)) 2168 return -EINVAL; 2169 2170 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2171 &ca->origin_dev); 2172 if (r) { 2173 *error = "Error opening origin device"; 2174 return r; 2175 } 2176 2177 ca->origin_sectors = get_dev_size(ca->origin_dev); 2178 if (ca->ti->len > ca->origin_sectors) { 2179 *error = "Device size larger than cached device"; 2180 return -EINVAL; 2181 } 2182 2183 return 0; 2184 } 2185 2186 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2187 char **error) 2188 { 2189 unsigned long block_size; 2190 2191 if (!at_least_one_arg(as, error)) 2192 return -EINVAL; 2193 2194 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2195 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2196 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2197 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2198 *error = "Invalid data 
block size"; 2199 return -EINVAL; 2200 } 2201 2202 if (block_size > ca->cache_sectors) { 2203 *error = "Data block size is larger than the cache device"; 2204 return -EINVAL; 2205 } 2206 2207 ca->block_size = block_size; 2208 2209 return 0; 2210 } 2211 2212 static void init_features(struct cache_features *cf) 2213 { 2214 cf->mode = CM_WRITE; 2215 cf->io_mode = CM_IO_WRITEBACK; 2216 cf->metadata_version = 1; 2217 cf->discard_passdown = true; 2218 } 2219 2220 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2221 char **error) 2222 { 2223 static const struct dm_arg _args[] = { 2224 {0, 3, "Invalid number of cache feature arguments"}, 2225 }; 2226 2227 int r, mode_ctr = 0; 2228 unsigned argc; 2229 const char *arg; 2230 struct cache_features *cf = &ca->features; 2231 2232 init_features(cf); 2233 2234 r = dm_read_arg_group(_args, as, &argc, error); 2235 if (r) 2236 return -EINVAL; 2237 2238 while (argc--) { 2239 arg = dm_shift_arg(as); 2240 2241 if (!strcasecmp(arg, "writeback")) { 2242 cf->io_mode = CM_IO_WRITEBACK; 2243 mode_ctr++; 2244 } 2245 2246 else if (!strcasecmp(arg, "writethrough")) { 2247 cf->io_mode = CM_IO_WRITETHROUGH; 2248 mode_ctr++; 2249 } 2250 2251 else if (!strcasecmp(arg, "passthrough")) { 2252 cf->io_mode = CM_IO_PASSTHROUGH; 2253 mode_ctr++; 2254 } 2255 2256 else if (!strcasecmp(arg, "metadata2")) 2257 cf->metadata_version = 2; 2258 2259 else if (!strcasecmp(arg, "no_discard_passdown")) 2260 cf->discard_passdown = false; 2261 2262 else { 2263 *error = "Unrecognised cache feature requested"; 2264 return -EINVAL; 2265 } 2266 } 2267 2268 if (mode_ctr > 1) { 2269 *error = "Duplicate cache io_mode features requested"; 2270 return -EINVAL; 2271 } 2272 2273 return 0; 2274 } 2275 2276 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2277 char **error) 2278 { 2279 static const struct dm_arg _args[] = { 2280 {0, 1024, "Invalid number of policy arguments"}, 2281 }; 2282 2283 int r; 2284 2285 if (!at_least_one_arg(as, error)) 2286 return -EINVAL; 2287 2288 ca->policy_name = dm_shift_arg(as); 2289 2290 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2291 if (r) 2292 return -EINVAL; 2293 2294 ca->policy_argv = (const char **)as->argv; 2295 dm_consume_args(as, ca->policy_argc); 2296 2297 return 0; 2298 } 2299 2300 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2301 char **error) 2302 { 2303 int r; 2304 struct dm_arg_set as; 2305 2306 as.argc = argc; 2307 as.argv = argv; 2308 2309 r = parse_metadata_dev(ca, &as, error); 2310 if (r) 2311 return r; 2312 2313 r = parse_cache_dev(ca, &as, error); 2314 if (r) 2315 return r; 2316 2317 r = parse_origin_dev(ca, &as, error); 2318 if (r) 2319 return r; 2320 2321 r = parse_block_size(ca, &as, error); 2322 if (r) 2323 return r; 2324 2325 r = parse_features(ca, &as, error); 2326 if (r) 2327 return r; 2328 2329 r = parse_policy(ca, &as, error); 2330 if (r) 2331 return r; 2332 2333 return 0; 2334 } 2335 2336 /*----------------------------------------------------------------*/ 2337 2338 static struct kmem_cache *migration_cache; 2339 2340 #define NOT_CORE_OPTION 1 2341 2342 static int process_config_option(struct cache *cache, const char *key, const char *value) 2343 { 2344 unsigned long tmp; 2345 2346 if (!strcasecmp(key, "migration_threshold")) { 2347 if (kstrtoul(value, 10, &tmp)) 2348 return -EINVAL; 2349 2350 cache->migration_threshold = tmp; 2351 return 0; 2352 } 2353 2354 return NOT_CORE_OPTION; 2355 } 2356 2357 static int set_config_value(struct cache *cache, const 
char *key, const char *value) 2358 { 2359 int r = process_config_option(cache, key, value); 2360 2361 if (r == NOT_CORE_OPTION) 2362 r = policy_set_config_value(cache->policy, key, value); 2363 2364 if (r) 2365 DMWARN("bad config value for %s: %s", key, value); 2366 2367 return r; 2368 } 2369 2370 static int set_config_values(struct cache *cache, int argc, const char **argv) 2371 { 2372 int r = 0; 2373 2374 if (argc & 1) { 2375 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2376 return -EINVAL; 2377 } 2378 2379 while (argc) { 2380 r = set_config_value(cache, argv[0], argv[1]); 2381 if (r) 2382 break; 2383 2384 argc -= 2; 2385 argv += 2; 2386 } 2387 2388 return r; 2389 } 2390 2391 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2392 char **error) 2393 { 2394 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2395 cache->cache_size, 2396 cache->origin_sectors, 2397 cache->sectors_per_block); 2398 if (IS_ERR(p)) { 2399 *error = "Error creating cache's policy"; 2400 return PTR_ERR(p); 2401 } 2402 cache->policy = p; 2403 BUG_ON(!cache->policy); 2404 2405 return 0; 2406 } 2407 2408 /* 2409 * We want the discard block size to be at least the size of the cache 2410 * block size and have no more than 2^14 discard blocks across the origin. 2411 */ 2412 #define MAX_DISCARD_BLOCKS (1 << 14) 2413 2414 static bool too_many_discard_blocks(sector_t discard_block_size, 2415 sector_t origin_size) 2416 { 2417 (void) sector_div(origin_size, discard_block_size); 2418 2419 return origin_size > MAX_DISCARD_BLOCKS; 2420 } 2421 2422 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2423 sector_t origin_size) 2424 { 2425 sector_t discard_block_size = cache_block_size; 2426 2427 if (origin_size) 2428 while (too_many_discard_blocks(discard_block_size, origin_size)) 2429 discard_block_size *= 2; 2430 2431 return discard_block_size; 2432 } 2433 2434 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2435 { 2436 dm_block_t nr_blocks = from_cblock(size); 2437 2438 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2439 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2440 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2441 "Please consider increasing the cache block size to reduce the overall cache block count.", 2442 (unsigned long long) nr_blocks); 2443 2444 cache->cache_size = size; 2445 } 2446 2447 static int is_congested(struct dm_dev *dev, int bdi_bits) 2448 { 2449 struct request_queue *q = bdev_get_queue(dev->bdev); 2450 return bdi_congested(q->backing_dev_info, bdi_bits); 2451 } 2452 2453 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2454 { 2455 struct cache *cache = container_of(cb, struct cache, callbacks); 2456 2457 return is_congested(cache->origin_dev, bdi_bits) || 2458 is_congested(cache->cache_dev, bdi_bits); 2459 } 2460 2461 #define DEFAULT_MIGRATION_THRESHOLD 2048 2462 2463 static int cache_create(struct cache_args *ca, struct cache **result) 2464 { 2465 int r = 0; 2466 char **error = &ca->ti->error; 2467 struct cache *cache; 2468 struct dm_target *ti = ca->ti; 2469 dm_block_t origin_blocks; 2470 struct dm_cache_metadata *cmd; 2471 bool may_format = ca->features.mode == CM_WRITE; 2472 2473 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2474 if (!cache) 2475 return -ENOMEM; 2476 2477 cache->ti = ca->ti; 2478 ti->private = cache; 2479 ti->num_flush_bios = 2; 2480 
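	/*
	 * Flushes are duplicated: the clone with per_bio_data req_nr 0 is
	 * remapped to the origin and the other to the cache device (see
	 * process_flush_bio() above).
	 */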
ti->flush_supported = true; 2481 2482 ti->num_discard_bios = 1; 2483 ti->discards_supported = true; 2484 2485 ti->per_io_data_size = sizeof(struct per_bio_data); 2486 2487 cache->features = ca->features; 2488 if (writethrough_mode(cache)) { 2489 /* Create bioset for writethrough bios issued to origin */ 2490 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2491 if (r) 2492 goto bad; 2493 } 2494 2495 cache->callbacks.congested_fn = cache_is_congested; 2496 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2497 2498 cache->metadata_dev = ca->metadata_dev; 2499 cache->origin_dev = ca->origin_dev; 2500 cache->cache_dev = ca->cache_dev; 2501 2502 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2503 2504 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2505 origin_blocks = block_div(origin_blocks, ca->block_size); 2506 cache->origin_blocks = to_oblock(origin_blocks); 2507 2508 cache->sectors_per_block = ca->block_size; 2509 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2510 r = -EINVAL; 2511 goto bad; 2512 } 2513 2514 if (ca->block_size & (ca->block_size - 1)) { 2515 dm_block_t cache_size = ca->cache_sectors; 2516 2517 cache->sectors_per_block_shift = -1; 2518 cache_size = block_div(cache_size, ca->block_size); 2519 set_cache_size(cache, to_cblock(cache_size)); 2520 } else { 2521 cache->sectors_per_block_shift = __ffs(ca->block_size); 2522 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2523 } 2524 2525 r = create_cache_policy(cache, ca, error); 2526 if (r) 2527 goto bad; 2528 2529 cache->policy_nr_args = ca->policy_argc; 2530 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2531 2532 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2533 if (r) { 2534 *error = "Error setting cache policy's config values"; 2535 goto bad; 2536 } 2537 2538 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2539 ca->block_size, may_format, 2540 dm_cache_policy_get_hint_size(cache->policy), 2541 ca->features.metadata_version); 2542 if (IS_ERR(cmd)) { 2543 *error = "Error creating metadata object"; 2544 r = PTR_ERR(cmd); 2545 goto bad; 2546 } 2547 cache->cmd = cmd; 2548 set_cache_mode(cache, CM_WRITE); 2549 if (get_cache_mode(cache) != CM_WRITE) { 2550 *error = "Unable to get write access to metadata, please check/repair metadata."; 2551 r = -EINVAL; 2552 goto bad; 2553 } 2554 2555 if (passthrough_mode(cache)) { 2556 bool all_clean; 2557 2558 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2559 if (r) { 2560 *error = "dm_cache_metadata_all_clean() failed"; 2561 goto bad; 2562 } 2563 2564 if (!all_clean) { 2565 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2566 r = -EINVAL; 2567 goto bad; 2568 } 2569 2570 policy_allow_migrations(cache->policy, false); 2571 } 2572 2573 spin_lock_init(&cache->lock); 2574 bio_list_init(&cache->deferred_bios); 2575 atomic_set(&cache->nr_allocated_migrations, 0); 2576 atomic_set(&cache->nr_io_migrations, 0); 2577 init_waitqueue_head(&cache->migration_wait); 2578 2579 r = -ENOMEM; 2580 atomic_set(&cache->nr_dirty, 0); 2581 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2582 if (!cache->dirty_bitset) { 2583 *error = "could not allocate dirty bitset"; 2584 goto bad; 2585 } 2586 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2587 2588 cache->discard_block_size = 2589 calculate_discard_block_size(cache->sectors_per_block, 2590 cache->origin_sectors); 2591 cache->discard_nr_blocks = 
to_dblock(dm_sector_div_up(cache->origin_sectors, 2592 cache->discard_block_size)); 2593 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2594 if (!cache->discard_bitset) { 2595 *error = "could not allocate discard bitset"; 2596 goto bad; 2597 } 2598 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2599 2600 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2601 if (IS_ERR(cache->copier)) { 2602 *error = "could not create kcopyd client"; 2603 r = PTR_ERR(cache->copier); 2604 goto bad; 2605 } 2606 2607 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2608 if (!cache->wq) { 2609 *error = "could not create workqueue for metadata object"; 2610 goto bad; 2611 } 2612 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2613 INIT_WORK(&cache->migration_worker, check_migrations); 2614 INIT_DELAYED_WORK(&cache->waker, do_waker); 2615 2616 cache->prison = dm_bio_prison_create_v2(cache->wq); 2617 if (!cache->prison) { 2618 *error = "could not create bio prison"; 2619 goto bad; 2620 } 2621 2622 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2623 migration_cache); 2624 if (r) { 2625 *error = "Error creating cache's migration mempool"; 2626 goto bad; 2627 } 2628 2629 cache->need_tick_bio = true; 2630 cache->sized = false; 2631 cache->invalidate = false; 2632 cache->commit_requested = false; 2633 cache->loaded_mappings = false; 2634 cache->loaded_discards = false; 2635 2636 load_stats(cache); 2637 2638 atomic_set(&cache->stats.demotion, 0); 2639 atomic_set(&cache->stats.promotion, 0); 2640 atomic_set(&cache->stats.copies_avoided, 0); 2641 atomic_set(&cache->stats.cache_cell_clash, 0); 2642 atomic_set(&cache->stats.commit_count, 0); 2643 atomic_set(&cache->stats.discard_count, 0); 2644 2645 spin_lock_init(&cache->invalidation_lock); 2646 INIT_LIST_HEAD(&cache->invalidation_requests); 2647 2648 batcher_init(&cache->committer, commit_op, cache, 2649 issue_op, cache, cache->wq); 2650 iot_init(&cache->tracker); 2651 2652 init_rwsem(&cache->background_work_lock); 2653 prevent_background_work(cache); 2654 2655 *result = cache; 2656 return 0; 2657 bad: 2658 destroy(cache); 2659 return r; 2660 } 2661 2662 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2663 { 2664 unsigned i; 2665 const char **copy; 2666 2667 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2668 if (!copy) 2669 return -ENOMEM; 2670 for (i = 0; i < argc; i++) { 2671 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2672 if (!copy[i]) { 2673 while (i--) 2674 kfree(copy[i]); 2675 kfree(copy); 2676 return -ENOMEM; 2677 } 2678 } 2679 2680 cache->nr_ctr_args = argc; 2681 cache->ctr_args = copy; 2682 2683 return 0; 2684 } 2685 2686 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2687 { 2688 int r = -EINVAL; 2689 struct cache_args *ca; 2690 struct cache *cache = NULL; 2691 2692 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2693 if (!ca) { 2694 ti->error = "Error allocating memory for cache"; 2695 return -ENOMEM; 2696 } 2697 ca->ti = ti; 2698 2699 r = parse_cache_args(ca, argc, argv, &ti->error); 2700 if (r) 2701 goto out; 2702 2703 r = cache_create(ca, &cache); 2704 if (r) 2705 goto out; 2706 2707 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2708 if (r) { 2709 destroy(cache); 2710 goto out; 2711 } 2712 2713 ti->private = cache; 2714 out: 2715 destroy_cache_args(ca); 2716 return r; 2717 } 2718 2719 /*----------------------------------------------------------------*/ 2720 2721 static int 
cache_map(struct dm_target *ti, struct bio *bio) 2722 { 2723 struct cache *cache = ti->private; 2724 2725 int r; 2726 bool commit_needed; 2727 dm_oblock_t block = get_bio_block(cache, bio); 2728 2729 init_per_bio_data(bio); 2730 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2731 /* 2732 * This can only occur if the io goes to a partial block at 2733 * the end of the origin device. We don't cache these. 2734 * Just remap to the origin and carry on. 2735 */ 2736 remap_to_origin(cache, bio); 2737 accounted_begin(cache, bio); 2738 return DM_MAPIO_REMAPPED; 2739 } 2740 2741 if (discard_or_flush(bio)) { 2742 defer_bio(cache, bio); 2743 return DM_MAPIO_SUBMITTED; 2744 } 2745 2746 r = map_bio(cache, bio, block, &commit_needed); 2747 if (commit_needed) 2748 schedule_commit(&cache->committer); 2749 2750 return r; 2751 } 2752 2753 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2754 { 2755 struct cache *cache = ti->private; 2756 unsigned long flags; 2757 struct per_bio_data *pb = get_per_bio_data(bio); 2758 2759 if (pb->tick) { 2760 policy_tick(cache->policy, false); 2761 2762 spin_lock_irqsave(&cache->lock, flags); 2763 cache->need_tick_bio = true; 2764 spin_unlock_irqrestore(&cache->lock, flags); 2765 } 2766 2767 bio_drop_shared_lock(cache, bio); 2768 accounted_complete(cache, bio); 2769 2770 return DM_ENDIO_DONE; 2771 } 2772 2773 static int write_dirty_bitset(struct cache *cache) 2774 { 2775 int r; 2776 2777 if (get_cache_mode(cache) >= CM_READ_ONLY) 2778 return -EINVAL; 2779 2780 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2781 if (r) 2782 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2783 2784 return r; 2785 } 2786 2787 static int write_discard_bitset(struct cache *cache) 2788 { 2789 unsigned i, r; 2790 2791 if (get_cache_mode(cache) >= CM_READ_ONLY) 2792 return -EINVAL; 2793 2794 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2795 cache->discard_nr_blocks); 2796 if (r) { 2797 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2798 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2799 return r; 2800 } 2801 2802 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2803 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2804 is_discarded(cache, to_dblock(i))); 2805 if (r) { 2806 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2807 return r; 2808 } 2809 } 2810 2811 return 0; 2812 } 2813 2814 static int write_hints(struct cache *cache) 2815 { 2816 int r; 2817 2818 if (get_cache_mode(cache) >= CM_READ_ONLY) 2819 return -EINVAL; 2820 2821 r = dm_cache_write_hints(cache->cmd, cache->policy); 2822 if (r) { 2823 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2824 return r; 2825 } 2826 2827 return 0; 2828 } 2829 2830 /* 2831 * returns true on success 2832 */ 2833 static bool sync_metadata(struct cache *cache) 2834 { 2835 int r1, r2, r3, r4; 2836 2837 r1 = write_dirty_bitset(cache); 2838 if (r1) 2839 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2840 2841 r2 = write_discard_bitset(cache); 2842 if (r2) 2843 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2844 2845 save_stats(cache); 2846 2847 r3 = write_hints(cache); 2848 if (r3) 2849 DMERR("%s: could not write hints", cache_device_name(cache)); 2850 2851 /* 2852 * If writing the above metadata failed, we still commit, but don't 2853 * set the clean shutdown flag. 
This will effectively force every 2854 * dirty bit to be set on reload. 2855 */ 2856 r4 = commit(cache, !r1 && !r2 && !r3); 2857 if (r4) 2858 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2859 2860 return !r1 && !r2 && !r3 && !r4; 2861 } 2862 2863 static void cache_postsuspend(struct dm_target *ti) 2864 { 2865 struct cache *cache = ti->private; 2866 2867 prevent_background_work(cache); 2868 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2869 2870 cancel_delayed_work(&cache->waker); 2871 flush_workqueue(cache->wq); 2872 WARN_ON(cache->tracker.in_flight); 2873 2874 /* 2875 * If it's a flush suspend there won't be any deferred bios, so this 2876 * call is harmless. 2877 */ 2878 requeue_deferred_bios(cache); 2879 2880 if (get_cache_mode(cache) == CM_WRITE) 2881 (void) sync_metadata(cache); 2882 } 2883 2884 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2885 bool dirty, uint32_t hint, bool hint_valid) 2886 { 2887 int r; 2888 struct cache *cache = context; 2889 2890 if (dirty) { 2891 set_bit(from_cblock(cblock), cache->dirty_bitset); 2892 atomic_inc(&cache->nr_dirty); 2893 } else 2894 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2895 2896 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2897 if (r) 2898 return r; 2899 2900 return 0; 2901 } 2902 2903 /* 2904 * The discard block size in the on disk metadata is not 2905 * necessarily the same as the one we're currently using. So we have to 2906 * be careful to only set the discarded attribute if we know it 2907 * covers a complete block of the new size. 2908 */ 2909 struct discard_load_info { 2910 struct cache *cache; 2911 2912 /* 2913 * These blocks are sized using the on disk dblock size, rather 2914 * than the current one. 2915 */ 2916 dm_block_t block_size; 2917 dm_block_t discard_begin, discard_end; 2918 }; 2919 2920 static void discard_load_info_init(struct cache *cache, 2921 struct discard_load_info *li) 2922 { 2923 li->cache = cache; 2924 li->discard_begin = li->discard_end = 0; 2925 } 2926 2927 static void set_discard_range(struct discard_load_info *li) 2928 { 2929 sector_t b, e; 2930 2931 if (li->discard_begin == li->discard_end) 2932 return; 2933 2934 /* 2935 * Convert to sectors. 2936 */ 2937 b = li->discard_begin * li->block_size; 2938 e = li->discard_end * li->block_size; 2939 2940 /* 2941 * Then convert back to the current dblock size. 2942 */ 2943 b = dm_sector_div_up(b, li->cache->discard_block_size); 2944 sector_div(e, li->cache->discard_block_size); 2945 2946 /* 2947 * The origin may have shrunk, so we need to check we're still in 2948 * bounds. 2949 */ 2950 if (e > from_dblock(li->cache->discard_nr_blocks)) 2951 e = from_dblock(li->cache->discard_nr_blocks); 2952 2953 for (; b < e; b++) 2954 set_discard(li->cache, to_dblock(b)); 2955 } 2956 2957 static int load_discard(void *context, sector_t discard_block_size, 2958 dm_dblock_t dblock, bool discard) 2959 { 2960 struct discard_load_info *li = context; 2961 2962 li->block_size = discard_block_size; 2963 2964 if (discard) { 2965 if (from_dblock(dblock) == li->discard_end) 2966 /* 2967 * We're already in a discard range, just extend it. 2968 */ 2969 li->discard_end = li->discard_end + 1ULL; 2970 2971 else { 2972 /* 2973 * Emit the old range and start a new one.
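				 * (e.g. discards covering dblocks 4, 5 and 6
				 * followed by one for dblock 9 emit the range
				 * [4, 7) here and start a fresh range at 9.)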
2974 */ 2975 set_discard_range(li); 2976 li->discard_begin = from_dblock(dblock); 2977 li->discard_end = li->discard_begin + 1ULL; 2978 } 2979 } else { 2980 set_discard_range(li); 2981 li->discard_begin = li->discard_end = 0; 2982 } 2983 2984 return 0; 2985 } 2986 2987 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2988 { 2989 sector_t size = get_dev_size(cache->cache_dev); 2990 (void) sector_div(size, cache->sectors_per_block); 2991 return to_cblock(size); 2992 } 2993 2994 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2995 { 2996 if (from_cblock(new_size) > from_cblock(cache->cache_size)) { 2997 if (cache->sized) { 2998 DMERR("%s: unable to extend cache due to missing cache table reload", 2999 cache_device_name(cache)); 3000 return false; 3001 } 3002 } 3003 3004 /* 3005 * We can't drop a dirty block when shrinking the cache. 3006 */ 3007 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3008 new_size = to_cblock(from_cblock(new_size) + 1); 3009 if (is_dirty(cache, new_size)) { 3010 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3011 cache_device_name(cache), 3012 (unsigned long long) from_cblock(new_size)); 3013 return false; 3014 } 3015 } 3016 3017 return true; 3018 } 3019 3020 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3021 { 3022 int r; 3023 3024 r = dm_cache_resize(cache->cmd, new_size); 3025 if (r) { 3026 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3027 metadata_operation_failed(cache, "dm_cache_resize", r); 3028 return r; 3029 } 3030 3031 set_cache_size(cache, new_size); 3032 3033 return 0; 3034 } 3035 3036 static int cache_preresume(struct dm_target *ti) 3037 { 3038 int r = 0; 3039 struct cache *cache = ti->private; 3040 dm_cblock_t csize = get_cache_dev_size(cache); 3041 3042 /* 3043 * Check to see if the cache has resized. 3044 */ 3045 if (!cache->sized) { 3046 r = resize_cache_dev(cache, csize); 3047 if (r) 3048 return r; 3049 3050 cache->sized = true; 3051 3052 } else if (csize != cache->cache_size) { 3053 if (!can_resize(cache, csize)) 3054 return -EINVAL; 3055 3056 r = resize_cache_dev(cache, csize); 3057 if (r) 3058 return r; 3059 } 3060 3061 if (!cache->loaded_mappings) { 3062 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3063 load_mapping, cache); 3064 if (r) { 3065 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3066 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3067 return r; 3068 } 3069 3070 cache->loaded_mappings = true; 3071 } 3072 3073 if (!cache->loaded_discards) { 3074 struct discard_load_info li; 3075 3076 /* 3077 * The discard bitset could have been resized, or the 3078 * discard block size changed. To be safe we start by 3079 * setting every dblock to not discarded. 
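		 * set_discard_range() then re-expresses each range loaded
		 * below in the current discard block size before marking it.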
3080 */ 3081 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3082 3083 discard_load_info_init(cache, &li); 3084 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3085 if (r) { 3086 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3087 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3088 return r; 3089 } 3090 set_discard_range(&li); 3091 3092 cache->loaded_discards = true; 3093 } 3094 3095 return r; 3096 } 3097 3098 static void cache_resume(struct dm_target *ti) 3099 { 3100 struct cache *cache = ti->private; 3101 3102 cache->need_tick_bio = true; 3103 allow_background_work(cache); 3104 do_waker(&cache->waker.work); 3105 } 3106 3107 static void emit_flags(struct cache *cache, char *result, 3108 unsigned maxlen, ssize_t *sz_ptr) 3109 { 3110 ssize_t sz = *sz_ptr; 3111 struct cache_features *cf = &cache->features; 3112 unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1; 3113 3114 DMEMIT("%u ", count); 3115 3116 if (cf->metadata_version == 2) 3117 DMEMIT("metadata2 "); 3118 3119 if (writethrough_mode(cache)) 3120 DMEMIT("writethrough "); 3121 3122 else if (passthrough_mode(cache)) 3123 DMEMIT("passthrough "); 3124 3125 else if (writeback_mode(cache)) 3126 DMEMIT("writeback "); 3127 3128 else { 3129 DMEMIT("unknown "); 3130 DMERR("%s: internal error: unknown io mode: %d", 3131 cache_device_name(cache), (int) cf->io_mode); 3132 } 3133 3134 if (!cf->discard_passdown) 3135 DMEMIT("no_discard_passdown "); 3136 3137 *sz_ptr = sz; 3138 } 3139 3140 /* 3141 * Status format: 3142 * 3143 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3144 * <cache block size> <#used cache blocks>/<#total cache blocks> 3145 * <#read hits> <#read misses> <#write hits> <#write misses> 3146 * <#demotions> <#promotions> <#dirty> 3147 * <#features> <features>* 3148 * <#core args> <core args> 3149 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3150 */ 3151 static void cache_status(struct dm_target *ti, status_type_t type, 3152 unsigned status_flags, char *result, unsigned maxlen) 3153 { 3154 int r = 0; 3155 unsigned i; 3156 ssize_t sz = 0; 3157 dm_block_t nr_free_blocks_metadata = 0; 3158 dm_block_t nr_blocks_metadata = 0; 3159 char buf[BDEVNAME_SIZE]; 3160 struct cache *cache = ti->private; 3161 dm_cblock_t residency; 3162 bool needs_check; 3163 3164 switch (type) { 3165 case STATUSTYPE_INFO: 3166 if (get_cache_mode(cache) == CM_FAIL) { 3167 DMEMIT("Fail"); 3168 break; 3169 } 3170 3171 /* Commit to ensure statistics aren't out-of-date */ 3172 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3173 (void) commit(cache, false); 3174 3175 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3176 if (r) { 3177 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3178 cache_device_name(cache), r); 3179 goto err; 3180 } 3181 3182 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3183 if (r) { 3184 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3185 cache_device_name(cache), r); 3186 goto err; 3187 } 3188 3189 residency = policy_residency(cache->policy); 3190 3191 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3192 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3193 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3194 (unsigned long long)nr_blocks_metadata, 3195 (unsigned long long)cache->sectors_per_block, 3196 (unsigned long long) from_cblock(residency), 3197 (unsigned 
long long) from_cblock(cache->cache_size), 3198 (unsigned) atomic_read(&cache->stats.read_hit), 3199 (unsigned) atomic_read(&cache->stats.read_miss), 3200 (unsigned) atomic_read(&cache->stats.write_hit), 3201 (unsigned) atomic_read(&cache->stats.write_miss), 3202 (unsigned) atomic_read(&cache->stats.demotion), 3203 (unsigned) atomic_read(&cache->stats.promotion), 3204 (unsigned long) atomic_read(&cache->nr_dirty)); 3205 3206 emit_flags(cache, result, maxlen, &sz); 3207 3208 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3209 3210 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3211 if (sz < maxlen) { 3212 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3213 if (r) 3214 DMERR("%s: policy_emit_config_values returned %d", 3215 cache_device_name(cache), r); 3216 } 3217 3218 if (get_cache_mode(cache) == CM_READ_ONLY) 3219 DMEMIT("ro "); 3220 else 3221 DMEMIT("rw "); 3222 3223 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3224 3225 if (r || needs_check) 3226 DMEMIT("needs_check "); 3227 else 3228 DMEMIT("- "); 3229 3230 break; 3231 3232 case STATUSTYPE_TABLE: 3233 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3234 DMEMIT("%s ", buf); 3235 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3236 DMEMIT("%s ", buf); 3237 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3238 DMEMIT("%s", buf); 3239 3240 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3241 DMEMIT(" %s", cache->ctr_args[i]); 3242 if (cache->nr_ctr_args) 3243 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3244 } 3245 3246 return; 3247 3248 err: 3249 DMEMIT("Error"); 3250 } 3251 3252 /* 3253 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3254 * the one-past-the-end value. 3255 */ 3256 struct cblock_range { 3257 dm_cblock_t begin; 3258 dm_cblock_t end; 3259 }; 3260 3261 /* 3262 * A cache block range can take two forms: 3263 * 3264 * i) A single cblock, eg. '3456' 3265 * ii) A begin and end cblock with a dash between, eg. 123-234 3266 */ 3267 static int parse_cblock_range(struct cache *cache, const char *str, 3268 struct cblock_range *result) 3269 { 3270 char dummy; 3271 uint64_t b, e; 3272 int r; 3273 3274 /* 3275 * Try and parse form (ii) first. 3276 */ 3277 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3278 if (r < 0) 3279 return r; 3280 3281 if (r == 2) { 3282 result->begin = to_cblock(b); 3283 result->end = to_cblock(e); 3284 return 0; 3285 } 3286 3287 /* 3288 * That didn't work, try form (i). 
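	 * (The trailing %c in both scanf formats is there to reject trailing
	 * characters: a clean match returns exactly 2 above, or exactly 1
	 * here; anything else ends up at the "invalid cblock range" error.)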
3289 */ 3290 r = sscanf(str, "%llu%c", &b, &dummy); 3291 if (r < 0) 3292 return r; 3293 3294 if (r == 1) { 3295 result->begin = to_cblock(b); 3296 result->end = to_cblock(from_cblock(result->begin) + 1u); 3297 return 0; 3298 } 3299 3300 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3301 return -EINVAL; 3302 } 3303 3304 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3305 { 3306 uint64_t b = from_cblock(range->begin); 3307 uint64_t e = from_cblock(range->end); 3308 uint64_t n = from_cblock(cache->cache_size); 3309 3310 if (b >= n) { 3311 DMERR("%s: begin cblock out of range: %llu >= %llu", 3312 cache_device_name(cache), b, n); 3313 return -EINVAL; 3314 } 3315 3316 if (e > n) { 3317 DMERR("%s: end cblock out of range: %llu > %llu", 3318 cache_device_name(cache), e, n); 3319 return -EINVAL; 3320 } 3321 3322 if (b >= e) { 3323 DMERR("%s: invalid cblock range: %llu >= %llu", 3324 cache_device_name(cache), b, e); 3325 return -EINVAL; 3326 } 3327 3328 return 0; 3329 } 3330 3331 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3332 { 3333 return to_cblock(from_cblock(b) + 1); 3334 } 3335 3336 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3337 { 3338 int r = 0; 3339 3340 /* 3341 * We don't need to do any locking here because we know we're in 3342 * passthrough mode. There is potential for a race between an 3343 * invalidation triggered by an io and an invalidation message. This 3344 * is harmless; we needn't worry if the policy call fails. 3345 */ 3346 while (range->begin != range->end) { 3347 r = invalidate_cblock(cache, range->begin); 3348 if (r) 3349 return r; 3350 3351 range->begin = cblock_succ(range->begin); 3352 } 3353 3354 cache->commit_requested = true; 3355 return r; 3356 } 3357 3358 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3359 const char **cblock_ranges) 3360 { 3361 int r = 0; 3362 unsigned i; 3363 struct cblock_range range; 3364 3365 if (!passthrough_mode(cache)) { 3366 DMERR("%s: cache has to be in passthrough mode for invalidation", 3367 cache_device_name(cache)); 3368 return -EPERM; 3369 } 3370 3371 for (i = 0; i < count; i++) { 3372 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3373 if (r) 3374 break; 3375 3376 r = validate_cblock_range(cache, &range); 3377 if (r) 3378 break; 3379 3380 /* 3381 * Invalidate the cache blocks in this range, one at a time. 3382 */ 3383 r = request_invalidation(cache, &range); 3384 if (r) 3385 break; 3386 } 3387 3388 return r; 3389 } 3390 3391 /* 3392 * Supports 3393 * "<key> <value>" 3394 * and 3395 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*" 3396 * 3397 * The key migration_threshold is supported by the cache target core.
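 *
 * For example (device name purely illustrative):
 *
 *   dmsetup message my-cache 0 migration_threshold 2048
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 7890-8899
 *
 * invalidate_cblocks is only accepted while the cache is in passthrough
 * mode (see process_invalidate_cblocks_message()).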
3398 */ 3399 static int cache_message(struct dm_target *ti, unsigned argc, char **argv, 3400 char *result, unsigned maxlen) 3401 { 3402 struct cache *cache = ti->private; 3403 3404 if (!argc) 3405 return -EINVAL; 3406 3407 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3408 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3409 cache_device_name(cache)); 3410 return -EOPNOTSUPP; 3411 } 3412 3413 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3414 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3415 3416 if (argc != 2) 3417 return -EINVAL; 3418 3419 return set_config_value(cache, argv[0], argv[1]); 3420 } 3421 3422 static int cache_iterate_devices(struct dm_target *ti, 3423 iterate_devices_callout_fn fn, void *data) 3424 { 3425 int r = 0; 3426 struct cache *cache = ti->private; 3427 3428 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3429 if (!r) 3430 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3431 3432 return r; 3433 } 3434 3435 static bool origin_dev_supports_discard(struct block_device *origin_bdev) 3436 { 3437 struct request_queue *q = bdev_get_queue(origin_bdev); 3438 3439 return q && blk_queue_discard(q); 3440 } 3441 3442 /* 3443 * If discard_passdown was enabled verify that the origin device 3444 * supports discards. Disable discard_passdown if not. 3445 */ 3446 static void disable_passdown_if_not_supported(struct cache *cache) 3447 { 3448 struct block_device *origin_bdev = cache->origin_dev->bdev; 3449 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3450 const char *reason = NULL; 3451 char buf[BDEVNAME_SIZE]; 3452 3453 if (!cache->features.discard_passdown) 3454 return; 3455 3456 if (!origin_dev_supports_discard(origin_bdev)) 3457 reason = "discard unsupported"; 3458 3459 else if (origin_limits->max_discard_sectors < cache->sectors_per_block) 3460 reason = "max discard sectors smaller than a block"; 3461 3462 if (reason) { 3463 DMWARN("Origin device (%s) %s: Disabling discard passdown.", 3464 bdevname(origin_bdev, buf), reason); 3465 cache->features.discard_passdown = false; 3466 } 3467 } 3468 3469 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3470 { 3471 struct block_device *origin_bdev = cache->origin_dev->bdev; 3472 struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits; 3473 3474 if (!cache->features.discard_passdown) { 3475 /* No passdown is done so setting own virtual limits */ 3476 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3477 cache->origin_sectors); 3478 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3479 return; 3480 } 3481 3482 /* 3483 * cache_iterate_devices() is stacking both origin and fast device limits 3484 * but discards aren't passed to fast device, so inherit origin's limits. 
3485 */ 3486 limits->max_discard_sectors = origin_limits->max_discard_sectors; 3487 limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors; 3488 limits->discard_granularity = origin_limits->discard_granularity; 3489 limits->discard_alignment = origin_limits->discard_alignment; 3490 limits->discard_misaligned = origin_limits->discard_misaligned; 3491 } 3492 3493 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3494 { 3495 struct cache *cache = ti->private; 3496 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3497 3498 /* 3499 * If the system-determined stacked limits are compatible with the 3500 * cache's blocksize (io_opt is a factor) do not override them. 3501 */ 3502 if (io_opt_sectors < cache->sectors_per_block || 3503 do_div(io_opt_sectors, cache->sectors_per_block)) { 3504 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3505 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3506 } 3507 3508 disable_passdown_if_not_supported(cache); 3509 set_discard_limits(cache, limits); 3510 } 3511 3512 /*----------------------------------------------------------------*/ 3513 3514 static struct target_type cache_target = { 3515 .name = "cache", 3516 .version = {2, 1, 0}, 3517 .module = THIS_MODULE, 3518 .ctr = cache_ctr, 3519 .dtr = cache_dtr, 3520 .map = cache_map, 3521 .end_io = cache_end_io, 3522 .postsuspend = cache_postsuspend, 3523 .preresume = cache_preresume, 3524 .resume = cache_resume, 3525 .status = cache_status, 3526 .message = cache_message, 3527 .iterate_devices = cache_iterate_devices, 3528 .io_hints = cache_io_hints, 3529 }; 3530 3531 static int __init dm_cache_init(void) 3532 { 3533 int r; 3534 3535 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3536 if (!migration_cache) 3537 return -ENOMEM; 3538 3539 r = dm_register_target(&cache_target); 3540 if (r) { 3541 DMERR("cache target registration failed: %d", r); 3542 kmem_cache_destroy(migration_cache); 3543 return r; 3544 } 3545 3546 return 0; 3547 } 3548 3549 static void __exit dm_cache_exit(void) 3550 { 3551 dm_unregister_target(&cache_target); 3552 kmem_cache_destroy(migration_cache); 3553 } 3554 3555 module_init(dm_cache_init); 3556 module_exit(dm_cache_exit); 3557 3558 MODULE_DESCRIPTION(DM_NAME " cache target"); 3559 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3560 MODULE_LICENSE("GPL"); 3561