1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison-v2.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/rwsem.h> 19 #include <linux/slab.h> 20 #include <linux/vmalloc.h> 21 22 #define DM_MSG_PREFIX "cache" 23 24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27 /*----------------------------------------------------------------*/ 28 29 /* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40 /*----------------------------------------------------------------*/ 41 42 struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56 }; 57 58 static void iot_init(struct io_tracker *iot) 59 { 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64 } 65 66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67 { 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72 } 73 74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75 { 76 bool r; 77 unsigned long flags; 78 79 spin_lock_irqsave(&iot->lock, flags); 80 r = __iot_idle_for(iot, jifs); 81 spin_unlock_irqrestore(&iot->lock, flags); 82 83 return r; 84 } 85 86 static void iot_io_begin(struct io_tracker *iot, sector_t len) 87 { 88 unsigned long flags; 89 90 spin_lock_irqsave(&iot->lock, flags); 91 iot->in_flight += len; 92 spin_unlock_irqrestore(&iot->lock, flags); 93 } 94 95 static void __iot_io_end(struct io_tracker *iot, sector_t len) 96 { 97 if (!len) 98 return; 99 100 iot->in_flight -= len; 101 if (!iot->in_flight) 102 iot->idle_time = jiffies; 103 } 104 105 static void iot_io_end(struct io_tracker *iot, sector_t len) 106 { 107 unsigned long flags; 108 109 spin_lock_irqsave(&iot->lock, flags); 110 __iot_io_end(iot, len); 111 spin_unlock_irqrestore(&iot->lock, flags); 112 } 113 114 /*----------------------------------------------------------------*/ 115 116 /* 117 * Represents a chunk of future work. 'input' allows continuations to pass 118 * values between themselves, typically error values. 119 */ 120 struct continuation { 121 struct work_struct ws; 122 blk_status_t input; 123 }; 124 125 static inline void init_continuation(struct continuation *k, 126 void (*fn)(struct work_struct *)) 127 { 128 INIT_WORK(&k->ws, fn); 129 k->input = 0; 130 } 131 132 static inline void queue_continuation(struct workqueue_struct *wq, 133 struct continuation *k) 134 { 135 queue_work(wq, &k->ws); 136 } 137 138 /*----------------------------------------------------------------*/ 139 140 /* 141 * The batcher collects together pieces of work that need a particular 142 * operation to occur before they can proceed (typically a commit). 
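 *
 * (Descriptive note, grounded in the code below: work items are queued with
 * continue_after_commit() and bios with issue_after_commit(); once the
 * commit_op has run, __commit() requeues the held work items and either
 * issues or errors the held bios.)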
143 */ 144 struct batcher { 145 /* 146 * The operation that everyone is waiting for. 147 */ 148 blk_status_t (*commit_op)(void *context); 149 void *commit_context; 150 151 /* 152 * This is how bios should be issued once the commit op is complete 153 * (accounted_request). 154 */ 155 void (*issue_op)(struct bio *bio, void *context); 156 void *issue_context; 157 158 /* 159 * Queued work gets put on here after commit. 160 */ 161 struct workqueue_struct *wq; 162 163 spinlock_t lock; 164 struct list_head work_items; 165 struct bio_list bios; 166 struct work_struct commit_work; 167 168 bool commit_scheduled; 169 }; 170 171 static void __commit(struct work_struct *_ws) 172 { 173 struct batcher *b = container_of(_ws, struct batcher, commit_work); 174 blk_status_t r; 175 unsigned long flags; 176 struct list_head work_items; 177 struct work_struct *ws, *tmp; 178 struct continuation *k; 179 struct bio *bio; 180 struct bio_list bios; 181 182 INIT_LIST_HEAD(&work_items); 183 bio_list_init(&bios); 184 185 /* 186 * We have to grab these before the commit_op to avoid a race 187 * condition. 188 */ 189 spin_lock_irqsave(&b->lock, flags); 190 list_splice_init(&b->work_items, &work_items); 191 bio_list_merge(&bios, &b->bios); 192 bio_list_init(&b->bios); 193 b->commit_scheduled = false; 194 spin_unlock_irqrestore(&b->lock, flags); 195 196 r = b->commit_op(b->commit_context); 197 198 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 199 k = container_of(ws, struct continuation, ws); 200 k->input = r; 201 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 202 queue_work(b->wq, ws); 203 } 204 205 while ((bio = bio_list_pop(&bios))) { 206 if (r) { 207 bio->bi_status = r; 208 bio_endio(bio); 209 } else 210 b->issue_op(bio, b->issue_context); 211 } 212 } 213 214 static void batcher_init(struct batcher *b, 215 blk_status_t (*commit_op)(void *), 216 void *commit_context, 217 void (*issue_op)(struct bio *bio, void *), 218 void *issue_context, 219 struct workqueue_struct *wq) 220 { 221 b->commit_op = commit_op; 222 b->commit_context = commit_context; 223 b->issue_op = issue_op; 224 b->issue_context = issue_context; 225 b->wq = wq; 226 227 spin_lock_init(&b->lock); 228 INIT_LIST_HEAD(&b->work_items); 229 bio_list_init(&b->bios); 230 INIT_WORK(&b->commit_work, __commit); 231 b->commit_scheduled = false; 232 } 233 234 static void async_commit(struct batcher *b) 235 { 236 queue_work(b->wq, &b->commit_work); 237 } 238 239 static void continue_after_commit(struct batcher *b, struct continuation *k) 240 { 241 unsigned long flags; 242 bool commit_scheduled; 243 244 spin_lock_irqsave(&b->lock, flags); 245 commit_scheduled = b->commit_scheduled; 246 list_add_tail(&k->ws.entry, &b->work_items); 247 spin_unlock_irqrestore(&b->lock, flags); 248 249 if (commit_scheduled) 250 async_commit(b); 251 } 252 253 /* 254 * Bios are errored if commit failed. 255 */ 256 static void issue_after_commit(struct batcher *b, struct bio *bio) 257 { 258 unsigned long flags; 259 bool commit_scheduled; 260 261 spin_lock_irqsave(&b->lock, flags); 262 commit_scheduled = b->commit_scheduled; 263 bio_list_add(&b->bios, bio); 264 spin_unlock_irqrestore(&b->lock, flags); 265 266 if (commit_scheduled) 267 async_commit(b); 268 } 269 270 /* 271 * Call this if some urgent work is waiting for the commit to complete. 
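 *
 * A typical caller pairs this with continue_after_commit().  A minimal
 * sketch (illustrative only, not part of this driver; 'k' would live in
 * the caller's request state, much as struct dm_cache_migration embeds
 * its continuation):
 *
 *	static void my_after_commit(struct work_struct *ws)
 *	{
 *		struct continuation *k = container_of(ws, struct continuation, ws);
 *
 *		if (k->input)
 *			DMERR("commit failed");
 *	}
 *
 *	init_continuation(&k, my_after_commit);
 *	continue_after_commit(&cache->committer, &k);
 *	schedule_commit(&cache->committer);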
 */
static void schedule_commit(struct batcher *b)
{
	bool immediate;
	unsigned long flags;

	spin_lock_irqsave(&b->lock, flags);
	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
	b->commit_scheduled = true;
	spin_unlock_irqrestore(&b->lock, flags);

	if (immediate)
		async_commit(b);
}

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function. We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only. These blocks are marked
	 * dirty. If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin. Blocks are never
	 * dirty. Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots). Reads and writes always go to the
	 * origin. If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned metadata_version;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	spinlock_t lock;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	int sectors_per_block_shift;
	sector_t sectors_per_block;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices. Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices. Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
407 */ 408 dm_cblock_t cache_size; 409 410 /* 411 * Invalidation fields. 412 */ 413 spinlock_t invalidation_lock; 414 struct list_head invalidation_requests; 415 416 sector_t migration_threshold; 417 wait_queue_head_t migration_wait; 418 atomic_t nr_allocated_migrations; 419 420 /* 421 * The number of in flight migrations that are performing 422 * background io. eg, promotion, writeback. 423 */ 424 atomic_t nr_io_migrations; 425 426 struct bio_list deferred_bios; 427 428 struct rw_semaphore quiesce_lock; 429 430 struct dm_target_callbacks callbacks; 431 432 /* 433 * origin_blocks entries, discarded if set. 434 */ 435 dm_dblock_t discard_nr_blocks; 436 unsigned long *discard_bitset; 437 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 438 439 /* 440 * Rather than reconstructing the table line for the status we just 441 * save it and regurgitate. 442 */ 443 unsigned nr_ctr_args; 444 const char **ctr_args; 445 446 struct dm_kcopyd_client *copier; 447 struct work_struct deferred_bio_worker; 448 struct work_struct migration_worker; 449 struct workqueue_struct *wq; 450 struct delayed_work waker; 451 struct dm_bio_prison_v2 *prison; 452 453 /* 454 * cache_size entries, dirty if set 455 */ 456 unsigned long *dirty_bitset; 457 atomic_t nr_dirty; 458 459 unsigned policy_nr_args; 460 struct dm_cache_policy *policy; 461 462 /* 463 * Cache features such as write-through. 464 */ 465 struct cache_features features; 466 467 struct cache_stats stats; 468 469 bool need_tick_bio:1; 470 bool sized:1; 471 bool invalidate:1; 472 bool commit_requested:1; 473 bool loaded_mappings:1; 474 bool loaded_discards:1; 475 476 struct rw_semaphore background_work_lock; 477 478 struct batcher committer; 479 struct work_struct commit_ws; 480 481 struct io_tracker tracker; 482 483 mempool_t migration_pool; 484 485 struct bio_set bs; 486 }; 487 488 struct per_bio_data { 489 bool tick:1; 490 unsigned req_nr:2; 491 struct dm_bio_prison_cell_v2 *cell; 492 struct dm_hook_info hook_info; 493 sector_t len; 494 }; 495 496 struct dm_cache_migration { 497 struct continuation k; 498 struct cache *cache; 499 500 struct policy_work *op; 501 struct bio *overwrite_bio; 502 struct dm_bio_prison_cell_v2 *cell; 503 504 dm_cblock_t invalidate_cblock; 505 dm_oblock_t invalidate_oblock; 506 }; 507 508 /*----------------------------------------------------------------*/ 509 510 static bool writethrough_mode(struct cache *cache) 511 { 512 return cache->features.io_mode == CM_IO_WRITETHROUGH; 513 } 514 515 static bool writeback_mode(struct cache *cache) 516 { 517 return cache->features.io_mode == CM_IO_WRITEBACK; 518 } 519 520 static inline bool passthrough_mode(struct cache *cache) 521 { 522 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); 523 } 524 525 /*----------------------------------------------------------------*/ 526 527 static void wake_deferred_bio_worker(struct cache *cache) 528 { 529 queue_work(cache->wq, &cache->deferred_bio_worker); 530 } 531 532 static void wake_migration_worker(struct cache *cache) 533 { 534 if (passthrough_mode(cache)) 535 return; 536 537 queue_work(cache->wq, &cache->migration_worker); 538 } 539 540 /*----------------------------------------------------------------*/ 541 542 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 543 { 544 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); 545 } 546 547 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 548 { 549 dm_bio_prison_free_cell_v2(cache->prison, cell); 
550 } 551 552 static struct dm_cache_migration *alloc_migration(struct cache *cache) 553 { 554 struct dm_cache_migration *mg; 555 556 mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT); 557 if (!mg) 558 return NULL; 559 560 memset(mg, 0, sizeof(*mg)); 561 562 mg->cache = cache; 563 atomic_inc(&cache->nr_allocated_migrations); 564 565 return mg; 566 } 567 568 static void free_migration(struct dm_cache_migration *mg) 569 { 570 struct cache *cache = mg->cache; 571 572 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 573 wake_up(&cache->migration_wait); 574 575 mempool_free(mg, &cache->migration_pool); 576 } 577 578 /*----------------------------------------------------------------*/ 579 580 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 581 { 582 return to_oblock(from_oblock(b) + 1ull); 583 } 584 585 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 586 { 587 key->virtual = 0; 588 key->dev = 0; 589 key->block_begin = from_oblock(begin); 590 key->block_end = from_oblock(end); 591 } 592 593 /* 594 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 595 * level 1 which prevents *both* READs and WRITEs. 596 */ 597 #define WRITE_LOCK_LEVEL 0 598 #define READ_WRITE_LOCK_LEVEL 1 599 600 static unsigned lock_level(struct bio *bio) 601 { 602 return bio_data_dir(bio) == WRITE ? 603 WRITE_LOCK_LEVEL : 604 READ_WRITE_LOCK_LEVEL; 605 } 606 607 /*---------------------------------------------------------------- 608 * Per bio data 609 *--------------------------------------------------------------*/ 610 611 static struct per_bio_data *get_per_bio_data(struct bio *bio) 612 { 613 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 614 BUG_ON(!pb); 615 return pb; 616 } 617 618 static struct per_bio_data *init_per_bio_data(struct bio *bio) 619 { 620 struct per_bio_data *pb = get_per_bio_data(bio); 621 622 pb->tick = false; 623 pb->req_nr = dm_bio_get_target_bio_nr(bio); 624 pb->cell = NULL; 625 pb->len = 0; 626 627 return pb; 628 } 629 630 /*----------------------------------------------------------------*/ 631 632 static void defer_bio(struct cache *cache, struct bio *bio) 633 { 634 unsigned long flags; 635 636 spin_lock_irqsave(&cache->lock, flags); 637 bio_list_add(&cache->deferred_bios, bio); 638 spin_unlock_irqrestore(&cache->lock, flags); 639 640 wake_deferred_bio_worker(cache); 641 } 642 643 static void defer_bios(struct cache *cache, struct bio_list *bios) 644 { 645 unsigned long flags; 646 647 spin_lock_irqsave(&cache->lock, flags); 648 bio_list_merge(&cache->deferred_bios, bios); 649 bio_list_init(bios); 650 spin_unlock_irqrestore(&cache->lock, flags); 651 652 wake_deferred_bio_worker(cache); 653 } 654 655 /*----------------------------------------------------------------*/ 656 657 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 658 { 659 bool r; 660 struct per_bio_data *pb; 661 struct dm_cell_key_v2 key; 662 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 663 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 664 665 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 666 if (!cell_prealloc) { 667 defer_bio(cache, bio); 668 return false; 669 } 670 671 build_key(oblock, end, &key); 672 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 673 if (!r) { 674 /* 675 * Failed to get the lock. 
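		 * Another holder has this block exclusively locked and
		 * dm_cell_get_v2() has queued the bio on the cell, so it will
		 * be resubmitted when the holder unlocks (see the
		 * dm_cell_unlock_v2()/defer_bios() pairing in mg_complete()).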
676 */ 677 free_prison_cell(cache, cell_prealloc); 678 return r; 679 } 680 681 if (cell != cell_prealloc) 682 free_prison_cell(cache, cell_prealloc); 683 684 pb = get_per_bio_data(bio); 685 pb->cell = cell; 686 687 return r; 688 } 689 690 /*----------------------------------------------------------------*/ 691 692 static bool is_dirty(struct cache *cache, dm_cblock_t b) 693 { 694 return test_bit(from_cblock(b), cache->dirty_bitset); 695 } 696 697 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 698 { 699 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 700 atomic_inc(&cache->nr_dirty); 701 policy_set_dirty(cache->policy, cblock); 702 } 703 } 704 705 /* 706 * These two are called when setting after migrations to force the policy 707 * and dirty bitset to be in sync. 708 */ 709 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 710 { 711 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 712 atomic_inc(&cache->nr_dirty); 713 policy_set_dirty(cache->policy, cblock); 714 } 715 716 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 717 { 718 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 719 if (atomic_dec_return(&cache->nr_dirty) == 0) 720 dm_table_event(cache->ti->table); 721 } 722 723 policy_clear_dirty(cache->policy, cblock); 724 } 725 726 /*----------------------------------------------------------------*/ 727 728 static bool block_size_is_power_of_two(struct cache *cache) 729 { 730 return cache->sectors_per_block_shift >= 0; 731 } 732 733 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 734 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 735 __always_inline 736 #endif 737 static dm_block_t block_div(dm_block_t b, uint32_t n) 738 { 739 do_div(b, n); 740 741 return b; 742 } 743 744 static dm_block_t oblocks_per_dblock(struct cache *cache) 745 { 746 dm_block_t oblocks = cache->discard_block_size; 747 748 if (block_size_is_power_of_two(cache)) 749 oblocks >>= cache->sectors_per_block_shift; 750 else 751 oblocks = block_div(oblocks, cache->sectors_per_block); 752 753 return oblocks; 754 } 755 756 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 757 { 758 return to_dblock(block_div(from_oblock(oblock), 759 oblocks_per_dblock(cache))); 760 } 761 762 static void set_discard(struct cache *cache, dm_dblock_t b) 763 { 764 unsigned long flags; 765 766 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 767 atomic_inc(&cache->stats.discard_count); 768 769 spin_lock_irqsave(&cache->lock, flags); 770 set_bit(from_dblock(b), cache->discard_bitset); 771 spin_unlock_irqrestore(&cache->lock, flags); 772 } 773 774 static void clear_discard(struct cache *cache, dm_dblock_t b) 775 { 776 unsigned long flags; 777 778 spin_lock_irqsave(&cache->lock, flags); 779 clear_bit(from_dblock(b), cache->discard_bitset); 780 spin_unlock_irqrestore(&cache->lock, flags); 781 } 782 783 static bool is_discarded(struct cache *cache, dm_dblock_t b) 784 { 785 int r; 786 unsigned long flags; 787 788 spin_lock_irqsave(&cache->lock, flags); 789 r = test_bit(from_dblock(b), cache->discard_bitset); 790 spin_unlock_irqrestore(&cache->lock, flags); 791 792 return r; 793 } 794 795 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 796 { 797 int r; 798 unsigned long flags; 799 800 spin_lock_irqsave(&cache->lock, flags); 801 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 802 cache->discard_bitset); 803 
spin_unlock_irqrestore(&cache->lock, flags); 804 805 return r; 806 } 807 808 /*---------------------------------------------------------------- 809 * Remapping 810 *--------------------------------------------------------------*/ 811 static void remap_to_origin(struct cache *cache, struct bio *bio) 812 { 813 bio_set_dev(bio, cache->origin_dev->bdev); 814 } 815 816 static void remap_to_cache(struct cache *cache, struct bio *bio, 817 dm_cblock_t cblock) 818 { 819 sector_t bi_sector = bio->bi_iter.bi_sector; 820 sector_t block = from_cblock(cblock); 821 822 bio_set_dev(bio, cache->cache_dev->bdev); 823 if (!block_size_is_power_of_two(cache)) 824 bio->bi_iter.bi_sector = 825 (block * cache->sectors_per_block) + 826 sector_div(bi_sector, cache->sectors_per_block); 827 else 828 bio->bi_iter.bi_sector = 829 (block << cache->sectors_per_block_shift) | 830 (bi_sector & (cache->sectors_per_block - 1)); 831 } 832 833 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 834 { 835 unsigned long flags; 836 struct per_bio_data *pb; 837 838 spin_lock_irqsave(&cache->lock, flags); 839 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 840 bio_op(bio) != REQ_OP_DISCARD) { 841 pb = get_per_bio_data(bio); 842 pb->tick = true; 843 cache->need_tick_bio = false; 844 } 845 spin_unlock_irqrestore(&cache->lock, flags); 846 } 847 848 static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 849 dm_oblock_t oblock, bool bio_has_pbd) 850 { 851 if (bio_has_pbd) 852 check_if_tick_bio_needed(cache, bio); 853 remap_to_origin(cache, bio); 854 if (bio_data_dir(bio) == WRITE) 855 clear_discard(cache, oblock_to_dblock(cache, oblock)); 856 } 857 858 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 859 dm_oblock_t oblock) 860 { 861 // FIXME: check_if_tick_bio_needed() is called way too much through this interface 862 __remap_to_origin_clear_discard(cache, bio, oblock, true); 863 } 864 865 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 866 dm_oblock_t oblock, dm_cblock_t cblock) 867 { 868 check_if_tick_bio_needed(cache, bio); 869 remap_to_cache(cache, bio, cblock); 870 if (bio_data_dir(bio) == WRITE) { 871 set_dirty(cache, cblock); 872 clear_discard(cache, oblock_to_dblock(cache, oblock)); 873 } 874 } 875 876 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 877 { 878 sector_t block_nr = bio->bi_iter.bi_sector; 879 880 if (!block_size_is_power_of_two(cache)) 881 (void) sector_div(block_nr, cache->sectors_per_block); 882 else 883 block_nr >>= cache->sectors_per_block_shift; 884 885 return to_oblock(block_nr); 886 } 887 888 static bool accountable_bio(struct cache *cache, struct bio *bio) 889 { 890 return bio_op(bio) != REQ_OP_DISCARD; 891 } 892 893 static void accounted_begin(struct cache *cache, struct bio *bio) 894 { 895 struct per_bio_data *pb; 896 897 if (accountable_bio(cache, bio)) { 898 pb = get_per_bio_data(bio); 899 pb->len = bio_sectors(bio); 900 iot_io_begin(&cache->tracker, pb->len); 901 } 902 } 903 904 static void accounted_complete(struct cache *cache, struct bio *bio) 905 { 906 struct per_bio_data *pb = get_per_bio_data(bio); 907 908 iot_io_end(&cache->tracker, pb->len); 909 } 910 911 static void accounted_request(struct cache *cache, struct bio *bio) 912 { 913 accounted_begin(cache, bio); 914 generic_make_request(bio); 915 } 916 917 static void issue_op(struct bio *bio, void *context) 918 { 919 struct cache *cache = context; 920 accounted_request(cache, bio); 921 } 922 923 /* 924 * When 
running in writethrough mode we need to send writes to clean blocks 925 * to both the cache and origin devices. Clone the bio and send them in parallel. 926 */ 927 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, 928 dm_oblock_t oblock, dm_cblock_t cblock) 929 { 930 struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs); 931 932 BUG_ON(!origin_bio); 933 934 bio_chain(origin_bio, bio); 935 /* 936 * Passing false to __remap_to_origin_clear_discard() skips 937 * all code that might use per_bio_data (since clone doesn't have it) 938 */ 939 __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); 940 submit_bio(origin_bio); 941 942 remap_to_cache(cache, bio, cblock); 943 } 944 945 /*---------------------------------------------------------------- 946 * Failure modes 947 *--------------------------------------------------------------*/ 948 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 949 { 950 return cache->features.mode; 951 } 952 953 static const char *cache_device_name(struct cache *cache) 954 { 955 return dm_device_name(dm_table_get_md(cache->ti->table)); 956 } 957 958 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 959 { 960 const char *descs[] = { 961 "write", 962 "read-only", 963 "fail" 964 }; 965 966 dm_table_event(cache->ti->table); 967 DMINFO("%s: switching cache to %s mode", 968 cache_device_name(cache), descs[(int)mode]); 969 } 970 971 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 972 { 973 bool needs_check; 974 enum cache_metadata_mode old_mode = get_cache_mode(cache); 975 976 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 977 DMERR("%s: unable to read needs_check flag, setting failure mode.", 978 cache_device_name(cache)); 979 new_mode = CM_FAIL; 980 } 981 982 if (new_mode == CM_WRITE && needs_check) { 983 DMERR("%s: unable to switch cache to write mode until repaired.", 984 cache_device_name(cache)); 985 if (old_mode != new_mode) 986 new_mode = old_mode; 987 else 988 new_mode = CM_READ_ONLY; 989 } 990 991 /* Never move out of fail mode */ 992 if (old_mode == CM_FAIL) 993 new_mode = CM_FAIL; 994 995 switch (new_mode) { 996 case CM_FAIL: 997 case CM_READ_ONLY: 998 dm_cache_metadata_set_read_only(cache->cmd); 999 break; 1000 1001 case CM_WRITE: 1002 dm_cache_metadata_set_read_write(cache->cmd); 1003 break; 1004 } 1005 1006 cache->features.mode = new_mode; 1007 1008 if (new_mode != old_mode) 1009 notify_mode_switch(cache, new_mode); 1010 } 1011 1012 static void abort_transaction(struct cache *cache) 1013 { 1014 const char *dev_name = cache_device_name(cache); 1015 1016 if (get_cache_mode(cache) >= CM_READ_ONLY) 1017 return; 1018 1019 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1020 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1021 set_cache_mode(cache, CM_FAIL); 1022 } 1023 1024 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1025 if (dm_cache_metadata_abort(cache->cmd)) { 1026 DMERR("%s: failed to abort metadata transaction", dev_name); 1027 set_cache_mode(cache, CM_FAIL); 1028 } 1029 } 1030 1031 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1032 { 1033 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1034 cache_device_name(cache), op, r); 1035 abort_transaction(cache); 1036 set_cache_mode(cache, CM_READ_ONLY); 1037 } 1038 1039 /*----------------------------------------------------------------*/ 1040 1041 static void 
load_stats(struct cache *cache) 1042 { 1043 struct dm_cache_statistics stats; 1044 1045 dm_cache_metadata_get_stats(cache->cmd, &stats); 1046 atomic_set(&cache->stats.read_hit, stats.read_hits); 1047 atomic_set(&cache->stats.read_miss, stats.read_misses); 1048 atomic_set(&cache->stats.write_hit, stats.write_hits); 1049 atomic_set(&cache->stats.write_miss, stats.write_misses); 1050 } 1051 1052 static void save_stats(struct cache *cache) 1053 { 1054 struct dm_cache_statistics stats; 1055 1056 if (get_cache_mode(cache) >= CM_READ_ONLY) 1057 return; 1058 1059 stats.read_hits = atomic_read(&cache->stats.read_hit); 1060 stats.read_misses = atomic_read(&cache->stats.read_miss); 1061 stats.write_hits = atomic_read(&cache->stats.write_hit); 1062 stats.write_misses = atomic_read(&cache->stats.write_miss); 1063 1064 dm_cache_metadata_set_stats(cache->cmd, &stats); 1065 } 1066 1067 static void update_stats(struct cache_stats *stats, enum policy_operation op) 1068 { 1069 switch (op) { 1070 case POLICY_PROMOTE: 1071 atomic_inc(&stats->promotion); 1072 break; 1073 1074 case POLICY_DEMOTE: 1075 atomic_inc(&stats->demotion); 1076 break; 1077 1078 case POLICY_WRITEBACK: 1079 atomic_inc(&stats->writeback); 1080 break; 1081 } 1082 } 1083 1084 /*---------------------------------------------------------------- 1085 * Migration processing 1086 * 1087 * Migration covers moving data from the origin device to the cache, or 1088 * vice versa. 1089 *--------------------------------------------------------------*/ 1090 1091 static void inc_io_migrations(struct cache *cache) 1092 { 1093 atomic_inc(&cache->nr_io_migrations); 1094 } 1095 1096 static void dec_io_migrations(struct cache *cache) 1097 { 1098 atomic_dec(&cache->nr_io_migrations); 1099 } 1100 1101 static bool discard_or_flush(struct bio *bio) 1102 { 1103 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1104 } 1105 1106 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1107 dm_dblock_t *b, dm_dblock_t *e) 1108 { 1109 sector_t sb = bio->bi_iter.bi_sector; 1110 sector_t se = bio_end_sector(bio); 1111 1112 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1113 1114 if (se - sb < cache->discard_block_size) 1115 *e = *b; 1116 else 1117 *e = to_dblock(block_div(se, cache->discard_block_size)); 1118 } 1119 1120 /*----------------------------------------------------------------*/ 1121 1122 static void prevent_background_work(struct cache *cache) 1123 { 1124 lockdep_off(); 1125 down_write(&cache->background_work_lock); 1126 lockdep_on(); 1127 } 1128 1129 static void allow_background_work(struct cache *cache) 1130 { 1131 lockdep_off(); 1132 up_write(&cache->background_work_lock); 1133 lockdep_on(); 1134 } 1135 1136 static bool background_work_begin(struct cache *cache) 1137 { 1138 bool r; 1139 1140 lockdep_off(); 1141 r = down_read_trylock(&cache->background_work_lock); 1142 lockdep_on(); 1143 1144 return r; 1145 } 1146 1147 static void background_work_end(struct cache *cache) 1148 { 1149 lockdep_off(); 1150 up_read(&cache->background_work_lock); 1151 lockdep_on(); 1152 } 1153 1154 /*----------------------------------------------------------------*/ 1155 1156 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1157 { 1158 return (bio_data_dir(bio) == WRITE) && 1159 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1160 } 1161 1162 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1163 { 1164 return writeback_mode(cache) && 1165 
(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1166 } 1167 1168 static void quiesce(struct dm_cache_migration *mg, 1169 void (*continuation)(struct work_struct *)) 1170 { 1171 init_continuation(&mg->k, continuation); 1172 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1173 } 1174 1175 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) 1176 { 1177 struct continuation *k = container_of(ws, struct continuation, ws); 1178 return container_of(k, struct dm_cache_migration, k); 1179 } 1180 1181 static void copy_complete(int read_err, unsigned long write_err, void *context) 1182 { 1183 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); 1184 1185 if (read_err || write_err) 1186 mg->k.input = BLK_STS_IOERR; 1187 1188 queue_continuation(mg->cache->wq, &mg->k); 1189 } 1190 1191 static int copy(struct dm_cache_migration *mg, bool promote) 1192 { 1193 int r; 1194 struct dm_io_region o_region, c_region; 1195 struct cache *cache = mg->cache; 1196 1197 o_region.bdev = cache->origin_dev->bdev; 1198 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; 1199 o_region.count = cache->sectors_per_block; 1200 1201 c_region.bdev = cache->cache_dev->bdev; 1202 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; 1203 c_region.count = cache->sectors_per_block; 1204 1205 if (promote) 1206 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k); 1207 else 1208 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); 1209 1210 return r; 1211 } 1212 1213 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) 1214 { 1215 struct per_bio_data *pb = get_per_bio_data(bio); 1216 1217 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) 1218 free_prison_cell(cache, pb->cell); 1219 pb->cell = NULL; 1220 } 1221 1222 static void overwrite_endio(struct bio *bio) 1223 { 1224 struct dm_cache_migration *mg = bio->bi_private; 1225 struct cache *cache = mg->cache; 1226 struct per_bio_data *pb = get_per_bio_data(bio); 1227 1228 dm_unhook_bio(&pb->hook_info, bio); 1229 1230 if (bio->bi_status) 1231 mg->k.input = bio->bi_status; 1232 1233 queue_continuation(cache->wq, &mg->k); 1234 } 1235 1236 static void overwrite(struct dm_cache_migration *mg, 1237 void (*continuation)(struct work_struct *)) 1238 { 1239 struct bio *bio = mg->overwrite_bio; 1240 struct per_bio_data *pb = get_per_bio_data(bio); 1241 1242 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1243 1244 /* 1245 * The overwrite bio is part of the copy operation, as such it does 1246 * not set/clear discard or dirty flags. 
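	 * Those flags are brought back into sync by mg_complete() once the
	 * migration finishes, via clear_discard() and
	 * force_set_dirty()/force_clear_dirty().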
1247 */ 1248 if (mg->op->op == POLICY_PROMOTE) 1249 remap_to_cache(mg->cache, bio, mg->op->cblock); 1250 else 1251 remap_to_origin(mg->cache, bio); 1252 1253 init_continuation(&mg->k, continuation); 1254 accounted_request(mg->cache, bio); 1255 } 1256 1257 /* 1258 * Migration steps: 1259 * 1260 * 1) exclusive lock preventing WRITEs 1261 * 2) quiesce 1262 * 3) copy or issue overwrite bio 1263 * 4) upgrade to exclusive lock preventing READs and WRITEs 1264 * 5) quiesce 1265 * 6) update metadata and commit 1266 * 7) unlock 1267 */ 1268 static void mg_complete(struct dm_cache_migration *mg, bool success) 1269 { 1270 struct bio_list bios; 1271 struct cache *cache = mg->cache; 1272 struct policy_work *op = mg->op; 1273 dm_cblock_t cblock = op->cblock; 1274 1275 if (success) 1276 update_stats(&cache->stats, op->op); 1277 1278 switch (op->op) { 1279 case POLICY_PROMOTE: 1280 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1281 policy_complete_background_work(cache->policy, op, success); 1282 1283 if (mg->overwrite_bio) { 1284 if (success) 1285 force_set_dirty(cache, cblock); 1286 else if (mg->k.input) 1287 mg->overwrite_bio->bi_status = mg->k.input; 1288 else 1289 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1290 bio_endio(mg->overwrite_bio); 1291 } else { 1292 if (success) 1293 force_clear_dirty(cache, cblock); 1294 dec_io_migrations(cache); 1295 } 1296 break; 1297 1298 case POLICY_DEMOTE: 1299 /* 1300 * We clear dirty here to update the nr_dirty counter. 1301 */ 1302 if (success) 1303 force_clear_dirty(cache, cblock); 1304 policy_complete_background_work(cache->policy, op, success); 1305 dec_io_migrations(cache); 1306 break; 1307 1308 case POLICY_WRITEBACK: 1309 if (success) 1310 force_clear_dirty(cache, cblock); 1311 policy_complete_background_work(cache->policy, op, success); 1312 dec_io_migrations(cache); 1313 break; 1314 } 1315 1316 bio_list_init(&bios); 1317 if (mg->cell) { 1318 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1319 free_prison_cell(cache, mg->cell); 1320 } 1321 1322 free_migration(mg); 1323 defer_bios(cache, &bios); 1324 wake_migration_worker(cache); 1325 1326 background_work_end(cache); 1327 } 1328 1329 static void mg_success(struct work_struct *ws) 1330 { 1331 struct dm_cache_migration *mg = ws_to_mg(ws); 1332 mg_complete(mg, mg->k.input == 0); 1333 } 1334 1335 static void mg_update_metadata(struct work_struct *ws) 1336 { 1337 int r; 1338 struct dm_cache_migration *mg = ws_to_mg(ws); 1339 struct cache *cache = mg->cache; 1340 struct policy_work *op = mg->op; 1341 1342 switch (op->op) { 1343 case POLICY_PROMOTE: 1344 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1345 if (r) { 1346 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1347 cache_device_name(cache)); 1348 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1349 1350 mg_complete(mg, false); 1351 return; 1352 } 1353 mg_complete(mg, true); 1354 break; 1355 1356 case POLICY_DEMOTE: 1357 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1358 if (r) { 1359 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1360 cache_device_name(cache)); 1361 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1362 1363 mg_complete(mg, false); 1364 return; 1365 } 1366 1367 /* 1368 * It would be nice if we only had to commit when a REQ_FLUSH 1369 * comes through. 
But there's one scenario that we have to
		 * look out for:
		 *
		 * - oblock x in a cache block
		 * - demotion occurs
		 * - cache block gets reallocated and overwritten
		 * - crash
		 *
		 * When we recover, because there was no commit the cache will
		 * rollback to having the data for oblock x in the cache block.
		 * But the cache block has since been overwritten, so it'll end
		 * up pointing to data that was never in 'x' during the history
		 * of the device.
		 *
		 * To avoid this issue we require a commit as part of the
		 * demotion operation.
		 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_full_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);
	struct cache *cache = mg->cache;
	struct policy_work *op = mg->op;
	bool is_policy_promote = (op->op == POLICY_PROMOTE);

	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
	    is_discarded_oblock(cache, op->oblock)) {
		mg_upgrade_lock(ws);
		return;
	}

	init_continuation(&mg->k, mg_upgrade_lock);

	if (copy(mg, is_policy_promote)) {
		DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
		mg->k.input = BLK_STS_IOERR;
		mg_complete(mg, false);
	}
}

static void mg_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * No exclusive lock was held when we last checked if the bio
		 * was optimisable. So we have to check again in case things
		 * have changed (eg, the block may no longer be discarded).
		 */
		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
			/*
			 * Fallback to a real full copy after doing some tidying up.
			 */
			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
			mg->overwrite_bio = NULL;
			inc_io_migrations(mg->cache);
			mg_full_copy(ws);
			return;
		}

		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
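		 *
		 * mg_update_metadata_after_copy() still checks mg->k.input,
		 * so an error from the overwrite bio fails the migration
		 * rather than updating the metadata.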
1488 */ 1489 overwrite(mg, mg_update_metadata_after_copy); 1490 1491 } else 1492 mg_full_copy(ws); 1493 } 1494 1495 static int mg_lock_writes(struct dm_cache_migration *mg) 1496 { 1497 int r; 1498 struct dm_cell_key_v2 key; 1499 struct cache *cache = mg->cache; 1500 struct dm_bio_prison_cell_v2 *prealloc; 1501 1502 prealloc = alloc_prison_cell(cache); 1503 if (!prealloc) { 1504 DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); 1505 mg_complete(mg, false); 1506 return -ENOMEM; 1507 } 1508 1509 /* 1510 * Prevent writes to the block, but allow reads to continue. 1511 * Unless we're using an overwrite bio, in which case we lock 1512 * everything. 1513 */ 1514 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); 1515 r = dm_cell_lock_v2(cache->prison, &key, 1516 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1517 prealloc, &mg->cell); 1518 if (r < 0) { 1519 free_prison_cell(cache, prealloc); 1520 mg_complete(mg, false); 1521 return r; 1522 } 1523 1524 if (mg->cell != prealloc) 1525 free_prison_cell(cache, prealloc); 1526 1527 if (r == 0) 1528 mg_copy(&mg->k.ws); 1529 else 1530 quiesce(mg, mg_copy); 1531 1532 return 0; 1533 } 1534 1535 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1536 { 1537 struct dm_cache_migration *mg; 1538 1539 if (!background_work_begin(cache)) { 1540 policy_complete_background_work(cache->policy, op, false); 1541 return -EPERM; 1542 } 1543 1544 mg = alloc_migration(cache); 1545 if (!mg) { 1546 policy_complete_background_work(cache->policy, op, false); 1547 background_work_end(cache); 1548 return -ENOMEM; 1549 } 1550 1551 mg->op = op; 1552 mg->overwrite_bio = bio; 1553 1554 if (!bio) 1555 inc_io_migrations(cache); 1556 1557 return mg_lock_writes(mg); 1558 } 1559 1560 /*---------------------------------------------------------------- 1561 * invalidation processing 1562 *--------------------------------------------------------------*/ 1563 1564 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1565 { 1566 struct bio_list bios; 1567 struct cache *cache = mg->cache; 1568 1569 bio_list_init(&bios); 1570 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1571 free_prison_cell(cache, mg->cell); 1572 1573 if (!success && mg->overwrite_bio) 1574 bio_io_error(mg->overwrite_bio); 1575 1576 free_migration(mg); 1577 defer_bios(cache, &bios); 1578 1579 background_work_end(cache); 1580 } 1581 1582 static void invalidate_completed(struct work_struct *ws) 1583 { 1584 struct dm_cache_migration *mg = ws_to_mg(ws); 1585 invalidate_complete(mg, !mg->k.input); 1586 } 1587 1588 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1589 { 1590 int r = policy_invalidate_mapping(cache->policy, cblock); 1591 if (!r) { 1592 r = dm_cache_remove_mapping(cache->cmd, cblock); 1593 if (r) { 1594 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1595 cache_device_name(cache)); 1596 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1597 } 1598 1599 } else if (r == -ENODATA) { 1600 /* 1601 * Harmless, already unmapped. 
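		 * The policy had no mapping for this cblock, so the on-disk
		 * metadata is left untouched and the invalidation is treated
		 * as a success.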
1602 */ 1603 r = 0; 1604 1605 } else 1606 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1607 1608 return r; 1609 } 1610 1611 static void invalidate_remove(struct work_struct *ws) 1612 { 1613 int r; 1614 struct dm_cache_migration *mg = ws_to_mg(ws); 1615 struct cache *cache = mg->cache; 1616 1617 r = invalidate_cblock(cache, mg->invalidate_cblock); 1618 if (r) { 1619 invalidate_complete(mg, false); 1620 return; 1621 } 1622 1623 init_continuation(&mg->k, invalidate_completed); 1624 continue_after_commit(&cache->committer, &mg->k); 1625 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1626 mg->overwrite_bio = NULL; 1627 schedule_commit(&cache->committer); 1628 } 1629 1630 static int invalidate_lock(struct dm_cache_migration *mg) 1631 { 1632 int r; 1633 struct dm_cell_key_v2 key; 1634 struct cache *cache = mg->cache; 1635 struct dm_bio_prison_cell_v2 *prealloc; 1636 1637 prealloc = alloc_prison_cell(cache); 1638 if (!prealloc) { 1639 invalidate_complete(mg, false); 1640 return -ENOMEM; 1641 } 1642 1643 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1644 r = dm_cell_lock_v2(cache->prison, &key, 1645 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1646 if (r < 0) { 1647 free_prison_cell(cache, prealloc); 1648 invalidate_complete(mg, false); 1649 return r; 1650 } 1651 1652 if (mg->cell != prealloc) 1653 free_prison_cell(cache, prealloc); 1654 1655 if (r) 1656 quiesce(mg, invalidate_remove); 1657 1658 else { 1659 /* 1660 * We can't call invalidate_remove() directly here because we 1661 * might still be in request context. 1662 */ 1663 init_continuation(&mg->k, invalidate_remove); 1664 queue_work(cache->wq, &mg->k.ws); 1665 } 1666 1667 return 0; 1668 } 1669 1670 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1671 dm_oblock_t oblock, struct bio *bio) 1672 { 1673 struct dm_cache_migration *mg; 1674 1675 if (!background_work_begin(cache)) 1676 return -EPERM; 1677 1678 mg = alloc_migration(cache); 1679 if (!mg) { 1680 background_work_end(cache); 1681 return -ENOMEM; 1682 } 1683 1684 mg->overwrite_bio = bio; 1685 mg->invalidate_cblock = cblock; 1686 mg->invalidate_oblock = oblock; 1687 1688 return invalidate_lock(mg); 1689 } 1690 1691 /*---------------------------------------------------------------- 1692 * bio processing 1693 *--------------------------------------------------------------*/ 1694 1695 enum busy { 1696 IDLE, 1697 BUSY 1698 }; 1699 1700 static enum busy spare_migration_bandwidth(struct cache *cache) 1701 { 1702 bool idle = iot_idle_for(&cache->tracker, HZ); 1703 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1704 cache->sectors_per_block; 1705 1706 if (idle && current_volume <= cache->migration_threshold) 1707 return IDLE; 1708 else 1709 return BUSY; 1710 } 1711 1712 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1713 { 1714 atomic_inc(bio_data_dir(bio) == READ ? 1715 &cache->stats.read_hit : &cache->stats.write_hit); 1716 } 1717 1718 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1719 { 1720 atomic_inc(bio_data_dir(bio) == READ ? 
1721 &cache->stats.read_miss : &cache->stats.write_miss); 1722 } 1723 1724 /*----------------------------------------------------------------*/ 1725 1726 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1727 bool *commit_needed) 1728 { 1729 int r, data_dir; 1730 bool rb, background_queued; 1731 dm_cblock_t cblock; 1732 1733 *commit_needed = false; 1734 1735 rb = bio_detain_shared(cache, block, bio); 1736 if (!rb) { 1737 /* 1738 * An exclusive lock is held for this block, so we have to 1739 * wait. We set the commit_needed flag so the current 1740 * transaction will be committed asap, allowing this lock 1741 * to be dropped. 1742 */ 1743 *commit_needed = true; 1744 return DM_MAPIO_SUBMITTED; 1745 } 1746 1747 data_dir = bio_data_dir(bio); 1748 1749 if (optimisable_bio(cache, bio, block)) { 1750 struct policy_work *op = NULL; 1751 1752 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1753 if (unlikely(r && r != -ENOENT)) { 1754 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1755 cache_device_name(cache), r); 1756 bio_io_error(bio); 1757 return DM_MAPIO_SUBMITTED; 1758 } 1759 1760 if (r == -ENOENT && op) { 1761 bio_drop_shared_lock(cache, bio); 1762 BUG_ON(op->op != POLICY_PROMOTE); 1763 mg_start(cache, op, bio); 1764 return DM_MAPIO_SUBMITTED; 1765 } 1766 } else { 1767 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1768 if (unlikely(r && r != -ENOENT)) { 1769 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1770 cache_device_name(cache), r); 1771 bio_io_error(bio); 1772 return DM_MAPIO_SUBMITTED; 1773 } 1774 1775 if (background_queued) 1776 wake_migration_worker(cache); 1777 } 1778 1779 if (r == -ENOENT) { 1780 struct per_bio_data *pb = get_per_bio_data(bio); 1781 1782 /* 1783 * Miss. 1784 */ 1785 inc_miss_counter(cache, bio); 1786 if (pb->req_nr == 0) { 1787 accounted_begin(cache, bio); 1788 remap_to_origin_clear_discard(cache, bio, block); 1789 } else { 1790 /* 1791 * This is a duplicate writethrough io that is no 1792 * longer needed because the block has been demoted. 1793 */ 1794 bio_endio(bio); 1795 return DM_MAPIO_SUBMITTED; 1796 } 1797 } else { 1798 /* 1799 * Hit. 1800 */ 1801 inc_hit_counter(cache, bio); 1802 1803 /* 1804 * Passthrough always maps to the origin, invalidating any 1805 * cache blocks that are written to. 1806 */ 1807 if (passthrough_mode(cache)) { 1808 if (bio_data_dir(bio) == WRITE) { 1809 bio_drop_shared_lock(cache, bio); 1810 atomic_inc(&cache->stats.demotion); 1811 invalidate_start(cache, cblock, block, bio); 1812 } else 1813 remap_to_origin_clear_discard(cache, bio, block); 1814 } else { 1815 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && 1816 !is_dirty(cache, cblock)) { 1817 remap_to_origin_and_cache(cache, bio, block, cblock); 1818 accounted_begin(cache, bio); 1819 } else 1820 remap_to_cache_dirty(cache, bio, block, cblock); 1821 } 1822 } 1823 1824 /* 1825 * dm core turns FUA requests into a separate payload and FLUSH req. 1826 */ 1827 if (bio->bi_opf & REQ_FUA) { 1828 /* 1829 * issue_after_commit will call accounted_begin a second time. So 1830 * we call accounted_complete() to avoid double accounting. 
		 */
		accounted_complete(cache, bio);
		issue_after_commit(&cache->committer, bio);
		*commit_needed = true;
		return DM_MAPIO_SUBMITTED;
	}

	return DM_MAPIO_REMAPPED;
}

static bool process_bio(struct cache *cache, struct bio *bio)
{
	bool commit_needed;

	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
		generic_make_request(bio);

	return commit_needed;
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 */
static int commit(struct cache *cache, bool clean_shutdown)
{
	int r;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return -EINVAL;

	atomic_inc(&cache->stats.commit_count);
	r = dm_cache_commit(cache->cmd, clean_shutdown);
	if (r)
		metadata_operation_failed(cache, "dm_cache_commit", r);

	return r;
}

/*
 * Used by the batcher.
 */
static blk_status_t commit_op(void *context)
{
	struct cache *cache = context;

	if (dm_cache_changed_this_transaction(cache->cmd))
		return errno_to_blk_status(commit(cache, false));

	return 0;
}

/*----------------------------------------------------------------*/

static bool process_flush_bio(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue_after_commit(&cache->committer, bio);
	return true;
}

static bool process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_dblock_t b, e;

	// FIXME: do we need to lock the region? Or can we just assume the
	// user won't be so foolish as to issue discard concurrently with
	// other IO?
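	// Note: calc_discard_block_range() rounds the start of the bio up and
	// the end down to discard-block boundaries, so only discard blocks
	// wholly covered by this bio get marked below.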
1904 calc_discard_block_range(cache, bio, &b, &e); 1905 while (b != e) { 1906 set_discard(cache, b); 1907 b = to_dblock(from_dblock(b) + 1); 1908 } 1909 1910 bio_endio(bio); 1911 1912 return false; 1913 } 1914 1915 static void process_deferred_bios(struct work_struct *ws) 1916 { 1917 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1918 1919 unsigned long flags; 1920 bool commit_needed = false; 1921 struct bio_list bios; 1922 struct bio *bio; 1923 1924 bio_list_init(&bios); 1925 1926 spin_lock_irqsave(&cache->lock, flags); 1927 bio_list_merge(&bios, &cache->deferred_bios); 1928 bio_list_init(&cache->deferred_bios); 1929 spin_unlock_irqrestore(&cache->lock, flags); 1930 1931 while ((bio = bio_list_pop(&bios))) { 1932 if (bio->bi_opf & REQ_PREFLUSH) 1933 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1934 1935 else if (bio_op(bio) == REQ_OP_DISCARD) 1936 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1937 1938 else 1939 commit_needed = process_bio(cache, bio) || commit_needed; 1940 } 1941 1942 if (commit_needed) 1943 schedule_commit(&cache->committer); 1944 } 1945 1946 /*---------------------------------------------------------------- 1947 * Main worker loop 1948 *--------------------------------------------------------------*/ 1949 1950 static void requeue_deferred_bios(struct cache *cache) 1951 { 1952 struct bio *bio; 1953 struct bio_list bios; 1954 1955 bio_list_init(&bios); 1956 bio_list_merge(&bios, &cache->deferred_bios); 1957 bio_list_init(&cache->deferred_bios); 1958 1959 while ((bio = bio_list_pop(&bios))) { 1960 bio->bi_status = BLK_STS_DM_REQUEUE; 1961 bio_endio(bio); 1962 } 1963 } 1964 1965 /* 1966 * We want to commit periodically so that not too much 1967 * unwritten metadata builds up. 1968 */ 1969 static void do_waker(struct work_struct *ws) 1970 { 1971 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1972 1973 policy_tick(cache->policy, true); 1974 wake_migration_worker(cache); 1975 schedule_commit(&cache->committer); 1976 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1977 } 1978 1979 static void check_migrations(struct work_struct *ws) 1980 { 1981 int r; 1982 struct policy_work *op; 1983 struct cache *cache = container_of(ws, struct cache, migration_worker); 1984 enum busy b; 1985 1986 for (;;) { 1987 b = spare_migration_bandwidth(cache); 1988 1989 r = policy_get_background_work(cache->policy, b == IDLE, &op); 1990 if (r == -ENODATA) 1991 break; 1992 1993 if (r) { 1994 DMERR_LIMIT("%s: policy_background_work failed", 1995 cache_device_name(cache)); 1996 break; 1997 } 1998 1999 r = mg_start(cache, op, NULL); 2000 if (r) 2001 break; 2002 } 2003 } 2004 2005 /*---------------------------------------------------------------- 2006 * Target methods 2007 *--------------------------------------------------------------*/ 2008 2009 /* 2010 * This function gets called on the error paths of the constructor, so we 2011 * have to cope with a partially initialised struct. 
2012 */ 2013 static void destroy(struct cache *cache) 2014 { 2015 unsigned i; 2016 2017 mempool_exit(&cache->migration_pool); 2018 2019 if (cache->prison) 2020 dm_bio_prison_destroy_v2(cache->prison); 2021 2022 if (cache->wq) 2023 destroy_workqueue(cache->wq); 2024 2025 if (cache->dirty_bitset) 2026 free_bitset(cache->dirty_bitset); 2027 2028 if (cache->discard_bitset) 2029 free_bitset(cache->discard_bitset); 2030 2031 if (cache->copier) 2032 dm_kcopyd_client_destroy(cache->copier); 2033 2034 if (cache->cmd) 2035 dm_cache_metadata_close(cache->cmd); 2036 2037 if (cache->metadata_dev) 2038 dm_put_device(cache->ti, cache->metadata_dev); 2039 2040 if (cache->origin_dev) 2041 dm_put_device(cache->ti, cache->origin_dev); 2042 2043 if (cache->cache_dev) 2044 dm_put_device(cache->ti, cache->cache_dev); 2045 2046 if (cache->policy) 2047 dm_cache_policy_destroy(cache->policy); 2048 2049 for (i = 0; i < cache->nr_ctr_args ; i++) 2050 kfree(cache->ctr_args[i]); 2051 kfree(cache->ctr_args); 2052 2053 bioset_exit(&cache->bs); 2054 2055 kfree(cache); 2056 } 2057 2058 static void cache_dtr(struct dm_target *ti) 2059 { 2060 struct cache *cache = ti->private; 2061 2062 destroy(cache); 2063 } 2064 2065 static sector_t get_dev_size(struct dm_dev *dev) 2066 { 2067 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2068 } 2069 2070 /*----------------------------------------------------------------*/ 2071 2072 /* 2073 * Construct a cache device mapping. 2074 * 2075 * cache <metadata dev> <cache dev> <origin dev> <block size> 2076 * <#feature args> [<feature arg>]* 2077 * <policy> <#policy args> [<policy arg>]* 2078 * 2079 * metadata dev : fast device holding the persistent metadata 2080 * cache dev : fast device holding cached data blocks 2081 * origin dev : slow device holding original data blocks 2082 * block size : cache unit size in sectors 2083 * 2084 * #feature args : number of feature arguments passed 2085 * feature args : writethrough. (The default is writeback.) 2086 * 2087 * policy : the replacement policy to use 2088 * #policy args : an even number of policy arguments corresponding 2089 * to key/value pairs passed to the policy 2090 * policy args : key/value pairs passed to the policy 2091 * E.g. 'sequential_threshold 1024' 2092 * See cache-policies.txt for details. 2093 * 2094 * Optional feature arguments are: 2095 * writethrough : write through caching that prohibits cache block 2096 * content from being different from origin block content. 2097 * Without this argument, the default behaviour is to write 2098 * back cache block contents later for performance reasons, 2099 * so they may differ from the corresponding origin blocks. 
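 *
 * Example table line (illustrative values only; offsets and lengths are in
 * 512-byte sectors and the device names are placeholders):
 *
 *	dmsetup create cached --table \
 *	  '0 41943040 cache /dev/mapper/cache-meta /dev/mapper/cache-ssd \
 *	   /dev/mapper/origin 512 1 writeback default 0'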
2100 */ 2101 struct cache_args { 2102 struct dm_target *ti; 2103 2104 struct dm_dev *metadata_dev; 2105 2106 struct dm_dev *cache_dev; 2107 sector_t cache_sectors; 2108 2109 struct dm_dev *origin_dev; 2110 sector_t origin_sectors; 2111 2112 uint32_t block_size; 2113 2114 const char *policy_name; 2115 int policy_argc; 2116 const char **policy_argv; 2117 2118 struct cache_features features; 2119 }; 2120 2121 static void destroy_cache_args(struct cache_args *ca) 2122 { 2123 if (ca->metadata_dev) 2124 dm_put_device(ca->ti, ca->metadata_dev); 2125 2126 if (ca->cache_dev) 2127 dm_put_device(ca->ti, ca->cache_dev); 2128 2129 if (ca->origin_dev) 2130 dm_put_device(ca->ti, ca->origin_dev); 2131 2132 kfree(ca); 2133 } 2134 2135 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2136 { 2137 if (!as->argc) { 2138 *error = "Insufficient args"; 2139 return false; 2140 } 2141 2142 return true; 2143 } 2144 2145 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2146 char **error) 2147 { 2148 int r; 2149 sector_t metadata_dev_size; 2150 char b[BDEVNAME_SIZE]; 2151 2152 if (!at_least_one_arg(as, error)) 2153 return -EINVAL; 2154 2155 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2156 &ca->metadata_dev); 2157 if (r) { 2158 *error = "Error opening metadata device"; 2159 return r; 2160 } 2161 2162 metadata_dev_size = get_dev_size(ca->metadata_dev); 2163 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2164 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2165 bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS); 2166 2167 return 0; 2168 } 2169 2170 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2171 char **error) 2172 { 2173 int r; 2174 2175 if (!at_least_one_arg(as, error)) 2176 return -EINVAL; 2177 2178 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2179 &ca->cache_dev); 2180 if (r) { 2181 *error = "Error opening cache device"; 2182 return r; 2183 } 2184 ca->cache_sectors = get_dev_size(ca->cache_dev); 2185 2186 return 0; 2187 } 2188 2189 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2190 char **error) 2191 { 2192 int r; 2193 2194 if (!at_least_one_arg(as, error)) 2195 return -EINVAL; 2196 2197 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2198 &ca->origin_dev); 2199 if (r) { 2200 *error = "Error opening origin device"; 2201 return r; 2202 } 2203 2204 ca->origin_sectors = get_dev_size(ca->origin_dev); 2205 if (ca->ti->len > ca->origin_sectors) { 2206 *error = "Device size larger than cached device"; 2207 return -EINVAL; 2208 } 2209 2210 return 0; 2211 } 2212 2213 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2214 char **error) 2215 { 2216 unsigned long block_size; 2217 2218 if (!at_least_one_arg(as, error)) 2219 return -EINVAL; 2220 2221 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2222 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2223 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2224 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2225 *error = "Invalid data block size"; 2226 return -EINVAL; 2227 } 2228 2229 if (block_size > ca->cache_sectors) { 2230 *error = "Data block size is larger than the cache device"; 2231 return -EINVAL; 2232 } 2233 2234 ca->block_size = block_size; 2235 2236 return 0; 2237 } 2238 2239 static void init_features(struct cache_features *cf) 2240 { 2241 cf->mode = CM_WRITE; 2242 cf->io_mode =
CM_IO_WRITEBACK; 2243 cf->metadata_version = 1; 2244 } 2245 2246 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2247 char **error) 2248 { 2249 static const struct dm_arg _args[] = { 2250 {0, 2, "Invalid number of cache feature arguments"}, 2251 }; 2252 2253 int r; 2254 unsigned argc; 2255 const char *arg; 2256 struct cache_features *cf = &ca->features; 2257 2258 init_features(cf); 2259 2260 r = dm_read_arg_group(_args, as, &argc, error); 2261 if (r) 2262 return -EINVAL; 2263 2264 while (argc--) { 2265 arg = dm_shift_arg(as); 2266 2267 if (!strcasecmp(arg, "writeback")) 2268 cf->io_mode = CM_IO_WRITEBACK; 2269 2270 else if (!strcasecmp(arg, "writethrough")) 2271 cf->io_mode = CM_IO_WRITETHROUGH; 2272 2273 else if (!strcasecmp(arg, "passthrough")) 2274 cf->io_mode = CM_IO_PASSTHROUGH; 2275 2276 else if (!strcasecmp(arg, "metadata2")) 2277 cf->metadata_version = 2; 2278 2279 else { 2280 *error = "Unrecognised cache feature requested"; 2281 return -EINVAL; 2282 } 2283 } 2284 2285 return 0; 2286 } 2287 2288 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2289 char **error) 2290 { 2291 static const struct dm_arg _args[] = { 2292 {0, 1024, "Invalid number of policy arguments"}, 2293 }; 2294 2295 int r; 2296 2297 if (!at_least_one_arg(as, error)) 2298 return -EINVAL; 2299 2300 ca->policy_name = dm_shift_arg(as); 2301 2302 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2303 if (r) 2304 return -EINVAL; 2305 2306 ca->policy_argv = (const char **)as->argv; 2307 dm_consume_args(as, ca->policy_argc); 2308 2309 return 0; 2310 } 2311 2312 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2313 char **error) 2314 { 2315 int r; 2316 struct dm_arg_set as; 2317 2318 as.argc = argc; 2319 as.argv = argv; 2320 2321 r = parse_metadata_dev(ca, &as, error); 2322 if (r) 2323 return r; 2324 2325 r = parse_cache_dev(ca, &as, error); 2326 if (r) 2327 return r; 2328 2329 r = parse_origin_dev(ca, &as, error); 2330 if (r) 2331 return r; 2332 2333 r = parse_block_size(ca, &as, error); 2334 if (r) 2335 return r; 2336 2337 r = parse_features(ca, &as, error); 2338 if (r) 2339 return r; 2340 2341 r = parse_policy(ca, &as, error); 2342 if (r) 2343 return r; 2344 2345 return 0; 2346 } 2347 2348 /*----------------------------------------------------------------*/ 2349 2350 static struct kmem_cache *migration_cache; 2351 2352 #define NOT_CORE_OPTION 1 2353 2354 static int process_config_option(struct cache *cache, const char *key, const char *value) 2355 { 2356 unsigned long tmp; 2357 2358 if (!strcasecmp(key, "migration_threshold")) { 2359 if (kstrtoul(value, 10, &tmp)) 2360 return -EINVAL; 2361 2362 cache->migration_threshold = tmp; 2363 return 0; 2364 } 2365 2366 return NOT_CORE_OPTION; 2367 } 2368 2369 static int set_config_value(struct cache *cache, const char *key, const char *value) 2370 { 2371 int r = process_config_option(cache, key, value); 2372 2373 if (r == NOT_CORE_OPTION) 2374 r = policy_set_config_value(cache->policy, key, value); 2375 2376 if (r) 2377 DMWARN("bad config value for %s: %s", key, value); 2378 2379 return r; 2380 } 2381 2382 static int set_config_values(struct cache *cache, int argc, const char **argv) 2383 { 2384 int r = 0; 2385 2386 if (argc & 1) { 2387 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2388 return -EINVAL; 2389 } 2390 2391 while (argc) { 2392 r = set_config_value(cache, argv[0], argv[1]); 2393 if (r) 2394 break; 2395 2396 argc -= 2; 2397 argv += 2; 2398 } 2399 2400 
return r; 2401 } 2402 2403 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2404 char **error) 2405 { 2406 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2407 cache->cache_size, 2408 cache->origin_sectors, 2409 cache->sectors_per_block); 2410 if (IS_ERR(p)) { 2411 *error = "Error creating cache's policy"; 2412 return PTR_ERR(p); 2413 } 2414 cache->policy = p; 2415 BUG_ON(!cache->policy); 2416 2417 return 0; 2418 } 2419 2420 /* 2421 * We want the discard block size to be at least the size of the cache 2422 * block size and have no more than 2^14 discard blocks across the origin. 2423 */ 2424 #define MAX_DISCARD_BLOCKS (1 << 14) 2425 2426 static bool too_many_discard_blocks(sector_t discard_block_size, 2427 sector_t origin_size) 2428 { 2429 (void) sector_div(origin_size, discard_block_size); 2430 2431 return origin_size > MAX_DISCARD_BLOCKS; 2432 } 2433 2434 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2435 sector_t origin_size) 2436 { 2437 sector_t discard_block_size = cache_block_size; 2438 2439 if (origin_size) 2440 while (too_many_discard_blocks(discard_block_size, origin_size)) 2441 discard_block_size *= 2; 2442 2443 return discard_block_size; 2444 } 2445 2446 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2447 { 2448 dm_block_t nr_blocks = from_cblock(size); 2449 2450 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2451 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2452 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2453 "Please consider increasing the cache block size to reduce the overall cache block count.", 2454 (unsigned long long) nr_blocks); 2455 2456 cache->cache_size = size; 2457 } 2458 2459 static int is_congested(struct dm_dev *dev, int bdi_bits) 2460 { 2461 struct request_queue *q = bdev_get_queue(dev->bdev); 2462 return bdi_congested(q->backing_dev_info, bdi_bits); 2463 } 2464 2465 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2466 { 2467 struct cache *cache = container_of(cb, struct cache, callbacks); 2468 2469 return is_congested(cache->origin_dev, bdi_bits) || 2470 is_congested(cache->cache_dev, bdi_bits); 2471 } 2472 2473 #define DEFAULT_MIGRATION_THRESHOLD 2048 2474 2475 static int cache_create(struct cache_args *ca, struct cache **result) 2476 { 2477 int r = 0; 2478 char **error = &ca->ti->error; 2479 struct cache *cache; 2480 struct dm_target *ti = ca->ti; 2481 dm_block_t origin_blocks; 2482 struct dm_cache_metadata *cmd; 2483 bool may_format = ca->features.mode == CM_WRITE; 2484 2485 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2486 if (!cache) 2487 return -ENOMEM; 2488 2489 cache->ti = ca->ti; 2490 ti->private = cache; 2491 ti->num_flush_bios = 2; 2492 ti->flush_supported = true; 2493 2494 ti->num_discard_bios = 1; 2495 ti->discards_supported = true; 2496 ti->split_discard_bios = false; 2497 2498 ti->per_io_data_size = sizeof(struct per_bio_data); 2499 2500 cache->features = ca->features; 2501 if (writethrough_mode(cache)) { 2502 /* Create bioset for writethrough bios issued to origin */ 2503 r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0); 2504 if (r) 2505 goto bad; 2506 } 2507 2508 cache->callbacks.congested_fn = cache_is_congested; 2509 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2510 2511 cache->metadata_dev = ca->metadata_dev; 2512 cache->origin_dev = ca->origin_dev; 2513 cache->cache_dev = ca->cache_dev; 2514 2515 
ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2516 2517 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2518 origin_blocks = block_div(origin_blocks, ca->block_size); 2519 cache->origin_blocks = to_oblock(origin_blocks); 2520 2521 cache->sectors_per_block = ca->block_size; 2522 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2523 r = -EINVAL; 2524 goto bad; 2525 } 2526 2527 if (ca->block_size & (ca->block_size - 1)) { 2528 dm_block_t cache_size = ca->cache_sectors; 2529 2530 cache->sectors_per_block_shift = -1; 2531 cache_size = block_div(cache_size, ca->block_size); 2532 set_cache_size(cache, to_cblock(cache_size)); 2533 } else { 2534 cache->sectors_per_block_shift = __ffs(ca->block_size); 2535 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2536 } 2537 2538 r = create_cache_policy(cache, ca, error); 2539 if (r) 2540 goto bad; 2541 2542 cache->policy_nr_args = ca->policy_argc; 2543 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2544 2545 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2546 if (r) { 2547 *error = "Error setting cache policy's config values"; 2548 goto bad; 2549 } 2550 2551 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2552 ca->block_size, may_format, 2553 dm_cache_policy_get_hint_size(cache->policy), 2554 ca->features.metadata_version); 2555 if (IS_ERR(cmd)) { 2556 *error = "Error creating metadata object"; 2557 r = PTR_ERR(cmd); 2558 goto bad; 2559 } 2560 cache->cmd = cmd; 2561 set_cache_mode(cache, CM_WRITE); 2562 if (get_cache_mode(cache) != CM_WRITE) { 2563 *error = "Unable to get write access to metadata, please check/repair metadata."; 2564 r = -EINVAL; 2565 goto bad; 2566 } 2567 2568 if (passthrough_mode(cache)) { 2569 bool all_clean; 2570 2571 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2572 if (r) { 2573 *error = "dm_cache_metadata_all_clean() failed"; 2574 goto bad; 2575 } 2576 2577 if (!all_clean) { 2578 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2579 r = -EINVAL; 2580 goto bad; 2581 } 2582 2583 policy_allow_migrations(cache->policy, false); 2584 } 2585 2586 spin_lock_init(&cache->lock); 2587 bio_list_init(&cache->deferred_bios); 2588 atomic_set(&cache->nr_allocated_migrations, 0); 2589 atomic_set(&cache->nr_io_migrations, 0); 2590 init_waitqueue_head(&cache->migration_wait); 2591 2592 r = -ENOMEM; 2593 atomic_set(&cache->nr_dirty, 0); 2594 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2595 if (!cache->dirty_bitset) { 2596 *error = "could not allocate dirty bitset"; 2597 goto bad; 2598 } 2599 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2600 2601 cache->discard_block_size = 2602 calculate_discard_block_size(cache->sectors_per_block, 2603 cache->origin_sectors); 2604 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2605 cache->discard_block_size)); 2606 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2607 if (!cache->discard_bitset) { 2608 *error = "could not allocate discard bitset"; 2609 goto bad; 2610 } 2611 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2612 2613 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2614 if (IS_ERR(cache->copier)) { 2615 *error = "could not create kcopyd client"; 2616 r = PTR_ERR(cache->copier); 2617 goto bad; 2618 } 2619 2620 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2621 if (!cache->wq) { 2622 *error = "could 
not create workqueue for metadata object"; 2623 goto bad; 2624 } 2625 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2626 INIT_WORK(&cache->migration_worker, check_migrations); 2627 INIT_DELAYED_WORK(&cache->waker, do_waker); 2628 2629 cache->prison = dm_bio_prison_create_v2(cache->wq); 2630 if (!cache->prison) { 2631 *error = "could not create bio prison"; 2632 goto bad; 2633 } 2634 2635 r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE, 2636 migration_cache); 2637 if (r) { 2638 *error = "Error creating cache's migration mempool"; 2639 goto bad; 2640 } 2641 2642 cache->need_tick_bio = true; 2643 cache->sized = false; 2644 cache->invalidate = false; 2645 cache->commit_requested = false; 2646 cache->loaded_mappings = false; 2647 cache->loaded_discards = false; 2648 2649 load_stats(cache); 2650 2651 atomic_set(&cache->stats.demotion, 0); 2652 atomic_set(&cache->stats.promotion, 0); 2653 atomic_set(&cache->stats.copies_avoided, 0); 2654 atomic_set(&cache->stats.cache_cell_clash, 0); 2655 atomic_set(&cache->stats.commit_count, 0); 2656 atomic_set(&cache->stats.discard_count, 0); 2657 2658 spin_lock_init(&cache->invalidation_lock); 2659 INIT_LIST_HEAD(&cache->invalidation_requests); 2660 2661 batcher_init(&cache->committer, commit_op, cache, 2662 issue_op, cache, cache->wq); 2663 iot_init(&cache->tracker); 2664 2665 init_rwsem(&cache->background_work_lock); 2666 prevent_background_work(cache); 2667 2668 *result = cache; 2669 return 0; 2670 bad: 2671 destroy(cache); 2672 return r; 2673 } 2674 2675 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2676 { 2677 unsigned i; 2678 const char **copy; 2679 2680 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2681 if (!copy) 2682 return -ENOMEM; 2683 for (i = 0; i < argc; i++) { 2684 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2685 if (!copy[i]) { 2686 while (i--) 2687 kfree(copy[i]); 2688 kfree(copy); 2689 return -ENOMEM; 2690 } 2691 } 2692 2693 cache->nr_ctr_args = argc; 2694 cache->ctr_args = copy; 2695 2696 return 0; 2697 } 2698 2699 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2700 { 2701 int r = -EINVAL; 2702 struct cache_args *ca; 2703 struct cache *cache = NULL; 2704 2705 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2706 if (!ca) { 2707 ti->error = "Error allocating memory for cache"; 2708 return -ENOMEM; 2709 } 2710 ca->ti = ti; 2711 2712 r = parse_cache_args(ca, argc, argv, &ti->error); 2713 if (r) 2714 goto out; 2715 2716 r = cache_create(ca, &cache); 2717 if (r) 2718 goto out; 2719 2720 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2721 if (r) { 2722 destroy(cache); 2723 goto out; 2724 } 2725 2726 ti->private = cache; 2727 out: 2728 destroy_cache_args(ca); 2729 return r; 2730 } 2731 2732 /*----------------------------------------------------------------*/ 2733 2734 static int cache_map(struct dm_target *ti, struct bio *bio) 2735 { 2736 struct cache *cache = ti->private; 2737 2738 int r; 2739 bool commit_needed; 2740 dm_oblock_t block = get_bio_block(cache, bio); 2741 2742 init_per_bio_data(bio); 2743 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2744 /* 2745 * This can only occur if the io goes to a partial block at 2746 * the end of the origin device. We don't cache these. 2747 * Just remap to the origin and carry on. 
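 * (For example, with 512 sector cache blocks and a 1030 sector origin,
 * sectors 1024-1029 form a partial block and are always remapped to the
 * origin.)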
2748 */ 2749 remap_to_origin(cache, bio); 2750 accounted_begin(cache, bio); 2751 return DM_MAPIO_REMAPPED; 2752 } 2753 2754 if (discard_or_flush(bio)) { 2755 defer_bio(cache, bio); 2756 return DM_MAPIO_SUBMITTED; 2757 } 2758 2759 r = map_bio(cache, bio, block, &commit_needed); 2760 if (commit_needed) 2761 schedule_commit(&cache->committer); 2762 2763 return r; 2764 } 2765 2766 static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) 2767 { 2768 struct cache *cache = ti->private; 2769 unsigned long flags; 2770 struct per_bio_data *pb = get_per_bio_data(bio); 2771 2772 if (pb->tick) { 2773 policy_tick(cache->policy, false); 2774 2775 spin_lock_irqsave(&cache->lock, flags); 2776 cache->need_tick_bio = true; 2777 spin_unlock_irqrestore(&cache->lock, flags); 2778 } 2779 2780 bio_drop_shared_lock(cache, bio); 2781 accounted_complete(cache, bio); 2782 2783 return DM_ENDIO_DONE; 2784 } 2785 2786 static int write_dirty_bitset(struct cache *cache) 2787 { 2788 int r; 2789 2790 if (get_cache_mode(cache) >= CM_READ_ONLY) 2791 return -EINVAL; 2792 2793 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2794 if (r) 2795 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2796 2797 return r; 2798 } 2799 2800 static int write_discard_bitset(struct cache *cache) 2801 { 2802 unsigned i, r; 2803 2804 if (get_cache_mode(cache) >= CM_READ_ONLY) 2805 return -EINVAL; 2806 2807 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2808 cache->discard_nr_blocks); 2809 if (r) { 2810 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2811 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2812 return r; 2813 } 2814 2815 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2816 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2817 is_discarded(cache, to_dblock(i))); 2818 if (r) { 2819 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2820 return r; 2821 } 2822 } 2823 2824 return 0; 2825 } 2826 2827 static int write_hints(struct cache *cache) 2828 { 2829 int r; 2830 2831 if (get_cache_mode(cache) >= CM_READ_ONLY) 2832 return -EINVAL; 2833 2834 r = dm_cache_write_hints(cache->cmd, cache->policy); 2835 if (r) { 2836 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2837 return r; 2838 } 2839 2840 return 0; 2841 } 2842 2843 /* 2844 * returns true on success 2845 */ 2846 static bool sync_metadata(struct cache *cache) 2847 { 2848 int r1, r2, r3, r4; 2849 2850 r1 = write_dirty_bitset(cache); 2851 if (r1) 2852 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2853 2854 r2 = write_discard_bitset(cache); 2855 if (r2) 2856 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2857 2858 save_stats(cache); 2859 2860 r3 = write_hints(cache); 2861 if (r3) 2862 DMERR("%s: could not write hints", cache_device_name(cache)); 2863 2864 /* 2865 * If writing the above metadata failed, we still commit, but don't 2866 * set the clean shutdown flag. This will effectively force every 2867 * dirty bit to be set on reload. 
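 * (The second argument to commit() below is the clean shutdown flag, so
 * it is only true when r1, r2 and r3 are all zero.)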
2868 */ 2869 r4 = commit(cache, !r1 && !r2 && !r3); 2870 if (r4) 2871 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 2872 2873 return !r1 && !r2 && !r3 && !r4; 2874 } 2875 2876 static void cache_postsuspend(struct dm_target *ti) 2877 { 2878 struct cache *cache = ti->private; 2879 2880 prevent_background_work(cache); 2881 BUG_ON(atomic_read(&cache->nr_io_migrations)); 2882 2883 cancel_delayed_work(&cache->waker); 2884 flush_workqueue(cache->wq); 2885 WARN_ON(cache->tracker.in_flight); 2886 2887 /* 2888 * If it's a flush suspend there won't be any deferred bios, so this 2889 * call is harmless. 2890 */ 2891 requeue_deferred_bios(cache); 2892 2893 if (get_cache_mode(cache) == CM_WRITE) 2894 (void) sync_metadata(cache); 2895 } 2896 2897 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2898 bool dirty, uint32_t hint, bool hint_valid) 2899 { 2900 int r; 2901 struct cache *cache = context; 2902 2903 if (dirty) { 2904 set_bit(from_cblock(cblock), cache->dirty_bitset); 2905 atomic_inc(&cache->nr_dirty); 2906 } else 2907 clear_bit(from_cblock(cblock), cache->dirty_bitset); 2908 2909 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); 2910 if (r) 2911 return r; 2912 2913 return 0; 2914 } 2915 2916 /* 2917 * The discard block size in the on disk metadata is not 2918 * necessarily the same as we're currently using. So we have to 2919 * be careful to only set the discarded attribute if we know it 2920 * covers a complete block of the new size. 2921 */ 2922 struct discard_load_info { 2923 struct cache *cache; 2924 2925 /* 2926 * These blocks are sized using the on disk dblock size, rather 2927 * than the current one. 2928 */ 2929 dm_block_t block_size; 2930 dm_block_t discard_begin, discard_end; 2931 }; 2932 2933 static void discard_load_info_init(struct cache *cache, 2934 struct discard_load_info *li) 2935 { 2936 li->cache = cache; 2937 li->discard_begin = li->discard_end = 0; 2938 } 2939 2940 static void set_discard_range(struct discard_load_info *li) 2941 { 2942 sector_t b, e; 2943 2944 if (li->discard_begin == li->discard_end) 2945 return; 2946 2947 /* 2948 * Convert to sectors. 2949 */ 2950 b = li->discard_begin * li->block_size; 2951 e = li->discard_end * li->block_size; 2952 2953 /* 2954 * Then convert back to the current dblock size. 2955 */ 2956 b = dm_sector_div_up(b, li->cache->discard_block_size); 2957 sector_div(e, li->cache->discard_block_size); 2958 2959 /* 2960 * The origin may have shrunk, so we need to check we're still in 2961 * bounds. 2962 */ 2963 if (e > from_dblock(li->cache->discard_nr_blocks)) 2964 e = from_dblock(li->cache->discard_nr_blocks); 2965 2966 for (; b < e; b++) 2967 set_discard(li->cache, to_dblock(b)); 2968 } 2969 2970 static int load_discard(void *context, sector_t discard_block_size, 2971 dm_dblock_t dblock, bool discard) 2972 { 2973 struct discard_load_info *li = context; 2974 2975 li->block_size = discard_block_size; 2976 2977 if (discard) { 2978 if (from_dblock(dblock) == li->discard_end) 2979 /* 2980 * We're already in a discard range, just extend it. 2981 */ 2982 li->discard_end = li->discard_end + 1ULL; 2983 2984 else { 2985 /* 2986 * Emit the old range and start a new one.
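 * set_discard_range() converts the accumulated run from the on-disk
 * dblock size to the current discard block size before marking it.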
2987 */ 2988 set_discard_range(li); 2989 li->discard_begin = from_dblock(dblock); 2990 li->discard_end = li->discard_begin + 1ULL; 2991 } 2992 } else { 2993 set_discard_range(li); 2994 li->discard_begin = li->discard_end = 0; 2995 } 2996 2997 return 0; 2998 } 2999 3000 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3001 { 3002 sector_t size = get_dev_size(cache->cache_dev); 3003 (void) sector_div(size, cache->sectors_per_block); 3004 return to_cblock(size); 3005 } 3006 3007 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3008 { 3009 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3010 return true; 3011 3012 /* 3013 * We can't drop a dirty block when shrinking the cache. 3014 */ 3015 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3016 if (is_dirty(cache, new_size)) { 3017 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3018 cache_device_name(cache), 3019 (unsigned long long) from_cblock(new_size)); 3020 return false; 3021 } 3022 new_size = to_cblock(from_cblock(new_size) + 1); 3023 } 3024 3025 return true; 3026 } 3027 3028 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3029 { 3030 int r; 3031 3032 r = dm_cache_resize(cache->cmd, new_size); 3033 if (r) { 3034 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3035 metadata_operation_failed(cache, "dm_cache_resize", r); 3036 return r; 3037 } 3038 3039 set_cache_size(cache, new_size); 3040 3041 return 0; 3042 } 3043 3044 static int cache_preresume(struct dm_target *ti) 3045 { 3046 int r = 0; 3047 struct cache *cache = ti->private; 3048 dm_cblock_t csize = get_cache_dev_size(cache); 3049 3050 /* 3051 * Check to see if the cache has resized. 3052 */ 3053 if (!cache->sized) { 3054 r = resize_cache_dev(cache, csize); 3055 if (r) 3056 return r; 3057 3058 cache->sized = true; 3059 3060 } else if (csize != cache->cache_size) { 3061 if (!can_resize(cache, csize)) 3062 return -EINVAL; 3063 3064 r = resize_cache_dev(cache, csize); 3065 if (r) 3066 return r; 3067 } 3068 3069 if (!cache->loaded_mappings) { 3070 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3071 load_mapping, cache); 3072 if (r) { 3073 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3074 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3075 return r; 3076 } 3077 3078 cache->loaded_mappings = true; 3079 } 3080 3081 if (!cache->loaded_discards) { 3082 struct discard_load_info li; 3083 3084 /* 3085 * The discard bitset could have been resized, or the 3086 * discard block size changed. To be safe we start by 3087 * setting every dblock to not discarded.
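 * load_discard() then re-marks any ranges recorded as discarded in the
 * metadata, converting from the on-disk discard block size as it goes.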
3088 */ 3089 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3090 3091 discard_load_info_init(cache, &li); 3092 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3093 if (r) { 3094 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3095 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3096 return r; 3097 } 3098 set_discard_range(&li); 3099 3100 cache->loaded_discards = true; 3101 } 3102 3103 return r; 3104 } 3105 3106 static void cache_resume(struct dm_target *ti) 3107 { 3108 struct cache *cache = ti->private; 3109 3110 cache->need_tick_bio = true; 3111 allow_background_work(cache); 3112 do_waker(&cache->waker.work); 3113 } 3114 3115 /* 3116 * Status format: 3117 * 3118 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3119 * <cache block size> <#used cache blocks>/<#total cache blocks> 3120 * <#read hits> <#read misses> <#write hits> <#write misses> 3121 * <#demotions> <#promotions> <#dirty> 3122 * <#features> <features>* 3123 * <#core args> <core args> 3124 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3125 */ 3126 static void cache_status(struct dm_target *ti, status_type_t type, 3127 unsigned status_flags, char *result, unsigned maxlen) 3128 { 3129 int r = 0; 3130 unsigned i; 3131 ssize_t sz = 0; 3132 dm_block_t nr_free_blocks_metadata = 0; 3133 dm_block_t nr_blocks_metadata = 0; 3134 char buf[BDEVNAME_SIZE]; 3135 struct cache *cache = ti->private; 3136 dm_cblock_t residency; 3137 bool needs_check; 3138 3139 switch (type) { 3140 case STATUSTYPE_INFO: 3141 if (get_cache_mode(cache) == CM_FAIL) { 3142 DMEMIT("Fail"); 3143 break; 3144 } 3145 3146 /* Commit to ensure statistics aren't out-of-date */ 3147 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3148 (void) commit(cache, false); 3149 3150 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3151 if (r) { 3152 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3153 cache_device_name(cache), r); 3154 goto err; 3155 } 3156 3157 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3158 if (r) { 3159 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3160 cache_device_name(cache), r); 3161 goto err; 3162 } 3163 3164 residency = policy_residency(cache->policy); 3165 3166 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3167 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3168 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3169 (unsigned long long)nr_blocks_metadata, 3170 (unsigned long long)cache->sectors_per_block, 3171 (unsigned long long) from_cblock(residency), 3172 (unsigned long long) from_cblock(cache->cache_size), 3173 (unsigned) atomic_read(&cache->stats.read_hit), 3174 (unsigned) atomic_read(&cache->stats.read_miss), 3175 (unsigned) atomic_read(&cache->stats.write_hit), 3176 (unsigned) atomic_read(&cache->stats.write_miss), 3177 (unsigned) atomic_read(&cache->stats.demotion), 3178 (unsigned) atomic_read(&cache->stats.promotion), 3179 (unsigned long) atomic_read(&cache->nr_dirty)); 3180 3181 if (cache->features.metadata_version == 2) 3182 DMEMIT("2 metadata2 "); 3183 else 3184 DMEMIT("1 "); 3185 3186 if (writethrough_mode(cache)) 3187 DMEMIT("writethrough "); 3188 3189 else if (passthrough_mode(cache)) 3190 DMEMIT("passthrough "); 3191 3192 else if (writeback_mode(cache)) 3193 DMEMIT("writeback "); 3194 3195 else { 3196 DMERR("%s: internal error: unknown io mode: %d", 3197 cache_device_name(cache), 
(int) cache->features.io_mode); 3198 goto err; 3199 } 3200 3201 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3202 3203 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3204 if (sz < maxlen) { 3205 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3206 if (r) 3207 DMERR("%s: policy_emit_config_values returned %d", 3208 cache_device_name(cache), r); 3209 } 3210 3211 if (get_cache_mode(cache) == CM_READ_ONLY) 3212 DMEMIT("ro "); 3213 else 3214 DMEMIT("rw "); 3215 3216 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3217 3218 if (r || needs_check) 3219 DMEMIT("needs_check "); 3220 else 3221 DMEMIT("- "); 3222 3223 break; 3224 3225 case STATUSTYPE_TABLE: 3226 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3227 DMEMIT("%s ", buf); 3228 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3229 DMEMIT("%s ", buf); 3230 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3231 DMEMIT("%s", buf); 3232 3233 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3234 DMEMIT(" %s", cache->ctr_args[i]); 3235 if (cache->nr_ctr_args) 3236 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3237 } 3238 3239 return; 3240 3241 err: 3242 DMEMIT("Error"); 3243 } 3244 3245 /* 3246 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3247 * the one-past-the-end value. 3248 */ 3249 struct cblock_range { 3250 dm_cblock_t begin; 3251 dm_cblock_t end; 3252 }; 3253 3254 /* 3255 * A cache block range can take two forms: 3256 * 3257 * i) A single cblock, eg. '3456' 3258 * ii) A begin and end cblock with a dash between, eg. 123-234 3259 */ 3260 static int parse_cblock_range(struct cache *cache, const char *str, 3261 struct cblock_range *result) 3262 { 3263 char dummy; 3264 uint64_t b, e; 3265 int r; 3266 3267 /* 3268 * Try and parse form (ii) first. 3269 */ 3270 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3271 if (r < 0) 3272 return r; 3273 3274 if (r == 2) { 3275 result->begin = to_cblock(b); 3276 result->end = to_cblock(e); 3277 return 0; 3278 } 3279 3280 /* 3281 * That didn't work, try form (i). 3282 */ 3283 r = sscanf(str, "%llu%c", &b, &dummy); 3284 if (r < 0) 3285 return r; 3286 3287 if (r == 1) { 3288 result->begin = to_cblock(b); 3289 result->end = to_cblock(from_cblock(result->begin) + 1u); 3290 return 0; 3291 } 3292 3293 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3294 return -EINVAL; 3295 } 3296 3297 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3298 { 3299 uint64_t b = from_cblock(range->begin); 3300 uint64_t e = from_cblock(range->end); 3301 uint64_t n = from_cblock(cache->cache_size); 3302 3303 if (b >= n) { 3304 DMERR("%s: begin cblock out of range: %llu >= %llu", 3305 cache_device_name(cache), b, n); 3306 return -EINVAL; 3307 } 3308 3309 if (e > n) { 3310 DMERR("%s: end cblock out of range: %llu > %llu", 3311 cache_device_name(cache), e, n); 3312 return -EINVAL; 3313 } 3314 3315 if (b >= e) { 3316 DMERR("%s: invalid cblock range: %llu >= %llu", 3317 cache_device_name(cache), b, e); 3318 return -EINVAL; 3319 } 3320 3321 return 0; 3322 } 3323 3324 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3325 { 3326 return to_cblock(from_cblock(b) + 1); 3327 } 3328 3329 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3330 { 3331 int r = 0; 3332 3333 /* 3334 * We don't need to do any locking here because we know we're in 3335 * passthrough mode. 
There is potential for a race between an 3336 * invalidation triggered by an io and an invalidation message. This 3337 * is harmless, so we needn't worry if the policy call fails. 3338 */ 3339 while (range->begin != range->end) { 3340 r = invalidate_cblock(cache, range->begin); 3341 if (r) 3342 return r; 3343 3344 range->begin = cblock_succ(range->begin); 3345 } 3346 3347 cache->commit_requested = true; 3348 return r; 3349 } 3350 3351 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3352 const char **cblock_ranges) 3353 { 3354 int r = 0; 3355 unsigned i; 3356 struct cblock_range range; 3357 3358 if (!passthrough_mode(cache)) { 3359 DMERR("%s: cache has to be in passthrough mode for invalidation", 3360 cache_device_name(cache)); 3361 return -EPERM; 3362 } 3363 3364 for (i = 0; i < count; i++) { 3365 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3366 if (r) 3367 break; 3368 3369 r = validate_cblock_range(cache, &range); 3370 if (r) 3371 break; 3372 3373 /* 3374 * Invalidate the cblocks in this range. 3375 */ 3376 r = request_invalidation(cache, &range); 3377 if (r) 3378 break; 3379 } 3380 3381 return r; 3382 } 3383 3384 /* 3385 * Supports 3386 * "<key> <value>" 3387 * and 3388 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*" 3389 * 3390 * The key migration_threshold is supported by the cache target core. 3391 */ 3392 static int cache_message(struct dm_target *ti, unsigned argc, char **argv, 3393 char *result, unsigned maxlen) 3394 { 3395 struct cache *cache = ti->private; 3396 3397 if (!argc) 3398 return -EINVAL; 3399 3400 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3401 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3402 cache_device_name(cache)); 3403 return -EOPNOTSUPP; 3404 } 3405 3406 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3407 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3408 3409 if (argc != 2) 3410 return -EINVAL; 3411 3412 return set_config_value(cache, argv[0], argv[1]); 3413 } 3414 3415 static int cache_iterate_devices(struct dm_target *ti, 3416 iterate_devices_callout_fn fn, void *data) 3417 { 3418 int r = 0; 3419 struct cache *cache = ti->private; 3420 3421 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3422 if (!r) 3423 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3424 3425 return r; 3426 } 3427 3428 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3429 { 3430 /* 3431 * FIXME: these limits may be incompatible with the cache device 3432 */ 3433 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3434 cache->origin_sectors); 3435 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3436 } 3437 3438 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3439 { 3440 struct cache *cache = ti->private; 3441 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3442 3443 /* 3444 * If the system-determined stacked limits are compatible with the 3445 * cache's blocksize (io_opt is a factor) do not override them.
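 * For example, with 512 sector cache blocks an io_opt of 1024 sectors is
 * left alone, whereas 768 sectors (not a multiple) would be overridden.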
3446 */ 3447 if (io_opt_sectors < cache->sectors_per_block || 3448 do_div(io_opt_sectors, cache->sectors_per_block)) { 3449 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3450 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3451 } 3452 set_discard_limits(cache, limits); 3453 } 3454 3455 /*----------------------------------------------------------------*/ 3456 3457 static struct target_type cache_target = { 3458 .name = "cache", 3459 .version = {2, 0, 0}, 3460 .module = THIS_MODULE, 3461 .ctr = cache_ctr, 3462 .dtr = cache_dtr, 3463 .map = cache_map, 3464 .end_io = cache_end_io, 3465 .postsuspend = cache_postsuspend, 3466 .preresume = cache_preresume, 3467 .resume = cache_resume, 3468 .status = cache_status, 3469 .message = cache_message, 3470 .iterate_devices = cache_iterate_devices, 3471 .io_hints = cache_io_hints, 3472 }; 3473 3474 static int __init dm_cache_init(void) 3475 { 3476 int r; 3477 3478 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3479 if (!migration_cache) 3480 return -ENOMEM; 3481 3482 r = dm_register_target(&cache_target); 3483 if (r) { 3484 DMERR("cache target registration failed: %d", r); 3485 kmem_cache_destroy(migration_cache); 3486 return r; 3487 } 3488 3489 return 0; 3490 } 3491 3492 3493 static void __exit dm_cache_exit(void) 3494 { 3495 dm_unregister_target(&cache_target); 3496 kmem_cache_destroy(migration_cache); 3497 } 3498 3499 module_init(dm_cache_init); 3500 module_exit(dm_cache_exit); 3501 3502 MODULE_DESCRIPTION(DM_NAME " cache target"); 3503 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3504 MODULE_LICENSE("GPL"); 3505