1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison-v2.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/rwsem.h> 19 #include <linux/slab.h> 20 #include <linux/vmalloc.h> 21 22 #define DM_MSG_PREFIX "cache" 23 24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 25 "A percentage of time allocated for copying to and/or from cache"); 26 27 /*----------------------------------------------------------------*/ 28 29 /* 30 * Glossary: 31 * 32 * oblock: index of an origin block 33 * cblock: index of a cache block 34 * promotion: movement of a block from origin to cache 35 * demotion: movement of a block from cache to origin 36 * migration: movement of a block between the origin and cache device, 37 * either direction 38 */ 39 40 /*----------------------------------------------------------------*/ 41 42 struct io_tracker { 43 spinlock_t lock; 44 45 /* 46 * Sectors of in-flight IO. 47 */ 48 sector_t in_flight; 49 50 /* 51 * The time, in jiffies, when this device became idle (if it is 52 * indeed idle). 53 */ 54 unsigned long idle_time; 55 unsigned long last_update_time; 56 }; 57 58 static void iot_init(struct io_tracker *iot) 59 { 60 spin_lock_init(&iot->lock); 61 iot->in_flight = 0ul; 62 iot->idle_time = 0ul; 63 iot->last_update_time = jiffies; 64 } 65 66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 67 { 68 if (iot->in_flight) 69 return false; 70 71 return time_after(jiffies, iot->idle_time + jifs); 72 } 73 74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 75 { 76 bool r; 77 unsigned long flags; 78 79 spin_lock_irqsave(&iot->lock, flags); 80 r = __iot_idle_for(iot, jifs); 81 spin_unlock_irqrestore(&iot->lock, flags); 82 83 return r; 84 } 85 86 static void iot_io_begin(struct io_tracker *iot, sector_t len) 87 { 88 unsigned long flags; 89 90 spin_lock_irqsave(&iot->lock, flags); 91 iot->in_flight += len; 92 spin_unlock_irqrestore(&iot->lock, flags); 93 } 94 95 static void __iot_io_end(struct io_tracker *iot, sector_t len) 96 { 97 if (!len) 98 return; 99 100 iot->in_flight -= len; 101 if (!iot->in_flight) 102 iot->idle_time = jiffies; 103 } 104 105 static void iot_io_end(struct io_tracker *iot, sector_t len) 106 { 107 unsigned long flags; 108 109 spin_lock_irqsave(&iot->lock, flags); 110 __iot_io_end(iot, len); 111 spin_unlock_irqrestore(&iot->lock, flags); 112 } 113 114 /*----------------------------------------------------------------*/ 115 116 /* 117 * Represents a chunk of future work. 'input' allows continuations to pass 118 * values between themselves, typically error values. 119 */ 120 struct continuation { 121 struct work_struct ws; 122 blk_status_t input; 123 }; 124 125 static inline void init_continuation(struct continuation *k, 126 void (*fn)(struct work_struct *)) 127 { 128 INIT_WORK(&k->ws, fn); 129 k->input = 0; 130 } 131 132 static inline void queue_continuation(struct workqueue_struct *wq, 133 struct continuation *k) 134 { 135 queue_work(wq, &k->ws); 136 } 137 138 /*----------------------------------------------------------------*/ 139 140 /* 141 * The batcher collects together pieces of work that need a particular 142 * operation to occur before they can proceed (typically a commit). 
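 *
 * A rough usage sketch (illustrative only, built from the helpers defined
 * below; 'my_continuation_fn' is a made-up name for whatever work_struct
 * handler should run once the commit completes):
 *
 *	init_continuation(&k, my_continuation_fn);
 *	continue_after_commit(&committer, &k);
 *	schedule_commit(&committer);
 *
 * Bios can be queued in the same way with issue_after_commit(); after the
 * commit they are either issued via issue_op() or errored.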
143 */ 144 struct batcher { 145 /* 146 * The operation that everyone is waiting for. 147 */ 148 blk_status_t (*commit_op)(void *context); 149 void *commit_context; 150 151 /* 152 * This is how bios should be issued once the commit op is complete 153 * (accounted_request). 154 */ 155 void (*issue_op)(struct bio *bio, void *context); 156 void *issue_context; 157 158 /* 159 * Queued work gets put on here after commit. 160 */ 161 struct workqueue_struct *wq; 162 163 spinlock_t lock; 164 struct list_head work_items; 165 struct bio_list bios; 166 struct work_struct commit_work; 167 168 bool commit_scheduled; 169 }; 170 171 static void __commit(struct work_struct *_ws) 172 { 173 struct batcher *b = container_of(_ws, struct batcher, commit_work); 174 blk_status_t r; 175 unsigned long flags; 176 struct list_head work_items; 177 struct work_struct *ws, *tmp; 178 struct continuation *k; 179 struct bio *bio; 180 struct bio_list bios; 181 182 INIT_LIST_HEAD(&work_items); 183 bio_list_init(&bios); 184 185 /* 186 * We have to grab these before the commit_op to avoid a race 187 * condition. 188 */ 189 spin_lock_irqsave(&b->lock, flags); 190 list_splice_init(&b->work_items, &work_items); 191 bio_list_merge(&bios, &b->bios); 192 bio_list_init(&b->bios); 193 b->commit_scheduled = false; 194 spin_unlock_irqrestore(&b->lock, flags); 195 196 r = b->commit_op(b->commit_context); 197 198 list_for_each_entry_safe(ws, tmp, &work_items, entry) { 199 k = container_of(ws, struct continuation, ws); 200 k->input = r; 201 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ 202 queue_work(b->wq, ws); 203 } 204 205 while ((bio = bio_list_pop(&bios))) { 206 if (r) { 207 bio->bi_status = r; 208 bio_endio(bio); 209 } else 210 b->issue_op(bio, b->issue_context); 211 } 212 } 213 214 static void batcher_init(struct batcher *b, 215 blk_status_t (*commit_op)(void *), 216 void *commit_context, 217 void (*issue_op)(struct bio *bio, void *), 218 void *issue_context, 219 struct workqueue_struct *wq) 220 { 221 b->commit_op = commit_op; 222 b->commit_context = commit_context; 223 b->issue_op = issue_op; 224 b->issue_context = issue_context; 225 b->wq = wq; 226 227 spin_lock_init(&b->lock); 228 INIT_LIST_HEAD(&b->work_items); 229 bio_list_init(&b->bios); 230 INIT_WORK(&b->commit_work, __commit); 231 b->commit_scheduled = false; 232 } 233 234 static void async_commit(struct batcher *b) 235 { 236 queue_work(b->wq, &b->commit_work); 237 } 238 239 static void continue_after_commit(struct batcher *b, struct continuation *k) 240 { 241 unsigned long flags; 242 bool commit_scheduled; 243 244 spin_lock_irqsave(&b->lock, flags); 245 commit_scheduled = b->commit_scheduled; 246 list_add_tail(&k->ws.entry, &b->work_items); 247 spin_unlock_irqrestore(&b->lock, flags); 248 249 if (commit_scheduled) 250 async_commit(b); 251 } 252 253 /* 254 * Bios are errored if commit failed. 255 */ 256 static void issue_after_commit(struct batcher *b, struct bio *bio) 257 { 258 unsigned long flags; 259 bool commit_scheduled; 260 261 spin_lock_irqsave(&b->lock, flags); 262 commit_scheduled = b->commit_scheduled; 263 bio_list_add(&b->bios, bio); 264 spin_unlock_irqrestore(&b->lock, flags); 265 266 if (commit_scheduled) 267 async_commit(b); 268 } 269 270 /* 271 * Call this if some urgent work is waiting for the commit to complete. 
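 * If work or bios are already queued, the commit is kicked off immediately;
 * otherwise it is merely marked as scheduled and will be triggered by the
 * next caller that queues work or a bio against the batcher.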
 */
static void schedule_commit(struct batcher *b)
{
	bool immediate;
	unsigned long flags;

	spin_lock_irqsave(&b->lock, flags);
	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
	b->commit_scheduled = true;
	spin_unlock_irqrestore(&b->lock, flags);

	if (immediate)
		async_commit(b);
}

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function.  We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned metadata_version;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
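	 *
	 * When sectors_per_block is a power of two, sectors_per_block_shift
	 * holds its log2 and the remapping code below uses shifts and masks,
	 * roughly (illustrative; with a 512-sector block the shift is 9):
	 *
	 *	oblock = bi_sector >> sectors_per_block_shift;
	 *
	 * Otherwise the shift is negative and the slower sector_div()/do_div()
	 * based helpers are used instead.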
406 */ 407 sector_t sectors_per_block; 408 int sectors_per_block_shift; 409 410 spinlock_t lock; 411 struct list_head deferred_cells; 412 struct bio_list deferred_bios; 413 struct bio_list deferred_writethrough_bios; 414 sector_t migration_threshold; 415 wait_queue_head_t migration_wait; 416 atomic_t nr_allocated_migrations; 417 418 /* 419 * The number of in flight migrations that are performing 420 * background io. eg, promotion, writeback. 421 */ 422 atomic_t nr_io_migrations; 423 424 struct rw_semaphore quiesce_lock; 425 426 /* 427 * cache_size entries, dirty if set 428 */ 429 atomic_t nr_dirty; 430 unsigned long *dirty_bitset; 431 432 /* 433 * origin_blocks entries, discarded if set. 434 */ 435 dm_dblock_t discard_nr_blocks; 436 unsigned long *discard_bitset; 437 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 438 439 /* 440 * Rather than reconstructing the table line for the status we just 441 * save it and regurgitate. 442 */ 443 unsigned nr_ctr_args; 444 const char **ctr_args; 445 446 struct dm_kcopyd_client *copier; 447 struct workqueue_struct *wq; 448 struct work_struct deferred_bio_worker; 449 struct work_struct deferred_writethrough_worker; 450 struct work_struct migration_worker; 451 struct delayed_work waker; 452 struct dm_bio_prison_v2 *prison; 453 454 mempool_t *migration_pool; 455 456 struct dm_cache_policy *policy; 457 unsigned policy_nr_args; 458 459 bool need_tick_bio:1; 460 bool sized:1; 461 bool invalidate:1; 462 bool commit_requested:1; 463 bool loaded_mappings:1; 464 bool loaded_discards:1; 465 466 /* 467 * Cache features such as write-through. 468 */ 469 struct cache_features features; 470 471 struct cache_stats stats; 472 473 /* 474 * Invalidation fields. 475 */ 476 spinlock_t invalidation_lock; 477 struct list_head invalidation_requests; 478 479 struct io_tracker tracker; 480 481 struct work_struct commit_ws; 482 struct batcher committer; 483 484 struct rw_semaphore background_work_lock; 485 }; 486 487 struct per_bio_data { 488 bool tick:1; 489 unsigned req_nr:2; 490 struct dm_bio_prison_cell_v2 *cell; 491 struct dm_hook_info hook_info; 492 sector_t len; 493 494 /* 495 * writethrough fields. These MUST remain at the end of this 496 * structure and the 'cache' member must be the first as it 497 * is used to determine the offset of the writethrough fields. 
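	 *
	 * (get_per_bio_data_size() below returns either the full structure
	 * size or just the size up to 'cache', depending on whether the
	 * cache is running in writethrough mode.)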
498 */ 499 struct cache *cache; 500 dm_cblock_t cblock; 501 struct dm_bio_details bio_details; 502 }; 503 504 struct dm_cache_migration { 505 struct continuation k; 506 struct cache *cache; 507 508 struct policy_work *op; 509 struct bio *overwrite_bio; 510 struct dm_bio_prison_cell_v2 *cell; 511 512 dm_cblock_t invalidate_cblock; 513 dm_oblock_t invalidate_oblock; 514 }; 515 516 /*----------------------------------------------------------------*/ 517 518 static bool writethrough_mode(struct cache_features *f) 519 { 520 return f->io_mode == CM_IO_WRITETHROUGH; 521 } 522 523 static bool writeback_mode(struct cache_features *f) 524 { 525 return f->io_mode == CM_IO_WRITEBACK; 526 } 527 528 static inline bool passthrough_mode(struct cache_features *f) 529 { 530 return unlikely(f->io_mode == CM_IO_PASSTHROUGH); 531 } 532 533 /*----------------------------------------------------------------*/ 534 535 static void wake_deferred_bio_worker(struct cache *cache) 536 { 537 queue_work(cache->wq, &cache->deferred_bio_worker); 538 } 539 540 static void wake_deferred_writethrough_worker(struct cache *cache) 541 { 542 queue_work(cache->wq, &cache->deferred_writethrough_worker); 543 } 544 545 static void wake_migration_worker(struct cache *cache) 546 { 547 if (passthrough_mode(&cache->features)) 548 return; 549 550 queue_work(cache->wq, &cache->migration_worker); 551 } 552 553 /*----------------------------------------------------------------*/ 554 555 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) 556 { 557 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); 558 } 559 560 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) 561 { 562 dm_bio_prison_free_cell_v2(cache->prison, cell); 563 } 564 565 static struct dm_cache_migration *alloc_migration(struct cache *cache) 566 { 567 struct dm_cache_migration *mg; 568 569 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 570 if (mg) { 571 mg->cache = cache; 572 atomic_inc(&mg->cache->nr_allocated_migrations); 573 } 574 575 return mg; 576 } 577 578 static void free_migration(struct dm_cache_migration *mg) 579 { 580 struct cache *cache = mg->cache; 581 582 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 583 wake_up(&cache->migration_wait); 584 585 mempool_free(mg, cache->migration_pool); 586 } 587 588 /*----------------------------------------------------------------*/ 589 590 static inline dm_oblock_t oblock_succ(dm_oblock_t b) 591 { 592 return to_oblock(from_oblock(b) + 1ull); 593 } 594 595 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) 596 { 597 key->virtual = 0; 598 key->dev = 0; 599 key->block_begin = from_oblock(begin); 600 key->block_end = from_oblock(end); 601 } 602 603 /* 604 * We have two lock levels. Level 0, which is used to prevent WRITEs, and 605 * level 1 which prevents *both* READs and WRITEs. 606 */ 607 #define WRITE_LOCK_LEVEL 0 608 #define READ_WRITE_LOCK_LEVEL 1 609 610 static unsigned lock_level(struct bio *bio) 611 { 612 return bio_data_dir(bio) == WRITE ? 613 WRITE_LOCK_LEVEL : 614 READ_WRITE_LOCK_LEVEL; 615 } 616 617 /*---------------------------------------------------------------- 618 * Per bio data 619 *--------------------------------------------------------------*/ 620 621 /* 622 * If using writeback, leave out struct per_bio_data's writethrough fields. 
623 */ 624 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 625 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 626 627 static size_t get_per_bio_data_size(struct cache *cache) 628 { 629 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 630 } 631 632 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 633 { 634 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 635 BUG_ON(!pb); 636 return pb; 637 } 638 639 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 640 { 641 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 642 643 pb->tick = false; 644 pb->req_nr = dm_bio_get_target_bio_nr(bio); 645 pb->cell = NULL; 646 pb->len = 0; 647 648 return pb; 649 } 650 651 /*----------------------------------------------------------------*/ 652 653 static void defer_bio(struct cache *cache, struct bio *bio) 654 { 655 unsigned long flags; 656 657 spin_lock_irqsave(&cache->lock, flags); 658 bio_list_add(&cache->deferred_bios, bio); 659 spin_unlock_irqrestore(&cache->lock, flags); 660 661 wake_deferred_bio_worker(cache); 662 } 663 664 static void defer_bios(struct cache *cache, struct bio_list *bios) 665 { 666 unsigned long flags; 667 668 spin_lock_irqsave(&cache->lock, flags); 669 bio_list_merge(&cache->deferred_bios, bios); 670 bio_list_init(bios); 671 spin_unlock_irqrestore(&cache->lock, flags); 672 673 wake_deferred_bio_worker(cache); 674 } 675 676 /*----------------------------------------------------------------*/ 677 678 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) 679 { 680 bool r; 681 size_t pb_size; 682 struct per_bio_data *pb; 683 struct dm_cell_key_v2 key; 684 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 685 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; 686 687 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ 688 if (!cell_prealloc) { 689 defer_bio(cache, bio); 690 return false; 691 } 692 693 build_key(oblock, end, &key); 694 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); 695 if (!r) { 696 /* 697 * Failed to get the lock. 698 */ 699 free_prison_cell(cache, cell_prealloc); 700 return r; 701 } 702 703 if (cell != cell_prealloc) 704 free_prison_cell(cache, cell_prealloc); 705 706 pb_size = get_per_bio_data_size(cache); 707 pb = get_per_bio_data(bio, pb_size); 708 pb->cell = cell; 709 710 return r; 711 } 712 713 /*----------------------------------------------------------------*/ 714 715 static bool is_dirty(struct cache *cache, dm_cblock_t b) 716 { 717 return test_bit(from_cblock(b), cache->dirty_bitset); 718 } 719 720 static void set_dirty(struct cache *cache, dm_cblock_t cblock) 721 { 722 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 723 atomic_inc(&cache->nr_dirty); 724 policy_set_dirty(cache->policy, cblock); 725 } 726 } 727 728 /* 729 * These two are called when setting after migrations to force the policy 730 * and dirty bitset to be in sync. 
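 * Unlike set_dirty() above, they inform the policy even when the
 * corresponding bit in the dirty bitset was already in the requested state.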
731 */ 732 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) 733 { 734 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) 735 atomic_inc(&cache->nr_dirty); 736 policy_set_dirty(cache->policy, cblock); 737 } 738 739 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) 740 { 741 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 742 if (atomic_dec_return(&cache->nr_dirty) == 0) 743 dm_table_event(cache->ti->table); 744 } 745 746 policy_clear_dirty(cache->policy, cblock); 747 } 748 749 /*----------------------------------------------------------------*/ 750 751 static bool block_size_is_power_of_two(struct cache *cache) 752 { 753 return cache->sectors_per_block_shift >= 0; 754 } 755 756 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 757 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 758 __always_inline 759 #endif 760 static dm_block_t block_div(dm_block_t b, uint32_t n) 761 { 762 do_div(b, n); 763 764 return b; 765 } 766 767 static dm_block_t oblocks_per_dblock(struct cache *cache) 768 { 769 dm_block_t oblocks = cache->discard_block_size; 770 771 if (block_size_is_power_of_two(cache)) 772 oblocks >>= cache->sectors_per_block_shift; 773 else 774 oblocks = block_div(oblocks, cache->sectors_per_block); 775 776 return oblocks; 777 } 778 779 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 780 { 781 return to_dblock(block_div(from_oblock(oblock), 782 oblocks_per_dblock(cache))); 783 } 784 785 static void set_discard(struct cache *cache, dm_dblock_t b) 786 { 787 unsigned long flags; 788 789 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 790 atomic_inc(&cache->stats.discard_count); 791 792 spin_lock_irqsave(&cache->lock, flags); 793 set_bit(from_dblock(b), cache->discard_bitset); 794 spin_unlock_irqrestore(&cache->lock, flags); 795 } 796 797 static void clear_discard(struct cache *cache, dm_dblock_t b) 798 { 799 unsigned long flags; 800 801 spin_lock_irqsave(&cache->lock, flags); 802 clear_bit(from_dblock(b), cache->discard_bitset); 803 spin_unlock_irqrestore(&cache->lock, flags); 804 } 805 806 static bool is_discarded(struct cache *cache, dm_dblock_t b) 807 { 808 int r; 809 unsigned long flags; 810 811 spin_lock_irqsave(&cache->lock, flags); 812 r = test_bit(from_dblock(b), cache->discard_bitset); 813 spin_unlock_irqrestore(&cache->lock, flags); 814 815 return r; 816 } 817 818 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 819 { 820 int r; 821 unsigned long flags; 822 823 spin_lock_irqsave(&cache->lock, flags); 824 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 825 cache->discard_bitset); 826 spin_unlock_irqrestore(&cache->lock, flags); 827 828 return r; 829 } 830 831 /*---------------------------------------------------------------- 832 * Remapping 833 *--------------------------------------------------------------*/ 834 static void remap_to_origin(struct cache *cache, struct bio *bio) 835 { 836 bio->bi_bdev = cache->origin_dev->bdev; 837 } 838 839 static void remap_to_cache(struct cache *cache, struct bio *bio, 840 dm_cblock_t cblock) 841 { 842 sector_t bi_sector = bio->bi_iter.bi_sector; 843 sector_t block = from_cblock(cblock); 844 845 bio->bi_bdev = cache->cache_dev->bdev; 846 if (!block_size_is_power_of_two(cache)) 847 bio->bi_iter.bi_sector = 848 (block * cache->sectors_per_block) + 849 sector_div(bi_sector, cache->sectors_per_block); 850 else 851 bio->bi_iter.bi_sector = 852 (block << 
cache->sectors_per_block_shift) | 853 (bi_sector & (cache->sectors_per_block - 1)); 854 } 855 856 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 857 { 858 unsigned long flags; 859 size_t pb_data_size = get_per_bio_data_size(cache); 860 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 861 862 spin_lock_irqsave(&cache->lock, flags); 863 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 864 bio_op(bio) != REQ_OP_DISCARD) { 865 pb->tick = true; 866 cache->need_tick_bio = false; 867 } 868 spin_unlock_irqrestore(&cache->lock, flags); 869 } 870 871 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 872 dm_oblock_t oblock) 873 { 874 // FIXME: this is called way too much. 875 check_if_tick_bio_needed(cache, bio); 876 remap_to_origin(cache, bio); 877 if (bio_data_dir(bio) == WRITE) 878 clear_discard(cache, oblock_to_dblock(cache, oblock)); 879 } 880 881 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 882 dm_oblock_t oblock, dm_cblock_t cblock) 883 { 884 check_if_tick_bio_needed(cache, bio); 885 remap_to_cache(cache, bio, cblock); 886 if (bio_data_dir(bio) == WRITE) { 887 set_dirty(cache, cblock); 888 clear_discard(cache, oblock_to_dblock(cache, oblock)); 889 } 890 } 891 892 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 893 { 894 sector_t block_nr = bio->bi_iter.bi_sector; 895 896 if (!block_size_is_power_of_two(cache)) 897 (void) sector_div(block_nr, cache->sectors_per_block); 898 else 899 block_nr >>= cache->sectors_per_block_shift; 900 901 return to_oblock(block_nr); 902 } 903 904 static bool accountable_bio(struct cache *cache, struct bio *bio) 905 { 906 return bio_op(bio) != REQ_OP_DISCARD; 907 } 908 909 static void accounted_begin(struct cache *cache, struct bio *bio) 910 { 911 size_t pb_data_size = get_per_bio_data_size(cache); 912 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 913 914 if (accountable_bio(cache, bio)) { 915 pb->len = bio_sectors(bio); 916 iot_io_begin(&cache->tracker, pb->len); 917 } 918 } 919 920 static void accounted_complete(struct cache *cache, struct bio *bio) 921 { 922 size_t pb_data_size = get_per_bio_data_size(cache); 923 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 924 925 iot_io_end(&cache->tracker, pb->len); 926 } 927 928 static void accounted_request(struct cache *cache, struct bio *bio) 929 { 930 accounted_begin(cache, bio); 931 generic_make_request(bio); 932 } 933 934 static void issue_op(struct bio *bio, void *context) 935 { 936 struct cache *cache = context; 937 accounted_request(cache, bio); 938 } 939 940 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 941 { 942 unsigned long flags; 943 944 spin_lock_irqsave(&cache->lock, flags); 945 bio_list_add(&cache->deferred_writethrough_bios, bio); 946 spin_unlock_irqrestore(&cache->lock, flags); 947 948 wake_deferred_writethrough_worker(cache); 949 } 950 951 static void writethrough_endio(struct bio *bio) 952 { 953 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 954 955 dm_unhook_bio(&pb->hook_info, bio); 956 957 if (bio->bi_status) { 958 bio_endio(bio); 959 return; 960 } 961 962 dm_bio_restore(&pb->bio_details, bio); 963 remap_to_cache(pb->cache, bio, pb->cblock); 964 965 /* 966 * We can't issue this bio directly, since we're in interrupt 967 * context. So it gets put on a bio list for processing by the 968 * worker thread. 
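	 * (That list is cache->deferred_writethrough_bios; the bios are
	 * reissued by process_deferred_writethrough_bios() from the
	 * workqueue.)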
969 */ 970 defer_writethrough_bio(pb->cache, bio); 971 } 972 973 /* 974 * FIXME: send in parallel, huge latency as is. 975 * When running in writethrough mode we need to send writes to clean blocks 976 * to both the cache and origin devices. In future we'd like to clone the 977 * bio and send them in parallel, but for now we're doing them in 978 * series as this is easier. 979 */ 980 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 981 dm_oblock_t oblock, dm_cblock_t cblock) 982 { 983 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 984 985 pb->cache = cache; 986 pb->cblock = cblock; 987 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); 988 dm_bio_record(&pb->bio_details, bio); 989 990 remap_to_origin_clear_discard(pb->cache, bio, oblock); 991 } 992 993 /*---------------------------------------------------------------- 994 * Failure modes 995 *--------------------------------------------------------------*/ 996 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 997 { 998 return cache->features.mode; 999 } 1000 1001 static const char *cache_device_name(struct cache *cache) 1002 { 1003 return dm_device_name(dm_table_get_md(cache->ti->table)); 1004 } 1005 1006 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 1007 { 1008 const char *descs[] = { 1009 "write", 1010 "read-only", 1011 "fail" 1012 }; 1013 1014 dm_table_event(cache->ti->table); 1015 DMINFO("%s: switching cache to %s mode", 1016 cache_device_name(cache), descs[(int)mode]); 1017 } 1018 1019 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 1020 { 1021 bool needs_check; 1022 enum cache_metadata_mode old_mode = get_cache_mode(cache); 1023 1024 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 1025 DMERR("%s: unable to read needs_check flag, setting failure mode.", 1026 cache_device_name(cache)); 1027 new_mode = CM_FAIL; 1028 } 1029 1030 if (new_mode == CM_WRITE && needs_check) { 1031 DMERR("%s: unable to switch cache to write mode until repaired.", 1032 cache_device_name(cache)); 1033 if (old_mode != new_mode) 1034 new_mode = old_mode; 1035 else 1036 new_mode = CM_READ_ONLY; 1037 } 1038 1039 /* Never move out of fail mode */ 1040 if (old_mode == CM_FAIL) 1041 new_mode = CM_FAIL; 1042 1043 switch (new_mode) { 1044 case CM_FAIL: 1045 case CM_READ_ONLY: 1046 dm_cache_metadata_set_read_only(cache->cmd); 1047 break; 1048 1049 case CM_WRITE: 1050 dm_cache_metadata_set_read_write(cache->cmd); 1051 break; 1052 } 1053 1054 cache->features.mode = new_mode; 1055 1056 if (new_mode != old_mode) 1057 notify_mode_switch(cache, new_mode); 1058 } 1059 1060 static void abort_transaction(struct cache *cache) 1061 { 1062 const char *dev_name = cache_device_name(cache); 1063 1064 if (get_cache_mode(cache) >= CM_READ_ONLY) 1065 return; 1066 1067 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1068 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1069 set_cache_mode(cache, CM_FAIL); 1070 } 1071 1072 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1073 if (dm_cache_metadata_abort(cache->cmd)) { 1074 DMERR("%s: failed to abort metadata transaction", dev_name); 1075 set_cache_mode(cache, CM_FAIL); 1076 } 1077 } 1078 1079 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1080 { 1081 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1082 cache_device_name(cache), op, r); 1083 abort_transaction(cache); 1084 set_cache_mode(cache, 
CM_READ_ONLY); 1085 } 1086 1087 /*----------------------------------------------------------------*/ 1088 1089 static void load_stats(struct cache *cache) 1090 { 1091 struct dm_cache_statistics stats; 1092 1093 dm_cache_metadata_get_stats(cache->cmd, &stats); 1094 atomic_set(&cache->stats.read_hit, stats.read_hits); 1095 atomic_set(&cache->stats.read_miss, stats.read_misses); 1096 atomic_set(&cache->stats.write_hit, stats.write_hits); 1097 atomic_set(&cache->stats.write_miss, stats.write_misses); 1098 } 1099 1100 static void save_stats(struct cache *cache) 1101 { 1102 struct dm_cache_statistics stats; 1103 1104 if (get_cache_mode(cache) >= CM_READ_ONLY) 1105 return; 1106 1107 stats.read_hits = atomic_read(&cache->stats.read_hit); 1108 stats.read_misses = atomic_read(&cache->stats.read_miss); 1109 stats.write_hits = atomic_read(&cache->stats.write_hit); 1110 stats.write_misses = atomic_read(&cache->stats.write_miss); 1111 1112 dm_cache_metadata_set_stats(cache->cmd, &stats); 1113 } 1114 1115 static void update_stats(struct cache_stats *stats, enum policy_operation op) 1116 { 1117 switch (op) { 1118 case POLICY_PROMOTE: 1119 atomic_inc(&stats->promotion); 1120 break; 1121 1122 case POLICY_DEMOTE: 1123 atomic_inc(&stats->demotion); 1124 break; 1125 1126 case POLICY_WRITEBACK: 1127 atomic_inc(&stats->writeback); 1128 break; 1129 } 1130 } 1131 1132 /*---------------------------------------------------------------- 1133 * Migration processing 1134 * 1135 * Migration covers moving data from the origin device to the cache, or 1136 * vice versa. 1137 *--------------------------------------------------------------*/ 1138 1139 static void inc_io_migrations(struct cache *cache) 1140 { 1141 atomic_inc(&cache->nr_io_migrations); 1142 } 1143 1144 static void dec_io_migrations(struct cache *cache) 1145 { 1146 atomic_dec(&cache->nr_io_migrations); 1147 } 1148 1149 static bool discard_or_flush(struct bio *bio) 1150 { 1151 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1152 } 1153 1154 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1155 dm_dblock_t *b, dm_dblock_t *e) 1156 { 1157 sector_t sb = bio->bi_iter.bi_sector; 1158 sector_t se = bio_end_sector(bio); 1159 1160 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1161 1162 if (se - sb < cache->discard_block_size) 1163 *e = *b; 1164 else 1165 *e = to_dblock(block_div(se, cache->discard_block_size)); 1166 } 1167 1168 /*----------------------------------------------------------------*/ 1169 1170 static void prevent_background_work(struct cache *cache) 1171 { 1172 lockdep_off(); 1173 down_write(&cache->background_work_lock); 1174 lockdep_on(); 1175 } 1176 1177 static void allow_background_work(struct cache *cache) 1178 { 1179 lockdep_off(); 1180 up_write(&cache->background_work_lock); 1181 lockdep_on(); 1182 } 1183 1184 static bool background_work_begin(struct cache *cache) 1185 { 1186 bool r; 1187 1188 lockdep_off(); 1189 r = down_read_trylock(&cache->background_work_lock); 1190 lockdep_on(); 1191 1192 return r; 1193 } 1194 1195 static void background_work_end(struct cache *cache) 1196 { 1197 lockdep_off(); 1198 up_read(&cache->background_work_lock); 1199 lockdep_on(); 1200 } 1201 1202 /*----------------------------------------------------------------*/ 1203 1204 static void quiesce(struct dm_cache_migration *mg, 1205 void (*continuation)(struct work_struct *)) 1206 { 1207 init_continuation(&mg->k, continuation); 1208 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); 1209 } 1210 1211 
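/*
 * A continuation's work_struct is embedded in struct dm_cache_migration
 * (member 'k'), so the owning migration can be recovered from a
 * work_struct pointer with two container_of() steps, as ws_to_mg()
 * below does.
 */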
static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
{
	struct continuation *k = container_of(ws, struct continuation, ws);
	return container_of(k, struct dm_cache_migration, k);
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);

	if (read_err || write_err)
		mg->k.input = BLK_STS_IOERR;

	queue_continuation(mg->cache->wq, &mg->k);
}

static int copy(struct dm_cache_migration *mg, bool promote)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;

	o_region.bdev = cache->origin_dev->bdev;
	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (promote)
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
	else
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);

	return r;
}

static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
		free_prison_cell(cache, pb->cell);
	pb->cell = NULL;
}

static void overwrite_endio(struct bio *bio)
{
	struct dm_cache_migration *mg = bio->bi_private;
	struct cache *cache = mg->cache;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	dm_unhook_bio(&pb->hook_info, bio);

	if (bio->bi_status)
		mg->k.input = bio->bi_status;

	queue_continuation(mg->cache->wq, &mg->k);
}

static void overwrite(struct dm_cache_migration *mg,
		      void (*continuation)(struct work_struct *))
{
	struct bio *bio = mg->overwrite_bio;
	size_t pb_data_size = get_per_bio_data_size(mg->cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);

	/*
	 * The overwrite bio is part of the copy operation, as such it does
	 * not set/clear discard or dirty flags.
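	 * That bookkeeping is done later in mg_complete(), once the
	 * migration as a whole has succeeded or failed.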
1286 */ 1287 if (mg->op->op == POLICY_PROMOTE) 1288 remap_to_cache(mg->cache, bio, mg->op->cblock); 1289 else 1290 remap_to_origin(mg->cache, bio); 1291 1292 init_continuation(&mg->k, continuation); 1293 accounted_request(mg->cache, bio); 1294 } 1295 1296 /* 1297 * Migration steps: 1298 * 1299 * 1) exclusive lock preventing WRITEs 1300 * 2) quiesce 1301 * 3) copy or issue overwrite bio 1302 * 4) upgrade to exclusive lock preventing READs and WRITEs 1303 * 5) quiesce 1304 * 6) update metadata and commit 1305 * 7) unlock 1306 */ 1307 static void mg_complete(struct dm_cache_migration *mg, bool success) 1308 { 1309 struct bio_list bios; 1310 struct cache *cache = mg->cache; 1311 struct policy_work *op = mg->op; 1312 dm_cblock_t cblock = op->cblock; 1313 1314 if (success) 1315 update_stats(&cache->stats, op->op); 1316 1317 switch (op->op) { 1318 case POLICY_PROMOTE: 1319 clear_discard(cache, oblock_to_dblock(cache, op->oblock)); 1320 policy_complete_background_work(cache->policy, op, success); 1321 1322 if (mg->overwrite_bio) { 1323 if (success) 1324 force_set_dirty(cache, cblock); 1325 else if (mg->k.input) 1326 mg->overwrite_bio->bi_status = mg->k.input; 1327 else 1328 mg->overwrite_bio->bi_status = BLK_STS_IOERR; 1329 bio_endio(mg->overwrite_bio); 1330 } else { 1331 if (success) 1332 force_clear_dirty(cache, cblock); 1333 dec_io_migrations(cache); 1334 } 1335 break; 1336 1337 case POLICY_DEMOTE: 1338 /* 1339 * We clear dirty here to update the nr_dirty counter. 1340 */ 1341 if (success) 1342 force_clear_dirty(cache, cblock); 1343 policy_complete_background_work(cache->policy, op, success); 1344 dec_io_migrations(cache); 1345 break; 1346 1347 case POLICY_WRITEBACK: 1348 if (success) 1349 force_clear_dirty(cache, cblock); 1350 policy_complete_background_work(cache->policy, op, success); 1351 dec_io_migrations(cache); 1352 break; 1353 } 1354 1355 bio_list_init(&bios); 1356 if (mg->cell) { 1357 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1358 free_prison_cell(cache, mg->cell); 1359 } 1360 1361 free_migration(mg); 1362 defer_bios(cache, &bios); 1363 wake_migration_worker(cache); 1364 1365 background_work_end(cache); 1366 } 1367 1368 static void mg_success(struct work_struct *ws) 1369 { 1370 struct dm_cache_migration *mg = ws_to_mg(ws); 1371 mg_complete(mg, mg->k.input == 0); 1372 } 1373 1374 static void mg_update_metadata(struct work_struct *ws) 1375 { 1376 int r; 1377 struct dm_cache_migration *mg = ws_to_mg(ws); 1378 struct cache *cache = mg->cache; 1379 struct policy_work *op = mg->op; 1380 1381 switch (op->op) { 1382 case POLICY_PROMOTE: 1383 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); 1384 if (r) { 1385 DMERR_LIMIT("%s: migration failed; couldn't insert mapping", 1386 cache_device_name(cache)); 1387 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1388 1389 mg_complete(mg, false); 1390 return; 1391 } 1392 mg_complete(mg, true); 1393 break; 1394 1395 case POLICY_DEMOTE: 1396 r = dm_cache_remove_mapping(cache->cmd, op->cblock); 1397 if (r) { 1398 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", 1399 cache_device_name(cache)); 1400 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1401 1402 mg_complete(mg, false); 1403 return; 1404 } 1405 1406 /* 1407 * It would be nice if we only had to commit when a REQ_FLUSH 1408 * comes through. 
But there's one scenario that we have to
	 * look out for:
	 *
	 * - vblock x in a cache block
	 * - demotion occurs
	 * - cache block gets reallocated and overwritten
	 * - crash
	 *
	 * When we recover, because there was no commit the cache will
	 * roll back to having the data for vblock x in the cache block.
	 * But the cache block has since been overwritten, so it'll end
	 * up pointing to data that was never in 'x' during the history
	 * of the device.
	 *
	 * To avoid this issue we require a commit as part of the
	 * demotion operation.
	 */
		init_continuation(&mg->k, mg_success);
		continue_after_commit(&cache->committer, &mg->k);
		schedule_commit(&cache->committer);
		break;

	case POLICY_WRITEBACK:
		mg_complete(mg, true);
		break;
	}
}

static void mg_update_metadata_after_copy(struct work_struct *ws)
{
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);
	else
		mg_update_metadata(ws);
}

static void mg_upgrade_lock(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	/*
	 * Did the copy succeed?
	 */
	if (mg->k.input)
		mg_complete(mg, false);

	else {
		/*
		 * Now we want the lock to prevent both reads and writes.
		 */
		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
					    READ_WRITE_LOCK_LEVEL);
		if (r < 0)
			mg_complete(mg, false);

		else if (r)
			quiesce(mg, mg_update_metadata);

		else
			mg_update_metadata(ws);
	}
}

static void mg_copy(struct work_struct *ws)
{
	int r;
	struct dm_cache_migration *mg = ws_to_mg(ws);

	if (mg->overwrite_bio) {
		/*
		 * It's safe to do this here, even though it's new data
		 * because all IO has been locked out of the block.
		 *
		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
		 * so _not_ using mg_upgrade_lock() as continuation.
		 */
		overwrite(mg, mg_update_metadata_after_copy);

	} else {
		struct cache *cache = mg->cache;
		struct policy_work *op = mg->op;
		bool is_policy_promote = (op->op == POLICY_PROMOTE);

		if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
		    is_discarded_oblock(cache, op->oblock)) {
			mg_upgrade_lock(ws);
			return;
		}

		init_continuation(&mg->k, mg_upgrade_lock);

		r = copy(mg, is_policy_promote);
		if (r) {
			DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
			mg->k.input = BLK_STS_IOERR;
			mg_complete(mg, false);
		}
	}
}

static int mg_lock_writes(struct dm_cache_migration *mg)
{
	int r;
	struct dm_cell_key_v2 key;
	struct cache *cache = mg->cache;
	struct dm_bio_prison_cell_v2 *prealloc;

	prealloc = alloc_prison_cell(cache);
	if (!prealloc) {
		DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
		mg_complete(mg, false);
		return -ENOMEM;
	}

	/*
	 * Prevent writes to the block, but allow reads to continue.
	 * Unless we're using an overwrite bio, in which case we lock
	 * everything.
	 */
	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
	r = dm_cell_lock_v2(cache->prison, &key,
			    mg->overwrite_bio ?
READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, 1536 prealloc, &mg->cell); 1537 if (r < 0) { 1538 free_prison_cell(cache, prealloc); 1539 mg_complete(mg, false); 1540 return r; 1541 } 1542 1543 if (mg->cell != prealloc) 1544 free_prison_cell(cache, prealloc); 1545 1546 if (r == 0) 1547 mg_copy(&mg->k.ws); 1548 else 1549 quiesce(mg, mg_copy); 1550 1551 return 0; 1552 } 1553 1554 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) 1555 { 1556 struct dm_cache_migration *mg; 1557 1558 if (!background_work_begin(cache)) { 1559 policy_complete_background_work(cache->policy, op, false); 1560 return -EPERM; 1561 } 1562 1563 mg = alloc_migration(cache); 1564 if (!mg) { 1565 policy_complete_background_work(cache->policy, op, false); 1566 background_work_end(cache); 1567 return -ENOMEM; 1568 } 1569 1570 memset(mg, 0, sizeof(*mg)); 1571 1572 mg->cache = cache; 1573 mg->op = op; 1574 mg->overwrite_bio = bio; 1575 1576 if (!bio) 1577 inc_io_migrations(cache); 1578 1579 return mg_lock_writes(mg); 1580 } 1581 1582 /*---------------------------------------------------------------- 1583 * invalidation processing 1584 *--------------------------------------------------------------*/ 1585 1586 static void invalidate_complete(struct dm_cache_migration *mg, bool success) 1587 { 1588 struct bio_list bios; 1589 struct cache *cache = mg->cache; 1590 1591 bio_list_init(&bios); 1592 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) 1593 free_prison_cell(cache, mg->cell); 1594 1595 if (!success && mg->overwrite_bio) 1596 bio_io_error(mg->overwrite_bio); 1597 1598 free_migration(mg); 1599 defer_bios(cache, &bios); 1600 1601 background_work_end(cache); 1602 } 1603 1604 static void invalidate_completed(struct work_struct *ws) 1605 { 1606 struct dm_cache_migration *mg = ws_to_mg(ws); 1607 invalidate_complete(mg, !mg->k.input); 1608 } 1609 1610 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) 1611 { 1612 int r = policy_invalidate_mapping(cache->policy, cblock); 1613 if (!r) { 1614 r = dm_cache_remove_mapping(cache->cmd, cblock); 1615 if (r) { 1616 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", 1617 cache_device_name(cache)); 1618 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1619 } 1620 1621 } else if (r == -ENODATA) { 1622 /* 1623 * Harmless, already unmapped. 
1624 */ 1625 r = 0; 1626 1627 } else 1628 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); 1629 1630 return r; 1631 } 1632 1633 static void invalidate_remove(struct work_struct *ws) 1634 { 1635 int r; 1636 struct dm_cache_migration *mg = ws_to_mg(ws); 1637 struct cache *cache = mg->cache; 1638 1639 r = invalidate_cblock(cache, mg->invalidate_cblock); 1640 if (r) { 1641 invalidate_complete(mg, false); 1642 return; 1643 } 1644 1645 init_continuation(&mg->k, invalidate_completed); 1646 continue_after_commit(&cache->committer, &mg->k); 1647 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); 1648 mg->overwrite_bio = NULL; 1649 schedule_commit(&cache->committer); 1650 } 1651 1652 static int invalidate_lock(struct dm_cache_migration *mg) 1653 { 1654 int r; 1655 struct dm_cell_key_v2 key; 1656 struct cache *cache = mg->cache; 1657 struct dm_bio_prison_cell_v2 *prealloc; 1658 1659 prealloc = alloc_prison_cell(cache); 1660 if (!prealloc) { 1661 invalidate_complete(mg, false); 1662 return -ENOMEM; 1663 } 1664 1665 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); 1666 r = dm_cell_lock_v2(cache->prison, &key, 1667 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); 1668 if (r < 0) { 1669 free_prison_cell(cache, prealloc); 1670 invalidate_complete(mg, false); 1671 return r; 1672 } 1673 1674 if (mg->cell != prealloc) 1675 free_prison_cell(cache, prealloc); 1676 1677 if (r) 1678 quiesce(mg, invalidate_remove); 1679 1680 else { 1681 /* 1682 * We can't call invalidate_remove() directly here because we 1683 * might still be in request context. 1684 */ 1685 init_continuation(&mg->k, invalidate_remove); 1686 queue_work(cache->wq, &mg->k.ws); 1687 } 1688 1689 return 0; 1690 } 1691 1692 static int invalidate_start(struct cache *cache, dm_cblock_t cblock, 1693 dm_oblock_t oblock, struct bio *bio) 1694 { 1695 struct dm_cache_migration *mg; 1696 1697 if (!background_work_begin(cache)) 1698 return -EPERM; 1699 1700 mg = alloc_migration(cache); 1701 if (!mg) { 1702 background_work_end(cache); 1703 return -ENOMEM; 1704 } 1705 1706 memset(mg, 0, sizeof(*mg)); 1707 1708 mg->cache = cache; 1709 mg->overwrite_bio = bio; 1710 mg->invalidate_cblock = cblock; 1711 mg->invalidate_oblock = oblock; 1712 1713 return invalidate_lock(mg); 1714 } 1715 1716 /*---------------------------------------------------------------- 1717 * bio processing 1718 *--------------------------------------------------------------*/ 1719 1720 enum busy { 1721 IDLE, 1722 BUSY 1723 }; 1724 1725 static enum busy spare_migration_bandwidth(struct cache *cache) 1726 { 1727 bool idle = iot_idle_for(&cache->tracker, HZ); 1728 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1729 cache->sectors_per_block; 1730 1731 if (idle && current_volume <= cache->migration_threshold) 1732 return IDLE; 1733 else 1734 return BUSY; 1735 } 1736 1737 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1738 { 1739 atomic_inc(bio_data_dir(bio) == READ ? 1740 &cache->stats.read_hit : &cache->stats.write_hit); 1741 } 1742 1743 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1744 { 1745 atomic_inc(bio_data_dir(bio) == READ ? 
1746 &cache->stats.read_miss : &cache->stats.write_miss); 1747 } 1748 1749 /*----------------------------------------------------------------*/ 1750 1751 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1752 { 1753 return (bio_data_dir(bio) == WRITE) && 1754 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1755 } 1756 1757 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) 1758 { 1759 return writeback_mode(&cache->features) && 1760 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); 1761 } 1762 1763 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, 1764 bool *commit_needed) 1765 { 1766 int r, data_dir; 1767 bool rb, background_queued; 1768 dm_cblock_t cblock; 1769 size_t pb_data_size = get_per_bio_data_size(cache); 1770 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1771 1772 *commit_needed = false; 1773 1774 rb = bio_detain_shared(cache, block, bio); 1775 if (!rb) { 1776 /* 1777 * An exclusive lock is held for this block, so we have to 1778 * wait. We set the commit_needed flag so the current 1779 * transaction will be committed asap, allowing this lock 1780 * to be dropped. 1781 */ 1782 *commit_needed = true; 1783 return DM_MAPIO_SUBMITTED; 1784 } 1785 1786 data_dir = bio_data_dir(bio); 1787 1788 if (optimisable_bio(cache, bio, block)) { 1789 struct policy_work *op = NULL; 1790 1791 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); 1792 if (unlikely(r && r != -ENOENT)) { 1793 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", 1794 cache_device_name(cache), r); 1795 bio_io_error(bio); 1796 return DM_MAPIO_SUBMITTED; 1797 } 1798 1799 if (r == -ENOENT && op) { 1800 bio_drop_shared_lock(cache, bio); 1801 BUG_ON(op->op != POLICY_PROMOTE); 1802 mg_start(cache, op, bio); 1803 return DM_MAPIO_SUBMITTED; 1804 } 1805 } else { 1806 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); 1807 if (unlikely(r && r != -ENOENT)) { 1808 DMERR_LIMIT("%s: policy_lookup() failed with r = %d", 1809 cache_device_name(cache), r); 1810 bio_io_error(bio); 1811 return DM_MAPIO_SUBMITTED; 1812 } 1813 1814 if (background_queued) 1815 wake_migration_worker(cache); 1816 } 1817 1818 if (r == -ENOENT) { 1819 /* 1820 * Miss. 1821 */ 1822 inc_miss_counter(cache, bio); 1823 if (pb->req_nr == 0) { 1824 accounted_begin(cache, bio); 1825 remap_to_origin_clear_discard(cache, bio, block); 1826 1827 } else { 1828 /* 1829 * This is a duplicate writethrough io that is no 1830 * longer needed because the block has been demoted. 1831 */ 1832 bio_endio(bio); 1833 return DM_MAPIO_SUBMITTED; 1834 } 1835 } else { 1836 /* 1837 * Hit. 1838 */ 1839 inc_hit_counter(cache, bio); 1840 1841 /* 1842 * Passthrough always maps to the origin, invalidating any 1843 * cache blocks that are written to. 
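		 * (The invalidation is kicked off below via invalidate_start(),
		 * which removes the mapping from the policy and from the
		 * on-disk metadata.)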
1844 */ 1845 if (passthrough_mode(&cache->features)) { 1846 if (bio_data_dir(bio) == WRITE) { 1847 bio_drop_shared_lock(cache, bio); 1848 atomic_inc(&cache->stats.demotion); 1849 invalidate_start(cache, cblock, block, bio); 1850 } else 1851 remap_to_origin_clear_discard(cache, bio, block); 1852 1853 } else { 1854 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 1855 !is_dirty(cache, cblock)) { 1856 remap_to_origin_then_cache(cache, bio, block, cblock); 1857 accounted_begin(cache, bio); 1858 } else 1859 remap_to_cache_dirty(cache, bio, block, cblock); 1860 } 1861 } 1862 1863 /* 1864 * dm core turns FUA requests into a separate payload and FLUSH req. 1865 */ 1866 if (bio->bi_opf & REQ_FUA) { 1867 /* 1868 * issue_after_commit will call accounted_begin a second time. So 1869 * we call accounted_complete() to avoid double accounting. 1870 */ 1871 accounted_complete(cache, bio); 1872 issue_after_commit(&cache->committer, bio); 1873 *commit_needed = true; 1874 return DM_MAPIO_SUBMITTED; 1875 } 1876 1877 return DM_MAPIO_REMAPPED; 1878 } 1879 1880 static bool process_bio(struct cache *cache, struct bio *bio) 1881 { 1882 bool commit_needed; 1883 1884 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) 1885 generic_make_request(bio); 1886 1887 return commit_needed; 1888 } 1889 1890 /* 1891 * A non-zero return indicates read_only or fail_io mode. 1892 */ 1893 static int commit(struct cache *cache, bool clean_shutdown) 1894 { 1895 int r; 1896 1897 if (get_cache_mode(cache) >= CM_READ_ONLY) 1898 return -EINVAL; 1899 1900 atomic_inc(&cache->stats.commit_count); 1901 r = dm_cache_commit(cache->cmd, clean_shutdown); 1902 if (r) 1903 metadata_operation_failed(cache, "dm_cache_commit", r); 1904 1905 return r; 1906 } 1907 1908 /* 1909 * Used by the batcher. 1910 */ 1911 static blk_status_t commit_op(void *context) 1912 { 1913 struct cache *cache = context; 1914 1915 if (dm_cache_changed_this_transaction(cache->cmd)) 1916 return errno_to_blk_status(commit(cache, false)); 1917 1918 return 0; 1919 } 1920 1921 /*----------------------------------------------------------------*/ 1922 1923 static bool process_flush_bio(struct cache *cache, struct bio *bio) 1924 { 1925 size_t pb_data_size = get_per_bio_data_size(cache); 1926 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1927 1928 if (!pb->req_nr) 1929 remap_to_origin(cache, bio); 1930 else 1931 remap_to_cache(cache, bio, 0); 1932 1933 issue_after_commit(&cache->committer, bio); 1934 return true; 1935 } 1936 1937 static bool process_discard_bio(struct cache *cache, struct bio *bio) 1938 { 1939 dm_dblock_t b, e; 1940 1941 // FIXME: do we need to lock the region? Or can we just assume the 1942 // user wont be so foolish as to issue discard concurrently with 1943 // other IO? 
1944 calc_discard_block_range(cache, bio, &b, &e); 1945 while (b != e) { 1946 set_discard(cache, b); 1947 b = to_dblock(from_dblock(b) + 1); 1948 } 1949 1950 bio_endio(bio); 1951 1952 return false; 1953 } 1954 1955 static void process_deferred_bios(struct work_struct *ws) 1956 { 1957 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); 1958 1959 unsigned long flags; 1960 bool commit_needed = false; 1961 struct bio_list bios; 1962 struct bio *bio; 1963 1964 bio_list_init(&bios); 1965 1966 spin_lock_irqsave(&cache->lock, flags); 1967 bio_list_merge(&bios, &cache->deferred_bios); 1968 bio_list_init(&cache->deferred_bios); 1969 spin_unlock_irqrestore(&cache->lock, flags); 1970 1971 while ((bio = bio_list_pop(&bios))) { 1972 if (bio->bi_opf & REQ_PREFLUSH) 1973 commit_needed = process_flush_bio(cache, bio) || commit_needed; 1974 1975 else if (bio_op(bio) == REQ_OP_DISCARD) 1976 commit_needed = process_discard_bio(cache, bio) || commit_needed; 1977 1978 else 1979 commit_needed = process_bio(cache, bio) || commit_needed; 1980 } 1981 1982 if (commit_needed) 1983 schedule_commit(&cache->committer); 1984 } 1985 1986 static void process_deferred_writethrough_bios(struct work_struct *ws) 1987 { 1988 struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); 1989 1990 unsigned long flags; 1991 struct bio_list bios; 1992 struct bio *bio; 1993 1994 bio_list_init(&bios); 1995 1996 spin_lock_irqsave(&cache->lock, flags); 1997 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 1998 bio_list_init(&cache->deferred_writethrough_bios); 1999 spin_unlock_irqrestore(&cache->lock, flags); 2000 2001 /* 2002 * These bios have already been through accounted_begin() 2003 */ 2004 while ((bio = bio_list_pop(&bios))) 2005 generic_make_request(bio); 2006 } 2007 2008 /*---------------------------------------------------------------- 2009 * Main worker loop 2010 *--------------------------------------------------------------*/ 2011 2012 static void requeue_deferred_bios(struct cache *cache) 2013 { 2014 struct bio *bio; 2015 struct bio_list bios; 2016 2017 bio_list_init(&bios); 2018 bio_list_merge(&bios, &cache->deferred_bios); 2019 bio_list_init(&cache->deferred_bios); 2020 2021 while ((bio = bio_list_pop(&bios))) { 2022 bio->bi_status = BLK_STS_DM_REQUEUE; 2023 bio_endio(bio); 2024 } 2025 } 2026 2027 /* 2028 * We want to commit periodically so that not too much 2029 * unwritten metadata builds up. 
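 * do_waker() below does this: every COMMIT_PERIOD (HZ, i.e. roughly once
 * a second) it ticks the policy, wakes the migration worker, schedules a
 * commit and re-queues itself.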
2030 */ 2031 static void do_waker(struct work_struct *ws) 2032 { 2033 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2034 2035 policy_tick(cache->policy, true); 2036 wake_migration_worker(cache); 2037 schedule_commit(&cache->committer); 2038 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2039 } 2040 2041 static void check_migrations(struct work_struct *ws) 2042 { 2043 int r; 2044 struct policy_work *op; 2045 struct cache *cache = container_of(ws, struct cache, migration_worker); 2046 enum busy b; 2047 2048 for (;;) { 2049 b = spare_migration_bandwidth(cache); 2050 2051 r = policy_get_background_work(cache->policy, b == IDLE, &op); 2052 if (r == -ENODATA) 2053 break; 2054 2055 if (r) { 2056 DMERR_LIMIT("%s: policy_background_work failed", 2057 cache_device_name(cache)); 2058 break; 2059 } 2060 2061 r = mg_start(cache, op, NULL); 2062 if (r) 2063 break; 2064 } 2065 } 2066 2067 /*---------------------------------------------------------------- 2068 * Target methods 2069 *--------------------------------------------------------------*/ 2070 2071 /* 2072 * This function gets called on the error paths of the constructor, so we 2073 * have to cope with a partially initialised struct. 2074 */ 2075 static void destroy(struct cache *cache) 2076 { 2077 unsigned i; 2078 2079 mempool_destroy(cache->migration_pool); 2080 2081 if (cache->prison) 2082 dm_bio_prison_destroy_v2(cache->prison); 2083 2084 if (cache->wq) 2085 destroy_workqueue(cache->wq); 2086 2087 if (cache->dirty_bitset) 2088 free_bitset(cache->dirty_bitset); 2089 2090 if (cache->discard_bitset) 2091 free_bitset(cache->discard_bitset); 2092 2093 if (cache->copier) 2094 dm_kcopyd_client_destroy(cache->copier); 2095 2096 if (cache->cmd) 2097 dm_cache_metadata_close(cache->cmd); 2098 2099 if (cache->metadata_dev) 2100 dm_put_device(cache->ti, cache->metadata_dev); 2101 2102 if (cache->origin_dev) 2103 dm_put_device(cache->ti, cache->origin_dev); 2104 2105 if (cache->cache_dev) 2106 dm_put_device(cache->ti, cache->cache_dev); 2107 2108 if (cache->policy) 2109 dm_cache_policy_destroy(cache->policy); 2110 2111 for (i = 0; i < cache->nr_ctr_args ; i++) 2112 kfree(cache->ctr_args[i]); 2113 kfree(cache->ctr_args); 2114 2115 kfree(cache); 2116 } 2117 2118 static void cache_dtr(struct dm_target *ti) 2119 { 2120 struct cache *cache = ti->private; 2121 2122 destroy(cache); 2123 } 2124 2125 static sector_t get_dev_size(struct dm_dev *dev) 2126 { 2127 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2128 } 2129 2130 /*----------------------------------------------------------------*/ 2131 2132 /* 2133 * Construct a cache device mapping. 2134 * 2135 * cache <metadata dev> <cache dev> <origin dev> <block size> 2136 * <#feature args> [<feature arg>]* 2137 * <policy> <#policy args> [<policy arg>]* 2138 * 2139 * metadata dev : fast device holding the persistent metadata 2140 * cache dev : fast device holding cached data blocks 2141 * origin dev : slow device holding original data blocks 2142 * block size : cache unit size in sectors 2143 * 2144 * #feature args : number of feature arguments passed 2145 * feature args : writethrough. (The default is writeback.) 2146 * 2147 * policy : the replacement policy to use 2148 * #policy args : an even number of policy arguments corresponding 2149 * to key/value pairs passed to the policy 2150 * policy args : key/value pairs passed to the policy 2151 * E.g. 'sequential_threshold 1024' 2152 * See cache-policies.txt for details. 
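 *
 * An example target line, using hypothetical device names, a 512-sector
 * (256KB) block size, the writethrough feature and the default policy
 * with no policy arguments, would be:
 *
 *   cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow 512 1 writethrough default 0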
 *
 * Optional feature arguments are:
 *	writethrough : write through caching that prohibits cache block
 *		content from being different from origin block content.
 *		Without this argument, the default behaviour is to write
 *		back cache block contents later for performance reasons,
 *		so they may differ from the corresponding origin blocks.
 *	writeback : explicitly select the default write-back behaviour
 *		described above.
 *	passthrough : do not use the cache contents; all io is passed
 *		straight to the origin device.  Every cache block must be
 *		clean before this mode can be entered, and it is the only
 *		mode in which invalidate_cblocks messages are accepted.
 *	metadata2 : use version 2 of the on-disk metadata format.
 */
struct cache_args {
	struct dm_target *ti;

	struct dm_dev *metadata_dev;

	struct dm_dev *cache_dev;
	sector_t cache_sectors;

	struct dm_dev *origin_dev;
	sector_t origin_sectors;

	uint32_t block_size;

	const char *policy_name;
	int policy_argc;
	const char **policy_argv;

	struct cache_features features;
};

static void destroy_cache_args(struct cache_args *ca)
{
	if (ca->metadata_dev)
		dm_put_device(ca->ti, ca->metadata_dev);

	if (ca->cache_dev)
		dm_put_device(ca->ti, ca->cache_dev);

	if (ca->origin_dev)
		dm_put_device(ca->ti, ca->origin_dev);

	kfree(ca);
}

static bool at_least_one_arg(struct dm_arg_set *as, char **error)
{
	if (!as->argc) {
		*error = "Insufficient args";
		return false;
	}

	return true;
}

static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
			      char **error)
{
	int r;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->metadata_dev);
	if (r) {
		*error = "Error opening metadata device";
		return r;
	}

	metadata_dev_size = get_dev_size(ca->metadata_dev);
	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(ca->metadata_dev->bdev, b),
		       DM_CACHE_METADATA_MAX_SECTORS_WARNING);

	return 0;
}

static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
			   char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->cache_dev);
	if (r) {
		*error = "Error opening cache device";
		return r;
	}
	ca->cache_sectors = get_dev_size(ca->cache_dev);

	return 0;
}

static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->origin_dev);
	if (r) {
		*error = "Error opening origin device";
		return r;
	}

	ca->origin_sectors = get_dev_size(ca->origin_dev);
	if (ca->ti->len > ca->origin_sectors) {
		*error = "Device size larger than cached device";
		return -EINVAL;
	}

	return 0;
}

static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	unsigned long block_size;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		*error = "Invalid data
block size"; 2286 return -EINVAL; 2287 } 2288 2289 if (block_size > ca->cache_sectors) { 2290 *error = "Data block size is larger than the cache device"; 2291 return -EINVAL; 2292 } 2293 2294 ca->block_size = block_size; 2295 2296 return 0; 2297 } 2298 2299 static void init_features(struct cache_features *cf) 2300 { 2301 cf->mode = CM_WRITE; 2302 cf->io_mode = CM_IO_WRITEBACK; 2303 cf->metadata_version = 1; 2304 } 2305 2306 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2307 char **error) 2308 { 2309 static struct dm_arg _args[] = { 2310 {0, 2, "Invalid number of cache feature arguments"}, 2311 }; 2312 2313 int r; 2314 unsigned argc; 2315 const char *arg; 2316 struct cache_features *cf = &ca->features; 2317 2318 init_features(cf); 2319 2320 r = dm_read_arg_group(_args, as, &argc, error); 2321 if (r) 2322 return -EINVAL; 2323 2324 while (argc--) { 2325 arg = dm_shift_arg(as); 2326 2327 if (!strcasecmp(arg, "writeback")) 2328 cf->io_mode = CM_IO_WRITEBACK; 2329 2330 else if (!strcasecmp(arg, "writethrough")) 2331 cf->io_mode = CM_IO_WRITETHROUGH; 2332 2333 else if (!strcasecmp(arg, "passthrough")) 2334 cf->io_mode = CM_IO_PASSTHROUGH; 2335 2336 else if (!strcasecmp(arg, "metadata2")) 2337 cf->metadata_version = 2; 2338 2339 else { 2340 *error = "Unrecognised cache feature requested"; 2341 return -EINVAL; 2342 } 2343 } 2344 2345 return 0; 2346 } 2347 2348 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2349 char **error) 2350 { 2351 static struct dm_arg _args[] = { 2352 {0, 1024, "Invalid number of policy arguments"}, 2353 }; 2354 2355 int r; 2356 2357 if (!at_least_one_arg(as, error)) 2358 return -EINVAL; 2359 2360 ca->policy_name = dm_shift_arg(as); 2361 2362 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2363 if (r) 2364 return -EINVAL; 2365 2366 ca->policy_argv = (const char **)as->argv; 2367 dm_consume_args(as, ca->policy_argc); 2368 2369 return 0; 2370 } 2371 2372 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2373 char **error) 2374 { 2375 int r; 2376 struct dm_arg_set as; 2377 2378 as.argc = argc; 2379 as.argv = argv; 2380 2381 r = parse_metadata_dev(ca, &as, error); 2382 if (r) 2383 return r; 2384 2385 r = parse_cache_dev(ca, &as, error); 2386 if (r) 2387 return r; 2388 2389 r = parse_origin_dev(ca, &as, error); 2390 if (r) 2391 return r; 2392 2393 r = parse_block_size(ca, &as, error); 2394 if (r) 2395 return r; 2396 2397 r = parse_features(ca, &as, error); 2398 if (r) 2399 return r; 2400 2401 r = parse_policy(ca, &as, error); 2402 if (r) 2403 return r; 2404 2405 return 0; 2406 } 2407 2408 /*----------------------------------------------------------------*/ 2409 2410 static struct kmem_cache *migration_cache; 2411 2412 #define NOT_CORE_OPTION 1 2413 2414 static int process_config_option(struct cache *cache, const char *key, const char *value) 2415 { 2416 unsigned long tmp; 2417 2418 if (!strcasecmp(key, "migration_threshold")) { 2419 if (kstrtoul(value, 10, &tmp)) 2420 return -EINVAL; 2421 2422 cache->migration_threshold = tmp; 2423 return 0; 2424 } 2425 2426 return NOT_CORE_OPTION; 2427 } 2428 2429 static int set_config_value(struct cache *cache, const char *key, const char *value) 2430 { 2431 int r = process_config_option(cache, key, value); 2432 2433 if (r == NOT_CORE_OPTION) 2434 r = policy_set_config_value(cache->policy, key, value); 2435 2436 if (r) 2437 DMWARN("bad config value for %s: %s", key, value); 2438 2439 return r; 2440 } 2441 2442 static int set_config_values(struct cache *cache, int 
argc, const char **argv) 2443 { 2444 int r = 0; 2445 2446 if (argc & 1) { 2447 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2448 return -EINVAL; 2449 } 2450 2451 while (argc) { 2452 r = set_config_value(cache, argv[0], argv[1]); 2453 if (r) 2454 break; 2455 2456 argc -= 2; 2457 argv += 2; 2458 } 2459 2460 return r; 2461 } 2462 2463 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2464 char **error) 2465 { 2466 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2467 cache->cache_size, 2468 cache->origin_sectors, 2469 cache->sectors_per_block); 2470 if (IS_ERR(p)) { 2471 *error = "Error creating cache's policy"; 2472 return PTR_ERR(p); 2473 } 2474 cache->policy = p; 2475 BUG_ON(!cache->policy); 2476 2477 return 0; 2478 } 2479 2480 /* 2481 * We want the discard block size to be at least the size of the cache 2482 * block size and have no more than 2^14 discard blocks across the origin. 2483 */ 2484 #define MAX_DISCARD_BLOCKS (1 << 14) 2485 2486 static bool too_many_discard_blocks(sector_t discard_block_size, 2487 sector_t origin_size) 2488 { 2489 (void) sector_div(origin_size, discard_block_size); 2490 2491 return origin_size > MAX_DISCARD_BLOCKS; 2492 } 2493 2494 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2495 sector_t origin_size) 2496 { 2497 sector_t discard_block_size = cache_block_size; 2498 2499 if (origin_size) 2500 while (too_many_discard_blocks(discard_block_size, origin_size)) 2501 discard_block_size *= 2; 2502 2503 return discard_block_size; 2504 } 2505 2506 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2507 { 2508 dm_block_t nr_blocks = from_cblock(size); 2509 2510 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2511 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2512 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2513 "Please consider increasing the cache block size to reduce the overall cache block count.", 2514 (unsigned long long) nr_blocks); 2515 2516 cache->cache_size = size; 2517 } 2518 2519 static int is_congested(struct dm_dev *dev, int bdi_bits) 2520 { 2521 struct request_queue *q = bdev_get_queue(dev->bdev); 2522 return bdi_congested(q->backing_dev_info, bdi_bits); 2523 } 2524 2525 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2526 { 2527 struct cache *cache = container_of(cb, struct cache, callbacks); 2528 2529 return is_congested(cache->origin_dev, bdi_bits) || 2530 is_congested(cache->cache_dev, bdi_bits); 2531 } 2532 2533 #define DEFAULT_MIGRATION_THRESHOLD 2048 2534 2535 static int cache_create(struct cache_args *ca, struct cache **result) 2536 { 2537 int r = 0; 2538 char **error = &ca->ti->error; 2539 struct cache *cache; 2540 struct dm_target *ti = ca->ti; 2541 dm_block_t origin_blocks; 2542 struct dm_cache_metadata *cmd; 2543 bool may_format = ca->features.mode == CM_WRITE; 2544 2545 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2546 if (!cache) 2547 return -ENOMEM; 2548 2549 cache->ti = ca->ti; 2550 ti->private = cache; 2551 ti->num_flush_bios = 2; 2552 ti->flush_supported = true; 2553 2554 ti->num_discard_bios = 1; 2555 ti->discards_supported = true; 2556 ti->split_discard_bios = false; 2557 2558 cache->features = ca->features; 2559 ti->per_io_data_size = get_per_bio_data_size(cache); 2560 2561 cache->callbacks.congested_fn = cache_is_congested; 2562 dm_table_add_target_callbacks(ti->table, 
&cache->callbacks); 2563 2564 cache->metadata_dev = ca->metadata_dev; 2565 cache->origin_dev = ca->origin_dev; 2566 cache->cache_dev = ca->cache_dev; 2567 2568 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2569 2570 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2571 origin_blocks = block_div(origin_blocks, ca->block_size); 2572 cache->origin_blocks = to_oblock(origin_blocks); 2573 2574 cache->sectors_per_block = ca->block_size; 2575 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2576 r = -EINVAL; 2577 goto bad; 2578 } 2579 2580 if (ca->block_size & (ca->block_size - 1)) { 2581 dm_block_t cache_size = ca->cache_sectors; 2582 2583 cache->sectors_per_block_shift = -1; 2584 cache_size = block_div(cache_size, ca->block_size); 2585 set_cache_size(cache, to_cblock(cache_size)); 2586 } else { 2587 cache->sectors_per_block_shift = __ffs(ca->block_size); 2588 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2589 } 2590 2591 r = create_cache_policy(cache, ca, error); 2592 if (r) 2593 goto bad; 2594 2595 cache->policy_nr_args = ca->policy_argc; 2596 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2597 2598 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2599 if (r) { 2600 *error = "Error setting cache policy's config values"; 2601 goto bad; 2602 } 2603 2604 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2605 ca->block_size, may_format, 2606 dm_cache_policy_get_hint_size(cache->policy), 2607 ca->features.metadata_version); 2608 if (IS_ERR(cmd)) { 2609 *error = "Error creating metadata object"; 2610 r = PTR_ERR(cmd); 2611 goto bad; 2612 } 2613 cache->cmd = cmd; 2614 set_cache_mode(cache, CM_WRITE); 2615 if (get_cache_mode(cache) != CM_WRITE) { 2616 *error = "Unable to get write access to metadata, please check/repair metadata."; 2617 r = -EINVAL; 2618 goto bad; 2619 } 2620 2621 if (passthrough_mode(&cache->features)) { 2622 bool all_clean; 2623 2624 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2625 if (r) { 2626 *error = "dm_cache_metadata_all_clean() failed"; 2627 goto bad; 2628 } 2629 2630 if (!all_clean) { 2631 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2632 r = -EINVAL; 2633 goto bad; 2634 } 2635 2636 policy_allow_migrations(cache->policy, false); 2637 } 2638 2639 spin_lock_init(&cache->lock); 2640 INIT_LIST_HEAD(&cache->deferred_cells); 2641 bio_list_init(&cache->deferred_bios); 2642 bio_list_init(&cache->deferred_writethrough_bios); 2643 atomic_set(&cache->nr_allocated_migrations, 0); 2644 atomic_set(&cache->nr_io_migrations, 0); 2645 init_waitqueue_head(&cache->migration_wait); 2646 2647 r = -ENOMEM; 2648 atomic_set(&cache->nr_dirty, 0); 2649 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2650 if (!cache->dirty_bitset) { 2651 *error = "could not allocate dirty bitset"; 2652 goto bad; 2653 } 2654 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2655 2656 cache->discard_block_size = 2657 calculate_discard_block_size(cache->sectors_per_block, 2658 cache->origin_sectors); 2659 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2660 cache->discard_block_size)); 2661 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2662 if (!cache->discard_bitset) { 2663 *error = "could not allocate discard bitset"; 2664 goto bad; 2665 } 2666 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2667 2668 cache->copier = 
dm_kcopyd_client_create(&dm_kcopyd_throttle); 2669 if (IS_ERR(cache->copier)) { 2670 *error = "could not create kcopyd client"; 2671 r = PTR_ERR(cache->copier); 2672 goto bad; 2673 } 2674 2675 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); 2676 if (!cache->wq) { 2677 *error = "could not create workqueue for metadata object"; 2678 goto bad; 2679 } 2680 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); 2681 INIT_WORK(&cache->deferred_writethrough_worker, 2682 process_deferred_writethrough_bios); 2683 INIT_WORK(&cache->migration_worker, check_migrations); 2684 INIT_DELAYED_WORK(&cache->waker, do_waker); 2685 2686 cache->prison = dm_bio_prison_create_v2(cache->wq); 2687 if (!cache->prison) { 2688 *error = "could not create bio prison"; 2689 goto bad; 2690 } 2691 2692 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2693 migration_cache); 2694 if (!cache->migration_pool) { 2695 *error = "Error creating cache's migration mempool"; 2696 goto bad; 2697 } 2698 2699 cache->need_tick_bio = true; 2700 cache->sized = false; 2701 cache->invalidate = false; 2702 cache->commit_requested = false; 2703 cache->loaded_mappings = false; 2704 cache->loaded_discards = false; 2705 2706 load_stats(cache); 2707 2708 atomic_set(&cache->stats.demotion, 0); 2709 atomic_set(&cache->stats.promotion, 0); 2710 atomic_set(&cache->stats.copies_avoided, 0); 2711 atomic_set(&cache->stats.cache_cell_clash, 0); 2712 atomic_set(&cache->stats.commit_count, 0); 2713 atomic_set(&cache->stats.discard_count, 0); 2714 2715 spin_lock_init(&cache->invalidation_lock); 2716 INIT_LIST_HEAD(&cache->invalidation_requests); 2717 2718 batcher_init(&cache->committer, commit_op, cache, 2719 issue_op, cache, cache->wq); 2720 iot_init(&cache->tracker); 2721 2722 init_rwsem(&cache->background_work_lock); 2723 prevent_background_work(cache); 2724 2725 *result = cache; 2726 return 0; 2727 bad: 2728 destroy(cache); 2729 return r; 2730 } 2731 2732 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2733 { 2734 unsigned i; 2735 const char **copy; 2736 2737 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2738 if (!copy) 2739 return -ENOMEM; 2740 for (i = 0; i < argc; i++) { 2741 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2742 if (!copy[i]) { 2743 while (i--) 2744 kfree(copy[i]); 2745 kfree(copy); 2746 return -ENOMEM; 2747 } 2748 } 2749 2750 cache->nr_ctr_args = argc; 2751 cache->ctr_args = copy; 2752 2753 return 0; 2754 } 2755 2756 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2757 { 2758 int r = -EINVAL; 2759 struct cache_args *ca; 2760 struct cache *cache = NULL; 2761 2762 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2763 if (!ca) { 2764 ti->error = "Error allocating memory for cache"; 2765 return -ENOMEM; 2766 } 2767 ca->ti = ti; 2768 2769 r = parse_cache_args(ca, argc, argv, &ti->error); 2770 if (r) 2771 goto out; 2772 2773 r = cache_create(ca, &cache); 2774 if (r) 2775 goto out; 2776 2777 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2778 if (r) { 2779 destroy(cache); 2780 goto out; 2781 } 2782 2783 ti->private = cache; 2784 out: 2785 destroy_cache_args(ca); 2786 return r; 2787 } 2788 2789 /*----------------------------------------------------------------*/ 2790 2791 static int cache_map(struct dm_target *ti, struct bio *bio) 2792 { 2793 struct cache *cache = ti->private; 2794 2795 int r; 2796 bool commit_needed; 2797 dm_oblock_t block = get_bio_block(cache, bio); 2798 size_t pb_data_size = get_per_bio_data_size(cache); 2799 2800 
init_per_bio_data(bio, pb_data_size); 2801 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2802 /* 2803 * This can only occur if the io goes to a partial block at 2804 * the end of the origin device. We don't cache these. 2805 * Just remap to the origin and carry on. 2806 */ 2807 remap_to_origin(cache, bio); 2808 accounted_begin(cache, bio); 2809 return DM_MAPIO_REMAPPED; 2810 } 2811 2812 if (discard_or_flush(bio)) { 2813 defer_bio(cache, bio); 2814 return DM_MAPIO_SUBMITTED; 2815 } 2816 2817 r = map_bio(cache, bio, block, &commit_needed); 2818 if (commit_needed) 2819 schedule_commit(&cache->committer); 2820 2821 return r; 2822 } 2823 2824 static int cache_end_io(struct dm_target *ti, struct bio *bio, 2825 blk_status_t *error) 2826 { 2827 struct cache *cache = ti->private; 2828 unsigned long flags; 2829 size_t pb_data_size = get_per_bio_data_size(cache); 2830 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 2831 2832 if (pb->tick) { 2833 policy_tick(cache->policy, false); 2834 2835 spin_lock_irqsave(&cache->lock, flags); 2836 cache->need_tick_bio = true; 2837 spin_unlock_irqrestore(&cache->lock, flags); 2838 } 2839 2840 bio_drop_shared_lock(cache, bio); 2841 accounted_complete(cache, bio); 2842 2843 return DM_ENDIO_DONE; 2844 } 2845 2846 static int write_dirty_bitset(struct cache *cache) 2847 { 2848 int r; 2849 2850 if (get_cache_mode(cache) >= CM_READ_ONLY) 2851 return -EINVAL; 2852 2853 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 2854 if (r) 2855 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 2856 2857 return r; 2858 } 2859 2860 static int write_discard_bitset(struct cache *cache) 2861 { 2862 unsigned i, r; 2863 2864 if (get_cache_mode(cache) >= CM_READ_ONLY) 2865 return -EINVAL; 2866 2867 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2868 cache->discard_nr_blocks); 2869 if (r) { 2870 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 2871 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 2872 return r; 2873 } 2874 2875 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2876 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2877 is_discarded(cache, to_dblock(i))); 2878 if (r) { 2879 metadata_operation_failed(cache, "dm_cache_set_discard", r); 2880 return r; 2881 } 2882 } 2883 2884 return 0; 2885 } 2886 2887 static int write_hints(struct cache *cache) 2888 { 2889 int r; 2890 2891 if (get_cache_mode(cache) >= CM_READ_ONLY) 2892 return -EINVAL; 2893 2894 r = dm_cache_write_hints(cache->cmd, cache->policy); 2895 if (r) { 2896 metadata_operation_failed(cache, "dm_cache_write_hints", r); 2897 return r; 2898 } 2899 2900 return 0; 2901 } 2902 2903 /* 2904 * returns true on success 2905 */ 2906 static bool sync_metadata(struct cache *cache) 2907 { 2908 int r1, r2, r3, r4; 2909 2910 r1 = write_dirty_bitset(cache); 2911 if (r1) 2912 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 2913 2914 r2 = write_discard_bitset(cache); 2915 if (r2) 2916 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 2917 2918 save_stats(cache); 2919 2920 r3 = write_hints(cache); 2921 if (r3) 2922 DMERR("%s: could not write hints", cache_device_name(cache)); 2923 2924 /* 2925 * If writing the above metadata failed, we still commit, but don't 2926 * set the clean shutdown flag. This will effectively force every 2927 * dirty bit to be set on reload. 
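	 * (Hence the !r1 && !r2 && !r3 passed to commit() below: the clean
	 * shutdown flag is only set when all three writes succeeded.)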
	 */
	r4 = commit(cache, !r1 && !r2 && !r3);
	if (r4)
		DMERR("%s: could not write cache metadata", cache_device_name(cache));

	return !r1 && !r2 && !r3 && !r4;
}

static void cache_postsuspend(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	prevent_background_work(cache);
	BUG_ON(atomic_read(&cache->nr_io_migrations));

	cancel_delayed_work(&cache->waker);
	flush_workqueue(cache->wq);
	WARN_ON(cache->tracker.in_flight);

	/*
	 * If it's a flush suspend there won't be any deferred bios, so this
	 * call is harmless.
	 */
	requeue_deferred_bios(cache);

	if (get_cache_mode(cache) == CM_WRITE)
		(void) sync_metadata(cache);
}

static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
			bool dirty, uint32_t hint, bool hint_valid)
{
	int r;
	struct cache *cache = context;

	if (dirty) {
		set_bit(from_cblock(cblock), cache->dirty_bitset);
		atomic_inc(&cache->nr_dirty);
	} else
		clear_bit(from_cblock(cblock), cache->dirty_bitset);

	r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
	if (r)
		return r;

	return 0;
}

/*
 * The discard block size in the on-disk metadata is not
 * necessarily the same as we're currently using.  So we have to
 * be careful to only set the discarded attribute if we know it
 * covers a complete block of the new size.
 */
struct discard_load_info {
	struct cache *cache;

	/*
	 * These blocks are sized using the on-disk dblock size, rather
	 * than the current one.
	 */
	dm_block_t block_size;
	dm_block_t discard_begin, discard_end;
};

static void discard_load_info_init(struct cache *cache,
				   struct discard_load_info *li)
{
	li->cache = cache;
	li->discard_begin = li->discard_end = 0;
}

static void set_discard_range(struct discard_load_info *li)
{
	sector_t b, e;

	if (li->discard_begin == li->discard_end)
		return;

	/*
	 * Convert to sectors.
	 */
	b = li->discard_begin * li->block_size;
	e = li->discard_end * li->block_size;

	/*
	 * Then convert back to the current dblock size.
	 */
	b = dm_sector_div_up(b, li->cache->discard_block_size);
	sector_div(e, li->cache->discard_block_size);

	/*
	 * The origin may have shrunk, so we need to check we're still in
	 * bounds.
	 */
	if (e > from_dblock(li->cache->discard_nr_blocks))
		e = from_dblock(li->cache->discard_nr_blocks);

	for (; b < e; b++)
		set_discard(li->cache, to_dblock(b));
}

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct discard_load_info *li = context;

	li->block_size = discard_block_size;

	if (discard) {
		if (from_dblock(dblock) == li->discard_end)
			/*
			 * We're already in a discard range, just extend it.
			 */
			li->discard_end = li->discard_end + 1ULL;

		else {
			/*
			 * Emit the old range and start a new one.
3047 */ 3048 set_discard_range(li); 3049 li->discard_begin = from_dblock(dblock); 3050 li->discard_end = li->discard_begin + 1ULL; 3051 } 3052 } else { 3053 set_discard_range(li); 3054 li->discard_begin = li->discard_end = 0; 3055 } 3056 3057 return 0; 3058 } 3059 3060 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3061 { 3062 sector_t size = get_dev_size(cache->cache_dev); 3063 (void) sector_div(size, cache->sectors_per_block); 3064 return to_cblock(size); 3065 } 3066 3067 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3068 { 3069 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3070 return true; 3071 3072 /* 3073 * We can't drop a dirty block when shrinking the cache. 3074 */ 3075 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3076 new_size = to_cblock(from_cblock(new_size) + 1); 3077 if (is_dirty(cache, new_size)) { 3078 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3079 cache_device_name(cache), 3080 (unsigned long long) from_cblock(new_size)); 3081 return false; 3082 } 3083 } 3084 3085 return true; 3086 } 3087 3088 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3089 { 3090 int r; 3091 3092 r = dm_cache_resize(cache->cmd, new_size); 3093 if (r) { 3094 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3095 metadata_operation_failed(cache, "dm_cache_resize", r); 3096 return r; 3097 } 3098 3099 set_cache_size(cache, new_size); 3100 3101 return 0; 3102 } 3103 3104 static int cache_preresume(struct dm_target *ti) 3105 { 3106 int r = 0; 3107 struct cache *cache = ti->private; 3108 dm_cblock_t csize = get_cache_dev_size(cache); 3109 3110 /* 3111 * Check to see if the cache has resized. 3112 */ 3113 if (!cache->sized) { 3114 r = resize_cache_dev(cache, csize); 3115 if (r) 3116 return r; 3117 3118 cache->sized = true; 3119 3120 } else if (csize != cache->cache_size) { 3121 if (!can_resize(cache, csize)) 3122 return -EINVAL; 3123 3124 r = resize_cache_dev(cache, csize); 3125 if (r) 3126 return r; 3127 } 3128 3129 if (!cache->loaded_mappings) { 3130 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3131 load_mapping, cache); 3132 if (r) { 3133 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3134 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3135 return r; 3136 } 3137 3138 cache->loaded_mappings = true; 3139 } 3140 3141 if (!cache->loaded_discards) { 3142 struct discard_load_info li; 3143 3144 /* 3145 * The discard bitset could have been resized, or the 3146 * discard block size changed. To be safe we start by 3147 * setting every dblock to not discarded. 
3148 */ 3149 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3150 3151 discard_load_info_init(cache, &li); 3152 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3153 if (r) { 3154 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3155 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3156 return r; 3157 } 3158 set_discard_range(&li); 3159 3160 cache->loaded_discards = true; 3161 } 3162 3163 return r; 3164 } 3165 3166 static void cache_resume(struct dm_target *ti) 3167 { 3168 struct cache *cache = ti->private; 3169 3170 cache->need_tick_bio = true; 3171 allow_background_work(cache); 3172 do_waker(&cache->waker.work); 3173 } 3174 3175 /* 3176 * Status format: 3177 * 3178 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3179 * <cache block size> <#used cache blocks>/<#total cache blocks> 3180 * <#read hits> <#read misses> <#write hits> <#write misses> 3181 * <#demotions> <#promotions> <#dirty> 3182 * <#features> <features>* 3183 * <#core args> <core args> 3184 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3185 */ 3186 static void cache_status(struct dm_target *ti, status_type_t type, 3187 unsigned status_flags, char *result, unsigned maxlen) 3188 { 3189 int r = 0; 3190 unsigned i; 3191 ssize_t sz = 0; 3192 dm_block_t nr_free_blocks_metadata = 0; 3193 dm_block_t nr_blocks_metadata = 0; 3194 char buf[BDEVNAME_SIZE]; 3195 struct cache *cache = ti->private; 3196 dm_cblock_t residency; 3197 bool needs_check; 3198 3199 switch (type) { 3200 case STATUSTYPE_INFO: 3201 if (get_cache_mode(cache) == CM_FAIL) { 3202 DMEMIT("Fail"); 3203 break; 3204 } 3205 3206 /* Commit to ensure statistics aren't out-of-date */ 3207 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3208 (void) commit(cache, false); 3209 3210 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3211 if (r) { 3212 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3213 cache_device_name(cache), r); 3214 goto err; 3215 } 3216 3217 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3218 if (r) { 3219 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3220 cache_device_name(cache), r); 3221 goto err; 3222 } 3223 3224 residency = policy_residency(cache->policy); 3225 3226 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3227 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3228 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3229 (unsigned long long)nr_blocks_metadata, 3230 (unsigned long long)cache->sectors_per_block, 3231 (unsigned long long) from_cblock(residency), 3232 (unsigned long long) from_cblock(cache->cache_size), 3233 (unsigned) atomic_read(&cache->stats.read_hit), 3234 (unsigned) atomic_read(&cache->stats.read_miss), 3235 (unsigned) atomic_read(&cache->stats.write_hit), 3236 (unsigned) atomic_read(&cache->stats.write_miss), 3237 (unsigned) atomic_read(&cache->stats.demotion), 3238 (unsigned) atomic_read(&cache->stats.promotion), 3239 (unsigned long) atomic_read(&cache->nr_dirty)); 3240 3241 if (cache->features.metadata_version == 2) 3242 DMEMIT("2 metadata2 "); 3243 else 3244 DMEMIT("1 "); 3245 3246 if (writethrough_mode(&cache->features)) 3247 DMEMIT("writethrough "); 3248 3249 else if (passthrough_mode(&cache->features)) 3250 DMEMIT("passthrough "); 3251 3252 else if (writeback_mode(&cache->features)) 3253 DMEMIT("writeback "); 3254 3255 else { 3256 DMERR("%s: internal error: unknown io mode: 
%d", 3257 cache_device_name(cache), (int) cache->features.io_mode); 3258 goto err; 3259 } 3260 3261 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3262 3263 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3264 if (sz < maxlen) { 3265 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3266 if (r) 3267 DMERR("%s: policy_emit_config_values returned %d", 3268 cache_device_name(cache), r); 3269 } 3270 3271 if (get_cache_mode(cache) == CM_READ_ONLY) 3272 DMEMIT("ro "); 3273 else 3274 DMEMIT("rw "); 3275 3276 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3277 3278 if (r || needs_check) 3279 DMEMIT("needs_check "); 3280 else 3281 DMEMIT("- "); 3282 3283 break; 3284 3285 case STATUSTYPE_TABLE: 3286 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3287 DMEMIT("%s ", buf); 3288 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3289 DMEMIT("%s ", buf); 3290 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3291 DMEMIT("%s", buf); 3292 3293 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3294 DMEMIT(" %s", cache->ctr_args[i]); 3295 if (cache->nr_ctr_args) 3296 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3297 } 3298 3299 return; 3300 3301 err: 3302 DMEMIT("Error"); 3303 } 3304 3305 /* 3306 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 3307 * the one-past-the-end value. 3308 */ 3309 struct cblock_range { 3310 dm_cblock_t begin; 3311 dm_cblock_t end; 3312 }; 3313 3314 /* 3315 * A cache block range can take two forms: 3316 * 3317 * i) A single cblock, eg. '3456' 3318 * ii) A begin and end cblock with a dash between, eg. 123-234 3319 */ 3320 static int parse_cblock_range(struct cache *cache, const char *str, 3321 struct cblock_range *result) 3322 { 3323 char dummy; 3324 uint64_t b, e; 3325 int r; 3326 3327 /* 3328 * Try and parse form (ii) first. 3329 */ 3330 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3331 if (r < 0) 3332 return r; 3333 3334 if (r == 2) { 3335 result->begin = to_cblock(b); 3336 result->end = to_cblock(e); 3337 return 0; 3338 } 3339 3340 /* 3341 * That didn't work, try form (i). 3342 */ 3343 r = sscanf(str, "%llu%c", &b, &dummy); 3344 if (r < 0) 3345 return r; 3346 3347 if (r == 1) { 3348 result->begin = to_cblock(b); 3349 result->end = to_cblock(from_cblock(result->begin) + 1u); 3350 return 0; 3351 } 3352 3353 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3354 return -EINVAL; 3355 } 3356 3357 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3358 { 3359 uint64_t b = from_cblock(range->begin); 3360 uint64_t e = from_cblock(range->end); 3361 uint64_t n = from_cblock(cache->cache_size); 3362 3363 if (b >= n) { 3364 DMERR("%s: begin cblock out of range: %llu >= %llu", 3365 cache_device_name(cache), b, n); 3366 return -EINVAL; 3367 } 3368 3369 if (e > n) { 3370 DMERR("%s: end cblock out of range: %llu > %llu", 3371 cache_device_name(cache), e, n); 3372 return -EINVAL; 3373 } 3374 3375 if (b >= e) { 3376 DMERR("%s: invalid cblock range: %llu >= %llu", 3377 cache_device_name(cache), b, e); 3378 return -EINVAL; 3379 } 3380 3381 return 0; 3382 } 3383 3384 static inline dm_cblock_t cblock_succ(dm_cblock_t b) 3385 { 3386 return to_cblock(from_cblock(b) + 1); 3387 } 3388 3389 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3390 { 3391 int r = 0; 3392 3393 /* 3394 * We don't need to do any locking here because we know we're in 3395 * passthrough mode. 
There is potential for a race between an
	 * invalidation triggered by an io and an invalidation message.  This
	 * is harmless; we needn't worry if the policy call fails.
	 */
	while (range->begin != range->end) {
		r = invalidate_cblock(cache, range->begin);
		if (r)
			return r;

		range->begin = cblock_succ(range->begin);
	}

	cache->commit_requested = true;
	return r;
}

static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
					      const char **cblock_ranges)
{
	int r = 0;
	unsigned i;
	struct cblock_range range;

	if (!passthrough_mode(&cache->features)) {
		DMERR("%s: cache has to be in passthrough mode for invalidation",
		      cache_device_name(cache));
		return -EPERM;
	}

	for (i = 0; i < count; i++) {
		r = parse_cblock_range(cache, cblock_ranges[i], &range);
		if (r)
			break;

		r = validate_cblock_range(cache, &range);
		if (r)
			break;

		/*
		 * Invalidate every cblock in this range.
		 */
		r = request_invalidation(cache, &range);
		if (r)
			break;
	}

	return r;
}

/*
 * Supports
 *	"<key> <value>"
 * and
 *	"invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
 *
 * The key migration_threshold is supported by the cache target core.
 */
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct cache *cache = ti->private;

	if (!argc)
		return -EINVAL;

	if (get_cache_mode(cache) >= CM_READ_ONLY) {
		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
		      cache_device_name(cache));
		return -EOPNOTSUPP;
	}

	if (!strcasecmp(argv[0], "invalidate_cblocks"))
		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);

	if (argc != 2)
		return -EINVAL;

	return set_config_value(cache, argv[0], argv[1]);
}

static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
					    cache->origin_sectors);
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;
	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with the
	 * cache's blocksize (io_opt is a multiple of the block size) do
	 * not override them.
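	 * For example, with 512 sector (256 KiB) cache blocks a stacked
	 * io_opt of 2048 sectors (1 MiB) is left alone, whereas 768 sectors
	 * (384 KiB) is not a multiple of the block size and so is overridden.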
3505 */ 3506 if (io_opt_sectors < cache->sectors_per_block || 3507 do_div(io_opt_sectors, cache->sectors_per_block)) { 3508 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3509 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3510 } 3511 set_discard_limits(cache, limits); 3512 } 3513 3514 /*----------------------------------------------------------------*/ 3515 3516 static struct target_type cache_target = { 3517 .name = "cache", 3518 .version = {2, 0, 0}, 3519 .module = THIS_MODULE, 3520 .ctr = cache_ctr, 3521 .dtr = cache_dtr, 3522 .map = cache_map, 3523 .end_io = cache_end_io, 3524 .postsuspend = cache_postsuspend, 3525 .preresume = cache_preresume, 3526 .resume = cache_resume, 3527 .status = cache_status, 3528 .message = cache_message, 3529 .iterate_devices = cache_iterate_devices, 3530 .io_hints = cache_io_hints, 3531 }; 3532 3533 static int __init dm_cache_init(void) 3534 { 3535 int r; 3536 3537 r = dm_register_target(&cache_target); 3538 if (r) { 3539 DMERR("cache target registration failed: %d", r); 3540 return r; 3541 } 3542 3543 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3544 if (!migration_cache) { 3545 dm_unregister_target(&cache_target); 3546 return -ENOMEM; 3547 } 3548 3549 return 0; 3550 } 3551 3552 static void __exit dm_cache_exit(void) 3553 { 3554 dm_unregister_target(&cache_target); 3555 kmem_cache_destroy(migration_cache); 3556 } 3557 3558 module_init(dm_cache_init); 3559 module_exit(dm_cache_exit); 3560 3561 MODULE_DESCRIPTION(DM_NAME " cache target"); 3562 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3563 MODULE_LICENSE("GPL"); 3564