/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

static size_t bitset_size_in_bytes(unsigned nr_entries)
{
	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
}

static unsigned long *alloc_bitset(unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	return vzalloc(s);
}

static void clear_bitset(void *bitset, unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	memset(bitset, 0, s);
}

static void free_bitset(unsigned long *bits)
{
	vfree(bits);
}

/*----------------------------------------------------------------*/

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function.  We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
	void *bi_private;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;
	h->bi_private = bio->bi_private;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
	bio->bi_private = h->bi_private;

	/*
	 * Must bump bi_remaining to allow bio to complete with
	 * restored bi_end_io.
	 */
	atomic_inc(&bio->bi_remaining);
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * FIXME: the cache is read/write for the time being.
 */
enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

/*
 * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
 * the one-past-the-end value.
 */
struct cblock_range {
	dm_cblock_t begin;
	dm_cblock_t end;
};

struct invalidation_request {
	struct list_head list;
	struct cblock_range *cblocks;

	atomic_t complete;
	int err;

	wait_queue_head_t result_wait;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_writethrough_bios;
	struct list_head quiesced_migrations;
	struct list_head completed_migrations;
	struct list_head need_commit_migrations;
	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	wait_queue_head_t quiescing_wait;
	atomic_t quiescing;
	atomic_t quiescing_ack;

	/*
	 * cache_size entries, dirty if set
	 */
	atomic_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct worker;

	struct delayed_work waker;
	unsigned long last_commit_jiffies;

	struct dm_bio_prison *prison;
	struct dm_deferred_set *all_io_ds;

	mempool_t *migration_pool;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool invalidate:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	struct cache_stats stats;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_deferred_entry *all_io_entry;
	struct dm_hook_info hook_info;

	/*
	 * writethrough fields.  These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offset of the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	struct dm_bio_details bio_details;
};

struct dm_cache_migration {
	struct list_head list;
	struct cache *cache;

	unsigned long start_jiffies;
	dm_oblock_t old_oblock;
	dm_oblock_t new_oblock;
	dm_cblock_t cblock;

	bool err:1;
	bool discard:1;
	bool writeback:1;
	bool demote:1;
	bool promote:1;
	bool requeue_holder:1;
	bool invalidate:1;

	struct dm_bio_prison_cell *old_ocell;
	struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations.  We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 */
struct prealloc {
	struct dm_cache_migration *mg;
	struct dm_bio_prison_cell *cell1;
	struct dm_bio_prison_cell *cell2;
};

static void wake_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
{
	/* FIXME: change to use a local slab. */
	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	dm_bio_prison_free_cell(cache->prison, cell);
}

static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
	struct dm_cache_migration *mg;

	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
	if (mg) {
		mg->cache = cache;
		atomic_inc(&mg->cache->nr_allocated_migrations);
	}

	return mg;
}

static void free_migration(struct dm_cache_migration *mg)
{
	if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
		wake_up(&mg->cache->migration_wait);

	mempool_free(mg, mg->cache->migration_pool);
}

static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
	if (!p->mg) {
		p->mg = alloc_migration(cache);
		if (!p->mg)
			return -ENOMEM;
	}

	if (!p->cell1) {
		p->cell1 = alloc_prison_cell(cache);
		if (!p->cell1)
			return -ENOMEM;
	}

	if (!p->cell2) {
		p->cell2 = alloc_prison_cell(cache);
		if (!p->cell2)
			return -ENOMEM;
	}

	return 0;
}

static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
{
	if (p->cell2)
		free_prison_cell(cache, p->cell2);

	if (p->cell1)
		free_prison_cell(cache, p->cell1);

	if (p->mg)
		free_migration(p->mg);
}

static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
{
	struct dm_cache_migration *mg = p->mg;

	BUG_ON(!mg);
	p->mg = NULL;

	return mg;
}

/*
 * You must have a cell within the prealloc struct to return.  If not this
 * function will BUG() rather than returning NULL.
 */
static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
{
	struct dm_bio_prison_cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;

	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else
		BUG();

	return r;
}

/*
 * You can't have more than two cells in a prealloc struct.  BUG() will be
 * called if you try and overfill.
 */
static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
{
	if (!p->cell2)
		p->cell2 = cell;

	else if (!p->cell1)
		p->cell1 = cell;

	else
		BUG();
}

/*----------------------------------------------------------------*/

static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block_begin = from_oblock(begin);
	key->block_end = from_oblock(end);
}

/*
 * The caller hands in a preallocated cell, and a free function for it.
 * The cell will be freed if there's an error, or if it wasn't used because
 * a cell with that key already exists.
 */
typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);

static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
			    cell_free_fn free_fn, void *free_context,
			    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;

	build_key(oblock_begin, oblock_end, &key);
	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
	if (r)
		free_fn(free_context, cell_prealloc);

	return r;
}

static int bio_detain(struct cache *cache, dm_oblock_t oblock,
		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
		      cell_free_fn free_fn, void *free_context,
		      struct dm_bio_prison_cell **cell_result)
{
	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
	return bio_detain_range(cache, oblock, end, bio,
				cell_prealloc, free_fn, free_context, cell_result);
}

static int get_cell(struct cache *cache,
		    dm_oblock_t oblock,
		    struct prealloc *structs,
		    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;
	struct dm_bio_prison_cell *cell_prealloc;

	cell_prealloc = prealloc_get_cell(structs);

	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
	if (r)
		prealloc_put_cell(structs, cell_prealloc);

	return r;
}

/*----------------------------------------------------------------*/

static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
	return test_bit(from_cblock(b), cache->dirty_bitset);
}

static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
		atomic_inc(&cache->nr_dirty);
		policy_set_dirty(cache->policy, oblock);
	}
}

static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
		policy_clear_dirty(cache->policy, oblock);
		if (atomic_dec_return(&cache->nr_dirty) == 0)
			dm_table_event(cache->ti->table);
	}
}

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct cache *cache)
{
	return cache->sectors_per_block_shift >= 0;
}

/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
__always_inline
#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
	do_div(b, n);

	return b;
}

static dm_block_t oblocks_per_dblock(struct cache *cache)
{
	dm_block_t oblocks = cache->discard_block_size;

	if (block_size_is_power_of_two(cache))
		oblocks >>= cache->sectors_per_block_shift;
	else
		oblocks = block_div(oblocks, cache->sectors_per_block);

	return oblocks;
}

static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
	return to_dblock(block_div(from_oblock(oblock),
				   oblocks_per_dblock(cache)));
}

static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
{
	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
}

static void set_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
	atomic_inc(&cache->stats.discard_count);

	spin_lock_irqsave(&cache->lock, flags);
	set_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void clear_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	clear_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static bool writethrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITETHROUGH;
}

static bool writeback_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITEBACK;
}

static bool passthrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_PASSTHROUGH;
}

static size_t get_per_bio_data_size(struct cache *cache)
{
	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}

static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = get_per_bio_data(bio, data_size);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->all_io_entry = NULL;

	return pb;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio->bi_bdev = cache->origin_dev->bdev;
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_iter.bi_sector;
	sector_t block = from_cblock(cblock);

	bio->bi_bdev = cache->cache_dev->bdev;
	if (!block_size_is_power_of_two(cache))
		bio->bi_iter.bi_sector =
			(block * cache->sectors_per_block) +
			sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_iter.bi_sector =
			(block << cache->sectors_per_block_shift) |
			(bi_sector & (cache->sectors_per_block - 1));
}

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	spin_lock_irqsave(&cache->lock, flags);
	if (cache->need_tick_bio &&
	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}

static int bio_triggers_commit(struct cache *cache, struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}

/*
 * You must increment the deferred set whilst the prison cell is held.  To
 * encourage this, we ask for 'cell' to be passed in.
 */
static void inc_ds(struct cache *cache, struct bio *bio,
		   struct dm_bio_prison_cell *cell)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(!cell);
	BUG_ON(pb->all_io_entry);

	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}

static void issue(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	if (!bio_triggers_commit(cache, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in do_worker().
	 */
	spin_lock_irqsave(&cache->lock, flags);
	cache->commit_requested = true;
	bio_list_add(&cache->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
{
	inc_ds(cache, bio, cell);
	issue(cache, bio);
}

static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_writethrough_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void writethrough_endio(struct bio *bio, int err)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	dm_unhook_bio(&pb->hook_info, bio);

	if (err) {
		bio_endio(bio, err);
		return;
	}

	dm_bio_restore(&pb->bio_details, bio);
	remap_to_cache(pb->cache, bio, pb->cblock);

	/*
	 * We can't issue this bio directly, since we're in interrupt
	 * context.  So it gets put on a bio list for processing by the
	 * worker thread.
	 */
	defer_writethrough_bio(pb->cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices.  In future we'd like to clone the
 * bio and send them in parallel, but for now we're doing them in
 * series as this is easier.
 */
static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
				       dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	pb->cache = cache;
	pb->cblock = cblock;
	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
	dm_bio_record(&pb->bio_details, bio);

	remap_to_origin_clear_discard(pb->cache, bio, oblock);
}

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
static void inc_io_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_io_migrations);
}

static void dec_io_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_io_migrations);
}

static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
			 bool holder)
{
	(holder ? dm_cell_release : dm_cell_release_no_holder)
		(cache->prison, cell, &cache->deferred_bios);
	free_prison_cell(cache, cell);
}

static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
		       bool holder)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	__cell_defer(cache, cell, holder);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void free_io_migration(struct dm_cache_migration *mg)
{
	dec_io_migrations(mg->cache);
	free_migration(mg);
}

static void migration_failure(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN_LIMIT("writeback failed; couldn't copy block");
		set_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);

	} else if (mg->demote) {
		DMWARN_LIMIT("demotion failed; couldn't copy block");
		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);

		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
		if (mg->promote)
			cell_defer(cache, mg->new_ocell, true);
	} else {
		DMWARN_LIMIT("promotion failed; couldn't copy block");
		policy_remove_mapping(cache->policy, mg->new_oblock);
		cell_defer(cache, mg->new_ocell, true);
	}

	free_io_migration(mg);
}

static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		clear_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);
		free_io_migration(mg);
		return;

	} else if (mg->demote) {
		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
			policy_force_mapping(cache->policy, mg->new_oblock,
					     mg->old_oblock);
			if (mg->promote)
				cell_defer(cache, mg->new_ocell, true);
			free_io_migration(mg);
			return;
		}
	} else {
		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
			policy_remove_mapping(cache->policy, mg->new_oblock);
			free_io_migration(mg);
			return;
		}
	}

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->need_commit_migrations);
	cache->commit_requested = true;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void migration_success_post_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN("writeback unexpectedly triggered commit");
		return;

	} else if (mg->demote) {
		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);

		if (mg->promote) {
			mg->demote = false;

			spin_lock_irqsave(&cache->lock, flags);
			list_add_tail(&mg->list, &cache->quiesced_migrations);
			spin_unlock_irqrestore(&cache->lock, flags);

		} else {
			if (mg->invalidate)
				policy_remove_mapping(cache->policy, mg->old_oblock);
			free_io_migration(mg);
		}

	} else {
		if (mg->requeue_holder) {
			clear_dirty(cache, mg->new_oblock, mg->cblock);
			cell_defer(cache, mg->new_ocell, true);
		} else {
			/*
			 * The block was promoted via an overwrite, so it's dirty.
			 */
			set_dirty(cache, mg->new_oblock, mg->cblock);
			bio_endio(mg->new_ocell->holder, 0);
			cell_defer(cache, mg->new_ocell, false);
		}
		free_io_migration(mg);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
	struct cache *cache = mg->cache;

	if (read_err || write_err)
		mg->err = true;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_copy(struct dm_cache_migration *mg)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;
	sector_t cblock = from_cblock(mg->cblock);

	o_region.bdev = cache->origin_dev->bdev;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = cblock * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (mg->writeback || mg->demote) {
		/* demote */
		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
	} else {
		/* promote */
		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
	}

	if (r < 0) {
		DMERR_LIMIT("issuing migration failed");
		migration_failure(mg);
	}
}

static void overwrite_endio(struct bio *bio, int err)
{
	struct dm_cache_migration *mg = bio->bi_private;
	struct cache *cache = mg->cache;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
	unsigned long flags;

	dm_unhook_bio(&pb->hook_info, bio);

	if (err)
		mg->err = true;

	mg->requeue_holder = false;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(mg->cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);

	/*
	 * No need to inc_ds() here, since the cell will be held for the
	 * duration of the io.
	 */
	generic_make_request(bio);
}

static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}

static void avoid_copy(struct dm_cache_migration *mg)
{
	atomic_inc(&mg->cache->stats.copies_avoided);
	migration_success_pre_commit(mg);
}

static void calc_discard_block_range(struct cache *cache, struct bio *bio,
				     dm_dblock_t *b, dm_dblock_t *e)
{
	sector_t sb = bio->bi_iter.bi_sector;
	sector_t se = bio_end_sector(bio);

	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));

	if (se - sb < cache->discard_block_size)
		*e = *b;
	else
		*e = to_dblock(block_div(se, cache->discard_block_size));
}

static void issue_discard(struct dm_cache_migration *mg)
{
	dm_dblock_t b, e;
	struct bio *bio = mg->new_ocell->holder;

	calc_discard_block_range(mg->cache, bio, &b, &e);
	while (b != e) {
		set_discard(mg->cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	bio_endio(bio, 0);
	cell_defer(mg->cache, mg->new_ocell, false);
	free_migration(mg);
}

static void issue_copy_or_discard(struct dm_cache_migration *mg)
{
	bool avoid;
	struct cache *cache = mg->cache;

	if (mg->discard) {
		issue_discard(mg);
		return;
	}

	if (mg->writeback || mg->demote)
		avoid = !is_dirty(cache, mg->cblock) ||
			is_discarded_oblock(cache, mg->old_oblock);
	else {
		struct bio *bio = mg->new_ocell->holder;

		avoid = is_discarded_oblock(cache, mg->new_oblock);

		if (writeback_mode(&cache->features) &&
		    !avoid && bio_writes_complete_block(cache, bio)) {
			issue_overwrite(mg, bio);
			return;
		}
	}

	avoid ? avoid_copy(mg) : issue_copy(mg);
}

static void complete_migration(struct dm_cache_migration *mg)
{
	if (mg->err)
		migration_failure(mg);
	else
		migration_success_pre_commit(mg);
}

static void process_migrations(struct cache *cache, struct list_head *head,
			       void (*fn)(struct dm_cache_migration *))
{
	unsigned long flags;
	struct list_head list;
	struct dm_cache_migration *mg, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(head, &list);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(mg, tmp, &list, list)
		fn(mg);
}

static void __queue_quiesced_migration(struct dm_cache_migration *mg)
{
	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
}

static void queue_quiesced_migration(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	spin_lock_irqsave(&cache->lock, flags);
	__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
{
	unsigned long flags;
	struct dm_cache_migration *mg, *tmp;

	spin_lock_irqsave(&cache->lock, flags);
	list_for_each_entry_safe(mg, tmp, work, list)
		__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void check_for_quiesced_migrations(struct cache *cache,
					  struct per_bio_data *pb)
{
	struct list_head work;

	if (!pb->all_io_entry)
		return;

	INIT_LIST_HEAD(&work);
	dm_deferred_entry_dec(pb->all_io_entry, &work);

	if (!list_empty(&work))
		queue_quiesced_migrations(cache, &work);
}

static void quiesce_migration(struct dm_cache_migration *mg)
{
	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
		queue_quiesced_migration(mg);
}

static void promote(struct cache *cache, struct prealloc *structs,
		    dm_oblock_t oblock, dm_cblock_t cblock,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->new_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void writeback(struct cache *cache, struct prealloc *structs,
		      dm_oblock_t oblock, dm_cblock_t cblock,
		      struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = true;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				dm_cblock_t cblock,
				struct dm_bio_prison_cell *old_ocell,
				struct dm_bio_prison_cell *new_ocell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

/*
 * Invalidate a cache entry.  No writeback occurs; any changes in the cache
 * block are thrown away.
 */
static void invalidate(struct cache *cache, struct prealloc *structs,
		       dm_oblock_t oblock, dm_cblock_t cblock,
		       struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = true;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void discard(struct cache *cache, struct prealloc *structs,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = true;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = false;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(bio->bi_iter.bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	/*
	 * REQ_FLUSH is not directed at any particular block so we don't
	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
	 * by dm-core.
	 */
	issue(cache, bio);
}

static void process_discard_bio(struct cache *cache, struct prealloc *structs,
				struct bio *bio)
{
	int r;
	dm_dblock_t b, e;
	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;

	calc_discard_block_range(cache, bio, &b, &e);
	if (b == e) {
		bio_endio(bio, 0);
		return;
	}

	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
			     (cell_free_fn) prealloc_put_cell,
			     structs, &new_ocell);
	if (r > 0)
		return;

	discard(cache, structs, new_ocell);
}

static bool spare_migration_bandwidth(struct cache *cache)
{
	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
		cache->sectors_per_block;
	return current_volume < cache->migration_threshold;
}

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_miss : &cache->stats.write_miss);
}

static void process_bio(struct cache *cache, struct prealloc *structs,
			struct bio *bio)
{
	int r;
	bool release_cell = true;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
	struct policy_result lookup_result;
	bool passthrough = passthrough_mode(&cache->features);
	bool discarded_block, can_migrate;

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain(cache, block, bio, cell_prealloc,
		       (cell_free_fn) prealloc_put_cell,
		       structs, &new_ocell);
	if (r > 0)
		return;

	discarded_block = is_discarded_oblock(cache, block);
	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));

	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
		       bio, &lookup_result);

	if (r == -EWOULDBLOCK)
		/* migration has been denied */
		lookup_result.op = POLICY_MISS;

	switch (lookup_result.op) {
	case POLICY_HIT:
		if (passthrough) {
			inc_miss_counter(cache, bio);

			/*
			 * Passthrough always maps to the origin,
			 * invalidating any cache blocks that are written
			 * to.
			 */

			if (bio_data_dir(bio) == WRITE) {
				atomic_inc(&cache->stats.demotion);
				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
				release_cell = false;

			} else {
				/* FIXME: factor out issue_origin() */
				remap_to_origin_clear_discard(cache, bio, block);
				inc_and_issue(cache, bio, new_ocell);
			}
		} else {
			inc_hit_counter(cache, bio);

			if (bio_data_dir(bio) == WRITE &&
			    writethrough_mode(&cache->features) &&
			    !is_dirty(cache, lookup_result.cblock)) {
				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
				inc_and_issue(cache, bio, new_ocell);

			} else {
				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				inc_and_issue(cache, bio, new_ocell);
			}
		}

		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		remap_to_origin_clear_discard(cache, bio, block);
		inc_and_issue(cache, bio, new_ocell);
		break;

	case POLICY_NEW:
		atomic_inc(&cache->stats.promotion);
		promote(cache, structs, block, lookup_result.cblock, new_ocell);
		release_cell = false;
		break;

	case POLICY_REPLACE:
		cell_prealloc = prealloc_get_cell(structs);
		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
			       (cell_free_fn) prealloc_put_cell,
			       structs, &old_ocell);
		if (r > 0) {
			/*
			 * We have to be careful to avoid lock inversion of
			 * the cells.  So we back off, and wait for the
			 * old_ocell to become free.
			 */
			policy_force_mapping(cache->policy, block,
					     lookup_result.old_oblock);
			atomic_inc(&cache->stats.cache_cell_clash);
			break;
		}
		atomic_inc(&cache->stats.demotion);
		atomic_inc(&cache->stats.promotion);

		demote_then_promote(cache, structs, lookup_result.old_oblock,
				    block, lookup_result.cblock,
				    old_ocell, new_ocell);
		release_cell = false;
		break;

	default:
		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
	}

	if (release_cell)
		cell_defer(cache, new_ocell, false);
}

static int need_commit_due_to_time(struct cache *cache)
{
	return jiffies < cache->last_commit_jiffies ||
	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
}

static int commit_if_needed(struct cache *cache)
{
	int r = 0;

	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
	    dm_cache_changed_this_transaction(cache->cmd)) {
		atomic_inc(&cache->stats.commit_count);
		cache->commit_requested = false;
		r = dm_cache_commit(cache->cmd, false);
		cache->last_commit_jiffies = jiffies;
	}

	return r;
}

static void process_deferred_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;
	struct prealloc structs;

	memset(&structs, 0, sizeof(structs));
	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while (!bio_list_empty(&bios)) {
		/*
		 * If we've got no free migration structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (prealloc_data_structs(cache, &structs)) {
			spin_lock_irqsave(&cache->lock, flags);
			bio_list_merge(&cache->deferred_bios, &bios);
			spin_unlock_irqrestore(&cache->lock, flags);
			break;
		}

		bio = bio_list_pop(&bios);

		if (bio->bi_rw & REQ_FLUSH)
			process_flush_bio(cache, bio);
		else if (bio->bi_rw & REQ_DISCARD)
			process_discard_bio(cache, &structs, bio);
		else
			process_bio(cache, &structs, bio);
	}

	prealloc_free_structs(cache, &structs);
}

static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_flush_bios);
	bio_list_init(&cache->deferred_flush_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	/*
	 * These bios have already been through inc_ds()
	 */
	while ((bio = bio_list_pop(&bios)))
		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}

static void process_deferred_writethrough_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
	bio_list_init(&cache->deferred_writethrough_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	/*
	 * These bios have already been through inc_ds()
	 */
	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void writeback_some_dirty_blocks(struct cache *cache)
{
	int r = 0;
	dm_oblock_t oblock;
	dm_cblock_t cblock;
	struct prealloc structs;
	struct dm_bio_prison_cell *old_ocell;

	memset(&structs, 0, sizeof(structs));

	while (spare_migration_bandwidth(cache)) {
		if (prealloc_data_structs(cache, &structs))
			break;

		r = policy_writeback_work(cache->policy, &oblock, &cblock);
		if (r)
			break;

		r = get_cell(cache, oblock, &structs, &old_ocell);
		if (r) {
			policy_set_dirty(cache->policy, oblock);
			break;
		}

		writeback(cache, &structs, oblock, cblock, old_ocell);
	}

	prealloc_free_structs(cache, &structs);
}

/*----------------------------------------------------------------
 * Invalidations.
 * Dropping something from the cache *without* writing back.
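 *
 * For example, a request whose cblock_range has begin = 0 and end = 100
 * covers cblocks 0..99 (the range is half-open; see struct cblock_range
 * above).  Requests are queued on cache->invalidation_requests and are
 * serviced below by the worker thread.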
 *--------------------------------------------------------------*/

static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
{
	int r = 0;
	uint64_t begin = from_cblock(req->cblocks->begin);
	uint64_t end = from_cblock(req->cblocks->end);

	while (begin != end) {
		r = policy_remove_cblock(cache->policy, to_cblock(begin));
		if (!r) {
			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
			if (r)
				break;

		} else if (r == -ENODATA) {
			/* harmless, already unmapped */
			r = 0;

		} else {
			DMERR("policy_remove_cblock failed");
			break;
		}

		begin++;
	}

	cache->commit_requested = true;

	req->err = r;
	atomic_set(&req->complete, 1);

	wake_up(&req->result_wait);
}

static void process_invalidation_requests(struct cache *cache)
{
	struct list_head list;
	struct invalidation_request *req, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock(&cache->invalidation_lock);
	list_splice_init(&cache->invalidation_requests, &list);
	spin_unlock(&cache->invalidation_lock);

	list_for_each_entry_safe (req, tmp, &list, list)
		process_invalidation_request(cache, req);
}

/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/
static bool is_quiescing(struct cache *cache)
{
	return atomic_read(&cache->quiescing);
}

static void ack_quiescing(struct cache *cache)
{
	if (is_quiescing(cache)) {
		atomic_inc(&cache->quiescing_ack);
		wake_up(&cache->quiescing_wait);
	}
}

static void wait_for_quiescing_ack(struct cache *cache)
{
	wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
}

static void start_quiescing(struct cache *cache)
{
	atomic_inc(&cache->quiescing);
	wait_for_quiescing_ack(cache);
}

static void stop_quiescing(struct cache *cache)
{
	atomic_set(&cache->quiescing, 0);
	atomic_set(&cache->quiescing_ack, 0);
}

static void wait_for_migrations(struct cache *cache)
{
	wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
}

static void stop_worker(struct cache *cache)
{
	cancel_delayed_work(&cache->waker);
	flush_workqueue(cache->wq);
}

static void requeue_deferred_io(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);

	while ((bio = bio_list_pop(&bios)))
		bio_endio(bio, DM_ENDIO_REQUEUE);
}

static int more_work(struct cache *cache)
{
	if (is_quiescing(cache))
		return !list_empty(&cache->quiesced_migrations) ||
			!list_empty(&cache->completed_migrations) ||
			!list_empty(&cache->need_commit_migrations);
	else
		return !bio_list_empty(&cache->deferred_bios) ||
			!bio_list_empty(&cache->deferred_flush_bios) ||
			!bio_list_empty(&cache->deferred_writethrough_bios) ||
			!list_empty(&cache->quiesced_migrations) ||
			!list_empty(&cache->completed_migrations) ||
			!list_empty(&cache->need_commit_migrations) ||
			cache->invalidate;
}

static void do_worker(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, worker);

	do {
		if (!is_quiescing(cache)) {
			writeback_some_dirty_blocks(cache);
			process_deferred_writethrough_bios(cache);
			process_deferred_bios(cache);
			process_invalidation_requests(cache);
		}

		process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
		process_migrations(cache, &cache->completed_migrations, complete_migration);

		if (commit_if_needed(cache)) {
			process_deferred_flush_bios(cache, false);
			process_migrations(cache, &cache->need_commit_migrations, migration_failure);

			/*
			 * FIXME: rollback metadata or just go into a
			 * failure mode and error everything
			 */
		} else {
			process_deferred_flush_bios(cache, true);
			process_migrations(cache, &cache->need_commit_migrations,
					   migration_success_post_commit);
		}

		ack_quiescing(cache);

	} while (more_work(cache));
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
	policy_tick(cache->policy);
	wake_worker(cache);
	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static int is_congested(struct dm_dev *dev, int bdi_bits)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);
	return bdi_congested(&q->backing_dev_info, bdi_bits);
}

static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct cache *cache = container_of(cb, struct cache, callbacks);

	return is_congested(cache->origin_dev, bdi_bits) ||
	       is_congested(cache->cache_dev, bdi_bits);
}

/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/

/*
 * This function gets called on the error paths of the constructor, so we
 * have to cope with a partially initialised struct.
 */
static void destroy(struct cache *cache)
{
	unsigned i;

	if (cache->migration_pool)
		mempool_destroy(cache->migration_pool);

	if (cache->all_io_ds)
		dm_deferred_set_destroy(cache->all_io_ds);

	if (cache->prison)
		dm_bio_prison_destroy(cache->prison);

	if (cache->wq)
		destroy_workqueue(cache->wq);

	if (cache->dirty_bitset)
		free_bitset(cache->dirty_bitset);

	if (cache->discard_bitset)
		free_bitset(cache->discard_bitset);

	if (cache->copier)
		dm_kcopyd_client_destroy(cache->copier);

	if (cache->cmd)
		dm_cache_metadata_close(cache->cmd);

	if (cache->metadata_dev)
		dm_put_device(cache->ti, cache->metadata_dev);

	if (cache->origin_dev)
		dm_put_device(cache->ti, cache->origin_dev);

	if (cache->cache_dev)
		dm_put_device(cache->ti, cache->cache_dev);

	if (cache->policy)
		dm_cache_policy_destroy(cache->policy);

	for (i = 0; i < cache->nr_ctr_args ; i++)
		kfree(cache->ctr_args[i]);
	kfree(cache->ctr_args);

	kfree(cache);
}

static void cache_dtr(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	destroy(cache);
}

static sector_t get_dev_size(struct dm_dev *dev)
{
	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
}

/*----------------------------------------------------------------*/

/*
 * Construct a cache device mapping.
 *
 * cache <metadata dev> <cache dev> <origin dev> <block size>
 *       <#feature args> [<feature arg>]*
 *       <policy> <#policy args> [<policy arg>]*
 *
 * metadata dev    : fast device holding the persistent metadata
 * cache dev	   : fast device holding cached data blocks
 * origin dev	   : slow device holding original data blocks
 * block size	   : cache unit size in sectors
 *
 * #feature args   : number of feature arguments passed
 * feature args    : writethrough.  (The default is writeback.)
 *
 * policy	   : the replacement policy to use
 * #policy args    : an even number of policy arguments corresponding
 *		     to key/value pairs passed to the policy
 * policy args	   : key/value pairs passed to the policy
 *		     E.g. 'sequential_threshold 1024'
 *		     See cache-policies.txt for details.
 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *		     content from being different from origin block content.
 *		     Without this argument, the default behaviour is to write
 *		     back cache block contents later for performance reasons,
 *		     so they may differ from the corresponding origin blocks.
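 *
 * As a rough illustration only (the device paths and sizes below are
 * examples, not anything defined in this file), a table line following
 * the format above might look like:
 *
 *   0 41943040 cache /dev/fast-meta /dev/fast-data /dev/slow 512 1 writeback default 0
 *
 * i.e. a 512 sector (256KB) cache block size, the 'writeback' feature
 * argument, and a policy named 'default' with no policy arguments.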
 */
struct cache_args {
	struct dm_target *ti;

	struct dm_dev *metadata_dev;

	struct dm_dev *cache_dev;
	sector_t cache_sectors;

	struct dm_dev *origin_dev;
	sector_t origin_sectors;

	uint32_t block_size;

	const char *policy_name;
	int policy_argc;
	const char **policy_argv;

	struct cache_features features;
};

static void destroy_cache_args(struct cache_args *ca)
{
	if (ca->metadata_dev)
		dm_put_device(ca->ti, ca->metadata_dev);

	if (ca->cache_dev)
		dm_put_device(ca->ti, ca->cache_dev);

	if (ca->origin_dev)
		dm_put_device(ca->ti, ca->origin_dev);

	kfree(ca);
}

static bool at_least_one_arg(struct dm_arg_set *as, char **error)
{
	if (!as->argc) {
		*error = "Insufficient args";
		return false;
	}

	return true;
}

static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
			      char **error)
{
	int r;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->metadata_dev);
	if (r) {
		*error = "Error opening metadata device";
		return r;
	}

	metadata_dev_size = get_dev_size(ca->metadata_dev);
	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);

	return 0;
}

static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
			   char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->cache_dev);
	if (r) {
		*error = "Error opening cache device";
		return r;
	}
	ca->cache_sectors = get_dev_size(ca->cache_dev);

	return 0;
}

static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->origin_dev);
	if (r) {
		*error = "Error opening origin device";
		return r;
	}

	ca->origin_sectors = get_dev_size(ca->origin_dev);
	if (ca->ti->len > ca->origin_sectors) {
		*error = "Device size larger than cached device";
		return -EINVAL;
	}

	return 0;
}

static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	unsigned long block_size;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		*error = "Invalid data block size";
		return -EINVAL;
	}

	if (block_size > ca->cache_sectors) {
		*error = "Data block size is larger than the cache device";
		return -EINVAL;
	}

	ca->block_size = block_size;

	return 0;
}

static void init_features(struct cache_features *cf)
{
	cf->mode = CM_WRITE;
CM_IO_WRITEBACK; 2122 } 2123 2124 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2125 char **error) 2126 { 2127 static struct dm_arg _args[] = { 2128 {0, 1, "Invalid number of cache feature arguments"}, 2129 }; 2130 2131 int r; 2132 unsigned argc; 2133 const char *arg; 2134 struct cache_features *cf = &ca->features; 2135 2136 init_features(cf); 2137 2138 r = dm_read_arg_group(_args, as, &argc, error); 2139 if (r) 2140 return -EINVAL; 2141 2142 while (argc--) { 2143 arg = dm_shift_arg(as); 2144 2145 if (!strcasecmp(arg, "writeback")) 2146 cf->io_mode = CM_IO_WRITEBACK; 2147 2148 else if (!strcasecmp(arg, "writethrough")) 2149 cf->io_mode = CM_IO_WRITETHROUGH; 2150 2151 else if (!strcasecmp(arg, "passthrough")) 2152 cf->io_mode = CM_IO_PASSTHROUGH; 2153 2154 else { 2155 *error = "Unrecognised cache feature requested"; 2156 return -EINVAL; 2157 } 2158 } 2159 2160 return 0; 2161 } 2162 2163 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2164 char **error) 2165 { 2166 static struct dm_arg _args[] = { 2167 {0, 1024, "Invalid number of policy arguments"}, 2168 }; 2169 2170 int r; 2171 2172 if (!at_least_one_arg(as, error)) 2173 return -EINVAL; 2174 2175 ca->policy_name = dm_shift_arg(as); 2176 2177 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2178 if (r) 2179 return -EINVAL; 2180 2181 ca->policy_argv = (const char **)as->argv; 2182 dm_consume_args(as, ca->policy_argc); 2183 2184 return 0; 2185 } 2186 2187 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2188 char **error) 2189 { 2190 int r; 2191 struct dm_arg_set as; 2192 2193 as.argc = argc; 2194 as.argv = argv; 2195 2196 r = parse_metadata_dev(ca, &as, error); 2197 if (r) 2198 return r; 2199 2200 r = parse_cache_dev(ca, &as, error); 2201 if (r) 2202 return r; 2203 2204 r = parse_origin_dev(ca, &as, error); 2205 if (r) 2206 return r; 2207 2208 r = parse_block_size(ca, &as, error); 2209 if (r) 2210 return r; 2211 2212 r = parse_features(ca, &as, error); 2213 if (r) 2214 return r; 2215 2216 r = parse_policy(ca, &as, error); 2217 if (r) 2218 return r; 2219 2220 return 0; 2221 } 2222 2223 /*----------------------------------------------------------------*/ 2224 2225 static struct kmem_cache *migration_cache; 2226 2227 #define NOT_CORE_OPTION 1 2228 2229 static int process_config_option(struct cache *cache, const char *key, const char *value) 2230 { 2231 unsigned long tmp; 2232 2233 if (!strcasecmp(key, "migration_threshold")) { 2234 if (kstrtoul(value, 10, &tmp)) 2235 return -EINVAL; 2236 2237 cache->migration_threshold = tmp; 2238 return 0; 2239 } 2240 2241 return NOT_CORE_OPTION; 2242 } 2243 2244 static int set_config_value(struct cache *cache, const char *key, const char *value) 2245 { 2246 int r = process_config_option(cache, key, value); 2247 2248 if (r == NOT_CORE_OPTION) 2249 r = policy_set_config_value(cache->policy, key, value); 2250 2251 if (r) 2252 DMWARN("bad config value for %s: %s", key, value); 2253 2254 return r; 2255 } 2256 2257 static int set_config_values(struct cache *cache, int argc, const char **argv) 2258 { 2259 int r = 0; 2260 2261 if (argc & 1) { 2262 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2263 return -EINVAL; 2264 } 2265 2266 while (argc) { 2267 r = set_config_value(cache, argv[0], argv[1]); 2268 if (r) 2269 break; 2270 2271 argc -= 2; 2272 argv += 2; 2273 } 2274 2275 return r; 2276 } 2277 2278 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2279 char **error) 
2280 { 2281 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2282 cache->cache_size, 2283 cache->origin_sectors, 2284 cache->sectors_per_block); 2285 if (IS_ERR(p)) { 2286 *error = "Error creating cache's policy"; 2287 return PTR_ERR(p); 2288 } 2289 cache->policy = p; 2290 2291 return 0; 2292 } 2293 2294 /* 2295 * We want the discard block size to be at least the size of the cache 2296 * block size and have no more than 2^14 discard blocks across the origin. 2297 */ 2298 #define MAX_DISCARD_BLOCKS (1 << 14) 2299 2300 static bool too_many_discard_blocks(sector_t discard_block_size, 2301 sector_t origin_size) 2302 { 2303 (void) sector_div(origin_size, discard_block_size); 2304 2305 return origin_size > MAX_DISCARD_BLOCKS; 2306 } 2307 2308 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2309 sector_t origin_size) 2310 { 2311 sector_t discard_block_size = cache_block_size; 2312 2313 if (origin_size) 2314 while (too_many_discard_blocks(discard_block_size, origin_size)) 2315 discard_block_size *= 2; 2316 2317 return discard_block_size; 2318 } 2319 2320 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2321 { 2322 dm_block_t nr_blocks = from_cblock(size); 2323 2324 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2325 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2326 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2327 "Please consider increasing the cache block size to reduce the overall cache block count.", 2328 (unsigned long long) nr_blocks); 2329 2330 cache->cache_size = size; 2331 } 2332 2333 #define DEFAULT_MIGRATION_THRESHOLD 2048 2334 2335 static int cache_create(struct cache_args *ca, struct cache **result) 2336 { 2337 int r = 0; 2338 char **error = &ca->ti->error; 2339 struct cache *cache; 2340 struct dm_target *ti = ca->ti; 2341 dm_block_t origin_blocks; 2342 struct dm_cache_metadata *cmd; 2343 bool may_format = ca->features.mode == CM_WRITE; 2344 2345 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2346 if (!cache) 2347 return -ENOMEM; 2348 2349 cache->ti = ca->ti; 2350 ti->private = cache; 2351 ti->num_flush_bios = 2; 2352 ti->flush_supported = true; 2353 2354 ti->num_discard_bios = 1; 2355 ti->discards_supported = true; 2356 ti->discard_zeroes_data_unsupported = true; 2357 ti->split_discard_bios = false; 2358 2359 cache->features = ca->features; 2360 ti->per_bio_data_size = get_per_bio_data_size(cache); 2361 2362 cache->callbacks.congested_fn = cache_is_congested; 2363 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2364 2365 cache->metadata_dev = ca->metadata_dev; 2366 cache->origin_dev = ca->origin_dev; 2367 cache->cache_dev = ca->cache_dev; 2368 2369 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2370 2371 /* FIXME: factor out this whole section */ 2372 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2373 origin_blocks = block_div(origin_blocks, ca->block_size); 2374 cache->origin_blocks = to_oblock(origin_blocks); 2375 2376 cache->sectors_per_block = ca->block_size; 2377 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2378 r = -EINVAL; 2379 goto bad; 2380 } 2381 2382 if (ca->block_size & (ca->block_size - 1)) { 2383 dm_block_t cache_size = ca->cache_sectors; 2384 2385 cache->sectors_per_block_shift = -1; 2386 cache_size = block_div(cache_size, ca->block_size); 2387 set_cache_size(cache, to_cblock(cache_size)); 2388 } else { 2389 cache->sectors_per_block_shift = 
__ffs(ca->block_size); 2390 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2391 } 2392 2393 r = create_cache_policy(cache, ca, error); 2394 if (r) 2395 goto bad; 2396 2397 cache->policy_nr_args = ca->policy_argc; 2398 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2399 2400 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2401 if (r) { 2402 *error = "Error setting cache policy's config values"; 2403 goto bad; 2404 } 2405 2406 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2407 ca->block_size, may_format, 2408 dm_cache_policy_get_hint_size(cache->policy)); 2409 if (IS_ERR(cmd)) { 2410 *error = "Error creating metadata object"; 2411 r = PTR_ERR(cmd); 2412 goto bad; 2413 } 2414 cache->cmd = cmd; 2415 2416 if (passthrough_mode(&cache->features)) { 2417 bool all_clean; 2418 2419 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2420 if (r) { 2421 *error = "dm_cache_metadata_all_clean() failed"; 2422 goto bad; 2423 } 2424 2425 if (!all_clean) { 2426 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2427 r = -EINVAL; 2428 goto bad; 2429 } 2430 } 2431 2432 spin_lock_init(&cache->lock); 2433 bio_list_init(&cache->deferred_bios); 2434 bio_list_init(&cache->deferred_flush_bios); 2435 bio_list_init(&cache->deferred_writethrough_bios); 2436 INIT_LIST_HEAD(&cache->quiesced_migrations); 2437 INIT_LIST_HEAD(&cache->completed_migrations); 2438 INIT_LIST_HEAD(&cache->need_commit_migrations); 2439 atomic_set(&cache->nr_allocated_migrations, 0); 2440 atomic_set(&cache->nr_io_migrations, 0); 2441 init_waitqueue_head(&cache->migration_wait); 2442 2443 init_waitqueue_head(&cache->quiescing_wait); 2444 atomic_set(&cache->quiescing, 0); 2445 atomic_set(&cache->quiescing_ack, 0); 2446 2447 r = -ENOMEM; 2448 atomic_set(&cache->nr_dirty, 0); 2449 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2450 if (!cache->dirty_bitset) { 2451 *error = "could not allocate dirty bitset"; 2452 goto bad; 2453 } 2454 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2455 2456 cache->discard_block_size = 2457 calculate_discard_block_size(cache->sectors_per_block, 2458 cache->origin_sectors); 2459 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2460 cache->discard_block_size)); 2461 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2462 if (!cache->discard_bitset) { 2463 *error = "could not allocate discard bitset"; 2464 goto bad; 2465 } 2466 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2467 2468 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2469 if (IS_ERR(cache->copier)) { 2470 *error = "could not create kcopyd client"; 2471 r = PTR_ERR(cache->copier); 2472 goto bad; 2473 } 2474 2475 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2476 if (!cache->wq) { 2477 *error = "could not create workqueue for metadata object"; 2478 goto bad; 2479 } 2480 INIT_WORK(&cache->worker, do_worker); 2481 INIT_DELAYED_WORK(&cache->waker, do_waker); 2482 cache->last_commit_jiffies = jiffies; 2483 2484 cache->prison = dm_bio_prison_create(); 2485 if (!cache->prison) { 2486 *error = "could not create bio prison"; 2487 goto bad; 2488 } 2489 2490 cache->all_io_ds = dm_deferred_set_create(); 2491 if (!cache->all_io_ds) { 2492 *error = "could not create all_io deferred set"; 2493 goto bad; 2494 } 2495 2496 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2497 migration_cache); 
2498 if (!cache->migration_pool) { 2499 *error = "Error creating cache's migration mempool"; 2500 goto bad; 2501 } 2502 2503 cache->need_tick_bio = true; 2504 cache->sized = false; 2505 cache->invalidate = false; 2506 cache->commit_requested = false; 2507 cache->loaded_mappings = false; 2508 cache->loaded_discards = false; 2509 2510 load_stats(cache); 2511 2512 atomic_set(&cache->stats.demotion, 0); 2513 atomic_set(&cache->stats.promotion, 0); 2514 atomic_set(&cache->stats.copies_avoided, 0); 2515 atomic_set(&cache->stats.cache_cell_clash, 0); 2516 atomic_set(&cache->stats.commit_count, 0); 2517 atomic_set(&cache->stats.discard_count, 0); 2518 2519 spin_lock_init(&cache->invalidation_lock); 2520 INIT_LIST_HEAD(&cache->invalidation_requests); 2521 2522 *result = cache; 2523 return 0; 2524 2525 bad: 2526 destroy(cache); 2527 return r; 2528 } 2529 2530 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2531 { 2532 unsigned i; 2533 const char **copy; 2534 2535 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2536 if (!copy) 2537 return -ENOMEM; 2538 for (i = 0; i < argc; i++) { 2539 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2540 if (!copy[i]) { 2541 while (i--) 2542 kfree(copy[i]); 2543 kfree(copy); 2544 return -ENOMEM; 2545 } 2546 } 2547 2548 cache->nr_ctr_args = argc; 2549 cache->ctr_args = copy; 2550 2551 return 0; 2552 } 2553 2554 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2555 { 2556 int r = -EINVAL; 2557 struct cache_args *ca; 2558 struct cache *cache = NULL; 2559 2560 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2561 if (!ca) { 2562 ti->error = "Error allocating memory for cache"; 2563 return -ENOMEM; 2564 } 2565 ca->ti = ti; 2566 2567 r = parse_cache_args(ca, argc, argv, &ti->error); 2568 if (r) 2569 goto out; 2570 2571 r = cache_create(ca, &cache); 2572 if (r) 2573 goto out; 2574 2575 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2576 if (r) { 2577 destroy(cache); 2578 goto out; 2579 } 2580 2581 ti->private = cache; 2582 2583 out: 2584 destroy_cache_args(ca); 2585 return r; 2586 } 2587 2588 static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell) 2589 { 2590 int r; 2591 dm_oblock_t block = get_bio_block(cache, bio); 2592 size_t pb_data_size = get_per_bio_data_size(cache); 2593 bool can_migrate = false; 2594 bool discarded_block; 2595 struct policy_result lookup_result; 2596 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 2597 2598 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2599 /* 2600 * This can only occur if the io goes to a partial block at 2601 * the end of the origin device. We don't cache these. 2602 * Just remap to the origin and carry on. 2603 */ 2604 remap_to_origin(cache, bio); 2605 return DM_MAPIO_REMAPPED; 2606 } 2607 2608 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { 2609 defer_bio(cache, bio); 2610 return DM_MAPIO_SUBMITTED; 2611 } 2612 2613 /* 2614 * Check to see if that block is currently migrating. 
2615 */ 2616 *cell = alloc_prison_cell(cache); 2617 if (!*cell) { 2618 defer_bio(cache, bio); 2619 return DM_MAPIO_SUBMITTED; 2620 } 2621 2622 r = bio_detain(cache, block, bio, *cell, 2623 (cell_free_fn) free_prison_cell, 2624 cache, cell); 2625 if (r) { 2626 if (r < 0) 2627 defer_bio(cache, bio); 2628 2629 return DM_MAPIO_SUBMITTED; 2630 } 2631 2632 discarded_block = is_discarded_oblock(cache, block); 2633 2634 r = policy_map(cache->policy, block, false, can_migrate, discarded_block, 2635 bio, &lookup_result); 2636 if (r == -EWOULDBLOCK) { 2637 cell_defer(cache, *cell, true); 2638 return DM_MAPIO_SUBMITTED; 2639 2640 } else if (r) { 2641 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); 2642 cell_defer(cache, *cell, false); 2643 bio_io_error(bio); 2644 return DM_MAPIO_SUBMITTED; 2645 } 2646 2647 r = DM_MAPIO_REMAPPED; 2648 switch (lookup_result.op) { 2649 case POLICY_HIT: 2650 if (passthrough_mode(&cache->features)) { 2651 if (bio_data_dir(bio) == WRITE) { 2652 /* 2653 * We need to invalidate this block, so 2654 * defer for the worker thread. 2655 */ 2656 cell_defer(cache, *cell, true); 2657 r = DM_MAPIO_SUBMITTED; 2658 2659 } else { 2660 inc_miss_counter(cache, bio); 2661 remap_to_origin_clear_discard(cache, bio, block); 2662 } 2663 2664 } else { 2665 inc_hit_counter(cache, bio); 2666 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 2667 !is_dirty(cache, lookup_result.cblock)) 2668 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2669 else 2670 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 2671 } 2672 break; 2673 2674 case POLICY_MISS: 2675 inc_miss_counter(cache, bio); 2676 if (pb->req_nr != 0) { 2677 /* 2678 * This is a duplicate writethrough io that is no 2679 * longer needed because the block has been demoted. 
2680 */ 2681 bio_endio(bio, 0); 2682 cell_defer(cache, *cell, false); 2683 r = DM_MAPIO_SUBMITTED; 2684 2685 } else 2686 remap_to_origin_clear_discard(cache, bio, block); 2687 2688 break; 2689 2690 default: 2691 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, 2692 (unsigned) lookup_result.op); 2693 cell_defer(cache, *cell, false); 2694 bio_io_error(bio); 2695 r = DM_MAPIO_SUBMITTED; 2696 } 2697 2698 return r; 2699 } 2700 2701 static int cache_map(struct dm_target *ti, struct bio *bio) 2702 { 2703 int r; 2704 struct dm_bio_prison_cell *cell = NULL; 2705 struct cache *cache = ti->private; 2706 2707 r = __cache_map(cache, bio, &cell); 2708 if (r == DM_MAPIO_REMAPPED && cell) { 2709 inc_ds(cache, bio, cell); 2710 cell_defer(cache, cell, false); 2711 } 2712 2713 return r; 2714 } 2715 2716 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 2717 { 2718 struct cache *cache = ti->private; 2719 unsigned long flags; 2720 size_t pb_data_size = get_per_bio_data_size(cache); 2721 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 2722 2723 if (pb->tick) { 2724 policy_tick(cache->policy); 2725 2726 spin_lock_irqsave(&cache->lock, flags); 2727 cache->need_tick_bio = true; 2728 spin_unlock_irqrestore(&cache->lock, flags); 2729 } 2730 2731 check_for_quiesced_migrations(cache, pb); 2732 2733 return 0; 2734 } 2735 2736 static int write_dirty_bitset(struct cache *cache) 2737 { 2738 unsigned i, r; 2739 2740 for (i = 0; i < from_cblock(cache->cache_size); i++) { 2741 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 2742 is_dirty(cache, to_cblock(i))); 2743 if (r) 2744 return r; 2745 } 2746 2747 return 0; 2748 } 2749 2750 static int write_discard_bitset(struct cache *cache) 2751 { 2752 unsigned i, r; 2753 2754 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2755 cache->discard_nr_blocks); 2756 if (r) { 2757 DMERR("could not resize on-disk discard bitset"); 2758 return r; 2759 } 2760 2761 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2762 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2763 is_discarded(cache, to_dblock(i))); 2764 if (r) 2765 return r; 2766 } 2767 2768 return 0; 2769 } 2770 2771 /* 2772 * returns true on success 2773 */ 2774 static bool sync_metadata(struct cache *cache) 2775 { 2776 int r1, r2, r3, r4; 2777 2778 r1 = write_dirty_bitset(cache); 2779 if (r1) 2780 DMERR("could not write dirty bitset"); 2781 2782 r2 = write_discard_bitset(cache); 2783 if (r2) 2784 DMERR("could not write discard bitset"); 2785 2786 save_stats(cache); 2787 2788 r3 = dm_cache_write_hints(cache->cmd, cache->policy); 2789 if (r3) 2790 DMERR("could not write hints"); 2791 2792 /* 2793 * If writing the above metadata failed, we still commit, but don't 2794 * set the clean shutdown flag. This will effectively force every 2795 * dirty bit to be set on reload. 2796 */ 2797 r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3); 2798 if (r4) 2799 DMERR("could not write cache metadata. 
Data loss may occur."); 2800 2801 return !r1 && !r2 && !r3 && !r4; 2802 } 2803 2804 static void cache_postsuspend(struct dm_target *ti) 2805 { 2806 struct cache *cache = ti->private; 2807 2808 start_quiescing(cache); 2809 wait_for_migrations(cache); 2810 stop_worker(cache); 2811 requeue_deferred_io(cache); 2812 stop_quiescing(cache); 2813 2814 (void) sync_metadata(cache); 2815 } 2816 2817 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2818 bool dirty, uint32_t hint, bool hint_valid) 2819 { 2820 int r; 2821 struct cache *cache = context; 2822 2823 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 2824 if (r) 2825 return r; 2826 2827 if (dirty) 2828 set_dirty(cache, oblock, cblock); 2829 else 2830 clear_dirty(cache, oblock, cblock); 2831 2832 return 0; 2833 } 2834 2835 /* 2836 * The discard block size in the on disk metadata is not 2837 * neccessarily the same as we're currently using. So we have to 2838 * be careful to only set the discarded attribute if we know it 2839 * covers a complete block of the new size. 2840 */ 2841 struct discard_load_info { 2842 struct cache *cache; 2843 2844 /* 2845 * These blocks are sized using the on disk dblock size, rather 2846 * than the current one. 2847 */ 2848 dm_block_t block_size; 2849 dm_block_t discard_begin, discard_end; 2850 }; 2851 2852 static void discard_load_info_init(struct cache *cache, 2853 struct discard_load_info *li) 2854 { 2855 li->cache = cache; 2856 li->discard_begin = li->discard_end = 0; 2857 } 2858 2859 static void set_discard_range(struct discard_load_info *li) 2860 { 2861 sector_t b, e; 2862 2863 if (li->discard_begin == li->discard_end) 2864 return; 2865 2866 /* 2867 * Convert to sectors. 2868 */ 2869 b = li->discard_begin * li->block_size; 2870 e = li->discard_end * li->block_size; 2871 2872 /* 2873 * Then convert back to the current dblock size. 2874 */ 2875 b = dm_sector_div_up(b, li->cache->discard_block_size); 2876 sector_div(e, li->cache->discard_block_size); 2877 2878 /* 2879 * The origin may have shrunk, so we need to check we're still in 2880 * bounds. 2881 */ 2882 if (e > from_dblock(li->cache->discard_nr_blocks)) 2883 e = from_dblock(li->cache->discard_nr_blocks); 2884 2885 for (; b < e; b++) 2886 set_discard(li->cache, to_dblock(b)); 2887 } 2888 2889 static int load_discard(void *context, sector_t discard_block_size, 2890 dm_dblock_t dblock, bool discard) 2891 { 2892 struct discard_load_info *li = context; 2893 2894 li->block_size = discard_block_size; 2895 2896 if (discard) { 2897 if (from_dblock(dblock) == li->discard_end) 2898 /* 2899 * We're already in a discard range, just extend it. 2900 */ 2901 li->discard_end = li->discard_end + 1ULL; 2902 2903 else { 2904 /* 2905 * Emit the old range and start a new one. 2906 */ 2907 set_discard_range(li); 2908 li->discard_begin = from_dblock(dblock); 2909 li->discard_end = li->discard_begin + 1ULL; 2910 } 2911 } else { 2912 set_discard_range(li); 2913 li->discard_begin = li->discard_end = 0; 2914 } 2915 2916 return 0; 2917 } 2918 2919 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2920 { 2921 sector_t size = get_dev_size(cache->cache_dev); 2922 (void) sector_div(size, cache->sectors_per_block); 2923 return to_cblock(size); 2924 } 2925 2926 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2927 { 2928 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 2929 return true; 2930 2931 /* 2932 * We can't drop a dirty block when shrinking the cache. 
2933 */ 2934 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 2935 if (is_dirty(cache, new_size)) { 2936 DMERR("unable to shrink cache; cache block %llu is dirty", 2937 (unsigned long long) from_cblock(new_size)); 2938 return false; 2939 } 2940 new_size = to_cblock(from_cblock(new_size) + 1); 2941 } 2942 2943 return true; 2944 } 2945 2946 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 2947 { 2948 int r; 2949 2950 r = dm_cache_resize(cache->cmd, new_size); 2951 if (r) { 2952 DMERR("could not resize cache metadata"); 2953 return r; 2954 } 2955 2956 set_cache_size(cache, new_size); 2957 2958 return 0; 2959 } 2960 2961 static int cache_preresume(struct dm_target *ti) 2962 { 2963 int r = 0; 2964 struct cache *cache = ti->private; 2965 dm_cblock_t csize = get_cache_dev_size(cache); 2966 2967 /* 2968 * Check to see if the cache has resized. 2969 */ 2970 if (!cache->sized) { 2971 r = resize_cache_dev(cache, csize); 2972 if (r) 2973 return r; 2974 2975 cache->sized = true; 2976 2977 } else if (csize != cache->cache_size) { 2978 if (!can_resize(cache, csize)) 2979 return -EINVAL; 2980 2981 r = resize_cache_dev(cache, csize); 2982 if (r) 2983 return r; 2984 } 2985 2986 if (!cache->loaded_mappings) { 2987 r = dm_cache_load_mappings(cache->cmd, cache->policy, 2988 load_mapping, cache); 2989 if (r) { 2990 DMERR("could not load cache mappings"); 2991 return r; 2992 } 2993 2994 cache->loaded_mappings = true; 2995 } 2996 2997 if (!cache->loaded_discards) { 2998 struct discard_load_info li; 2999 3000 /* 3001 * The discard bitset could have been resized, or the 3002 * discard block size changed. To be safe we start by 3003 * setting every dblock to not discarded. 3004 */ 3005 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3006 3007 discard_load_info_init(cache, &li); 3008 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3009 if (r) { 3010 DMERR("could not load origin discards"); 3011 return r; 3012 } 3013 set_discard_range(&li); 3014 3015 cache->loaded_discards = true; 3016 } 3017 3018 return r; 3019 } 3020 3021 static void cache_resume(struct dm_target *ti) 3022 { 3023 struct cache *cache = ti->private; 3024 3025 cache->need_tick_bio = true; 3026 do_waker(&cache->waker.work); 3027 } 3028 3029 /* 3030 * Status format: 3031 * 3032 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3033 * <cache block size> <#used cache blocks>/<#total cache blocks> 3034 * <#read hits> <#read misses> <#write hits> <#write misses> 3035 * <#demotions> <#promotions> <#dirty> 3036 * <#features> <features>* 3037 * <#core args> <core args> 3038 * <policy name> <#policy args> <policy args>* 3039 */ 3040 static void cache_status(struct dm_target *ti, status_type_t type, 3041 unsigned status_flags, char *result, unsigned maxlen) 3042 { 3043 int r = 0; 3044 unsigned i; 3045 ssize_t sz = 0; 3046 dm_block_t nr_free_blocks_metadata = 0; 3047 dm_block_t nr_blocks_metadata = 0; 3048 char buf[BDEVNAME_SIZE]; 3049 struct cache *cache = ti->private; 3050 dm_cblock_t residency; 3051 3052 switch (type) { 3053 case STATUSTYPE_INFO: 3054 /* Commit to ensure statistics aren't out-of-date */ 3055 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) { 3056 r = dm_cache_commit(cache->cmd, false); 3057 if (r) 3058 DMERR("could not commit metadata for accurate status"); 3059 } 3060 3061 r = dm_cache_get_free_metadata_block_count(cache->cmd, 3062 &nr_free_blocks_metadata); 3063 if (r) { 3064 DMERR("could not get metadata free block count");
3065 goto err; 3066 } 3067 3068 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3069 if (r) { 3070 DMERR("could not get metadata device size"); 3071 goto err; 3072 } 3073 3074 residency = policy_residency(cache->policy); 3075 3076 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3077 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3078 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3079 (unsigned long long)nr_blocks_metadata, 3080 cache->sectors_per_block, 3081 (unsigned long long) from_cblock(residency), 3082 (unsigned long long) from_cblock(cache->cache_size), 3083 (unsigned) atomic_read(&cache->stats.read_hit), 3084 (unsigned) atomic_read(&cache->stats.read_miss), 3085 (unsigned) atomic_read(&cache->stats.write_hit), 3086 (unsigned) atomic_read(&cache->stats.write_miss), 3087 (unsigned) atomic_read(&cache->stats.demotion), 3088 (unsigned) atomic_read(&cache->stats.promotion), 3089 (unsigned long) atomic_read(&cache->nr_dirty)); 3090 3091 if (writethrough_mode(&cache->features)) 3092 DMEMIT("1 writethrough "); 3093 3094 else if (passthrough_mode(&cache->features)) 3095 DMEMIT("1 passthrough "); 3096 3097 else if (writeback_mode(&cache->features)) 3098 DMEMIT("1 writeback "); 3099 3100 else { 3101 DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode); 3102 goto err; 3103 } 3104 3105 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3106 3107 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3108 if (sz < maxlen) { 3109 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); 3110 if (r) 3111 DMERR("policy_emit_config_values returned %d", r); 3112 } 3113 3114 break; 3115 3116 case STATUSTYPE_TABLE: 3117 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3118 DMEMIT("%s ", buf); 3119 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3120 DMEMIT("%s ", buf); 3121 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3122 DMEMIT("%s", buf); 3123 3124 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3125 DMEMIT(" %s", cache->ctr_args[i]); 3126 if (cache->nr_ctr_args) 3127 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3128 } 3129 3130 return; 3131 3132 err: 3133 DMEMIT("Error"); 3134 } 3135 3136 /* 3137 * A cache block range can take two forms: 3138 * 3139 * i) A single cblock, eg. '3456' 3140 * ii) A begin and end cblock with a dash between, eg. 123-234 3141 */ 3142 static int parse_cblock_range(struct cache *cache, const char *str, 3143 struct cblock_range *result) 3144 { 3145 char dummy; 3146 uint64_t b, e; 3147 int r; 3148 3149 /* 3150 * Try and parse form (ii) first. 3151 */ 3152 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3153 if (r < 0) 3154 return r; 3155 3156 if (r == 2) { 3157 result->begin = to_cblock(b); 3158 result->end = to_cblock(e); 3159 return 0; 3160 } 3161 3162 /* 3163 * That didn't work, try form (i).
3164 */ 3165 r = sscanf(str, "%llu%c", &b, &dummy); 3166 if (r < 0) 3167 return r; 3168 3169 if (r == 1) { 3170 result->begin = to_cblock(b); 3171 result->end = to_cblock(from_cblock(result->begin) + 1u); 3172 return 0; 3173 } 3174 3175 DMERR("invalid cblock range '%s'", str); 3176 return -EINVAL; 3177 } 3178 3179 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3180 { 3181 uint64_t b = from_cblock(range->begin); 3182 uint64_t e = from_cblock(range->end); 3183 uint64_t n = from_cblock(cache->cache_size); 3184 3185 if (b >= n) { 3186 DMERR("begin cblock out of range: %llu >= %llu", b, n); 3187 return -EINVAL; 3188 } 3189 3190 if (e > n) { 3191 DMERR("end cblock out of range: %llu > %llu", e, n); 3192 return -EINVAL; 3193 } 3194 3195 if (b >= e) { 3196 DMERR("invalid cblock range: %llu >= %llu", b, e); 3197 return -EINVAL; 3198 } 3199 3200 return 0; 3201 } 3202 3203 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3204 { 3205 struct invalidation_request req; 3206 3207 INIT_LIST_HEAD(&req.list); 3208 req.cblocks = range; 3209 atomic_set(&req.complete, 0); 3210 req.err = 0; 3211 init_waitqueue_head(&req.result_wait); 3212 3213 spin_lock(&cache->invalidation_lock); 3214 list_add(&req.list, &cache->invalidation_requests); 3215 spin_unlock(&cache->invalidation_lock); 3216 wake_worker(cache); 3217 3218 wait_event(req.result_wait, atomic_read(&req.complete)); 3219 return req.err; 3220 } 3221 3222 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3223 const char **cblock_ranges) 3224 { 3225 int r = 0; 3226 unsigned i; 3227 struct cblock_range range; 3228 3229 if (!passthrough_mode(&cache->features)) { 3230 DMERR("cache has to be in passthrough mode for invalidation"); 3231 return -EPERM; 3232 } 3233 3234 for (i = 0; i < count; i++) { 3235 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3236 if (r) 3237 break; 3238 3239 r = validate_cblock_range(cache, &range); 3240 if (r) 3241 break; 3242 3243 /* 3244 * Pass begin and end origin blocks to the worker and wake it. 3245 */ 3246 r = request_invalidation(cache, &range); 3247 if (r) 3248 break; 3249 } 3250 3251 return r; 3252 } 3253 3254 /* 3255 * Supports 3256 * "<key> <value>" 3257 * and 3258 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3259 * 3260 * The key migration_threshold is supported by the cache target core. 3261 */ 3262 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3263 { 3264 struct cache *cache = ti->private; 3265 3266 if (!argc) 3267 return -EINVAL; 3268 3269 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3270 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3271 3272 if (argc != 2) 3273 return -EINVAL; 3274 3275 return set_config_value(cache, argv[0], argv[1]); 3276 } 3277 3278 static int cache_iterate_devices(struct dm_target *ti, 3279 iterate_devices_callout_fn fn, void *data) 3280 { 3281 int r = 0; 3282 struct cache *cache = ti->private; 3283 3284 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3285 if (!r) 3286 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3287 3288 return r; 3289 } 3290 3291 /* 3292 * We assume I/O is going to the origin (which is the volume 3293 * more likely to have restrictions e.g. by being striped). 3294 * (Looking up the exact location of the data would be expensive 3295 * and could always be out of date by the time the bio is submitted.) 
3296 */ 3297 static int cache_bvec_merge(struct dm_target *ti, 3298 struct bvec_merge_data *bvm, 3299 struct bio_vec *biovec, int max_size) 3300 { 3301 struct cache *cache = ti->private; 3302 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); 3303 3304 if (!q->merge_bvec_fn) 3305 return max_size; 3306 3307 bvm->bi_bdev = cache->origin_dev->bdev; 3308 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 3309 } 3310 3311 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3312 { 3313 /* 3314 * FIXME: these limits may be incompatible with the cache device 3315 */ 3316 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3317 cache->origin_sectors); 3318 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3319 } 3320 3321 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3322 { 3323 struct cache *cache = ti->private; 3324 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3325 3326 /* 3327 * If the system-determined stacked limits are compatible with the 3328 * cache's blocksize (io_opt is a factor) do not override them. 3329 */ 3330 if (io_opt_sectors < cache->sectors_per_block || 3331 do_div(io_opt_sectors, cache->sectors_per_block)) { 3332 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3333 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3334 } 3335 set_discard_limits(cache, limits); 3336 } 3337 3338 /*----------------------------------------------------------------*/ 3339 3340 static struct target_type cache_target = { 3341 .name = "cache", 3342 .version = {1, 6, 0}, 3343 .module = THIS_MODULE, 3344 .ctr = cache_ctr, 3345 .dtr = cache_dtr, 3346 .map = cache_map, 3347 .end_io = cache_end_io, 3348 .postsuspend = cache_postsuspend, 3349 .preresume = cache_preresume, 3350 .resume = cache_resume, 3351 .status = cache_status, 3352 .message = cache_message, 3353 .iterate_devices = cache_iterate_devices, 3354 .merge = cache_bvec_merge, 3355 .io_hints = cache_io_hints, 3356 }; 3357 3358 static int __init dm_cache_init(void) 3359 { 3360 int r; 3361 3362 r = dm_register_target(&cache_target); 3363 if (r) { 3364 DMERR("cache target registration failed: %d", r); 3365 return r; 3366 } 3367 3368 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3369 if (!migration_cache) { 3370 dm_unregister_target(&cache_target); 3371 return -ENOMEM; 3372 } 3373 3374 return 0; 3375 } 3376 3377 static void __exit dm_cache_exit(void) 3378 { 3379 dm_unregister_target(&cache_target); 3380 kmem_cache_destroy(migration_cache); 3381 } 3382 3383 module_init(dm_cache_init); 3384 module_exit(dm_cache_exit); 3385 3386 MODULE_DESCRIPTION(DM_NAME " cache target"); 3387 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3388 MODULE_LICENSE("GPL"); 3389
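/*
 * Illustrative userspace usage of the message interface implemented by
 * cache_message() above (the cache device name is a placeholder).  Single
 * cblocks and dash separated ranges may be mixed; invalidate_cblocks is
 * only honoured while the cache is in passthrough mode:
 *
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 7890-8899
 *   dmsetup message my-cache 0 migration_threshold 4096
 */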