/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

static size_t bitset_size_in_bytes(unsigned nr_entries)
{
	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
}

static unsigned long *alloc_bitset(unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	return vzalloc(s);
}

static void clear_bitset(void *bitset, unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	memset(bitset, 0, s);
}

static void free_bitset(unsigned long *bits)
{
	vfree(bits);
}

/*----------------------------------------------------------------*/

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function.  We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
	void *bi_private;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;
	h->bi_private = bio->bi_private;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
	bio->bi_private = h->bi_private;

	/*
	 * Must bump bi_remaining to allow bio to complete with
	 * restored bi_end_io.
	 */
	atomic_inc(&bio->bi_remaining);
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * FIXME: the cache is read/write for the time being.
 */
enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

/*
 * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
 * the one-past-the-end value.
 */
struct cblock_range {
	dm_cblock_t begin;
	dm_cblock_t end;
};

struct invalidation_request {
	struct list_head list;
	struct cblock_range *cblocks;

	atomic_t complete;
	int err;

	wait_queue_head_t result_wait;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_writethrough_bios;
	struct list_head quiesced_migrations;
	struct list_head completed_migrations;
	struct list_head need_commit_migrations;
	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_migrations;

	wait_queue_head_t quiescing_wait;
	atomic_t quiescing;
	atomic_t quiescing_ack;

	/*
	 * cache_size entries, dirty if set
	 */
	atomic_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct worker;

	struct delayed_work waker;
	unsigned long last_commit_jiffies;

	struct dm_bio_prison *prison;
	struct dm_deferred_set *all_io_ds;

	mempool_t *migration_pool;
	struct dm_cache_migration *next_migration;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool invalidate:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	struct cache_stats stats;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_deferred_entry *all_io_entry;
	struct dm_hook_info hook_info;

	/*
	 * writethrough fields.  These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offset of the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	struct dm_bio_details bio_details;
};

struct dm_cache_migration {
	struct list_head list;
	struct cache *cache;

	unsigned long start_jiffies;
	dm_oblock_t old_oblock;
	dm_oblock_t new_oblock;
	dm_cblock_t cblock;

	bool err:1;
	bool discard:1;
	bool writeback:1;
	bool demote:1;
	bool promote:1;
	bool requeue_holder:1;
	bool invalidate:1;

	struct dm_bio_prison_cell *old_ocell;
	struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations.  We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 */
struct prealloc {
	struct dm_cache_migration *mg;
	struct dm_bio_prison_cell *cell1;
	struct dm_bio_prison_cell *cell2;
};

static void wake_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
{
	/* FIXME: change to use a local slab. */
	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	dm_bio_prison_free_cell(cache->prison, cell);
}

static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
	if (!p->mg) {
		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
		if (!p->mg)
			return -ENOMEM;
	}

	if (!p->cell1) {
		p->cell1 = alloc_prison_cell(cache);
		if (!p->cell1)
			return -ENOMEM;
	}

	if (!p->cell2) {
		p->cell2 = alloc_prison_cell(cache);
		if (!p->cell2)
			return -ENOMEM;
	}

	return 0;
}

static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
{
	if (p->cell2)
		free_prison_cell(cache, p->cell2);

	if (p->cell1)
		free_prison_cell(cache, p->cell1);

	if (p->mg)
		mempool_free(p->mg, cache->migration_pool);
}

static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
{
	struct dm_cache_migration *mg = p->mg;

	BUG_ON(!mg);
	p->mg = NULL;

	return mg;
}

/*
 * You must have a cell within the prealloc struct to return.  If not this
 * function will BUG() rather than returning NULL.
 */
static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
{
	struct dm_bio_prison_cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;

	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else
		BUG();

	return r;
}

/*
 * You can't have more than two cells in a prealloc struct.  BUG() will be
 * called if you try and overfill.
 */
static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
{
	if (!p->cell2)
		p->cell2 = cell;

	else if (!p->cell1)
		p->cell1 = cell;

	else
		BUG();
}

/*----------------------------------------------------------------*/

static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block_begin = from_oblock(begin);
	key->block_end = from_oblock(end);
}

/*
 * The caller hands in a preallocated cell, and a free function for it.
 * The cell will be freed if there's an error, or if it wasn't used because
 * a cell with that key already exists.
 */
typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);

static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
			    cell_free_fn free_fn, void *free_context,
			    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;

	build_key(oblock_begin, oblock_end, &key);
	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
	if (r)
		free_fn(free_context, cell_prealloc);

	return r;
}

static int bio_detain(struct cache *cache, dm_oblock_t oblock,
		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
		      cell_free_fn free_fn, void *free_context,
		      struct dm_bio_prison_cell **cell_result)
{
	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
	return bio_detain_range(cache, oblock, end, bio,
				cell_prealloc, free_fn, free_context, cell_result);
}

static int get_cell(struct cache *cache,
		    dm_oblock_t oblock,
		    struct prealloc *structs,
		    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;
	struct dm_bio_prison_cell *cell_prealloc;

	cell_prealloc = prealloc_get_cell(structs);

	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
	if (r)
		prealloc_put_cell(structs, cell_prealloc);

	return r;
}

/*----------------------------------------------------------------*/

static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
	return test_bit(from_cblock(b), cache->dirty_bitset);
}

static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
		atomic_inc(&cache->nr_dirty);
		policy_set_dirty(cache->policy, oblock);
	}
}

static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
		policy_clear_dirty(cache->policy, oblock);
		if (atomic_dec_return(&cache->nr_dirty) == 0)
			dm_table_event(cache->ti->table);
	}
}

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct cache *cache)
{
	return cache->sectors_per_block_shift >= 0;
}

/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
__always_inline
#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
	do_div(b, n);

	return b;
}

static dm_block_t oblocks_per_dblock(struct cache *cache)
{
	dm_block_t oblocks = cache->discard_block_size;

	if (block_size_is_power_of_two(cache))
		oblocks >>= cache->sectors_per_block_shift;
	else
		oblocks = block_div(oblocks, cache->sectors_per_block);

	return oblocks;
}
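
/*
 * Worked example (illustrative numbers, not taken from the code): with
 * sectors_per_block = 512 and discard_block_size = 4096 sectors,
 * oblocks_per_dblock() returns 8, so oblock_to_dblock() maps oblock 20
 * to dblock 2, and dblock_to_oblock() maps dblock 2 back to oblock 16,
 * the first origin block covered by that discard block.
 */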

static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
	return to_dblock(block_div(from_oblock(oblock),
				   oblocks_per_dblock(cache)));
}

static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
{
	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
}

static void set_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
	atomic_inc(&cache->stats.discard_count);

	spin_lock_irqsave(&cache->lock, flags);
	set_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void clear_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	clear_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static bool writethrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITETHROUGH;
}

static bool writeback_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITEBACK;
}

static bool passthrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_PASSTHROUGH;
}

static size_t get_per_bio_data_size(struct cache *cache)
{
	return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}

static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = get_per_bio_data(bio, data_size);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->all_io_entry = NULL;

	return pb;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio->bi_bdev = cache->origin_dev->bdev;
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_iter.bi_sector;
	sector_t block = from_cblock(cblock);

	bio->bi_bdev = cache->cache_dev->bdev;
	if (!block_size_is_power_of_two(cache))
		bio->bi_iter.bi_sector =
			(block * cache->sectors_per_block) +
			sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_iter.bi_sector =
			(block << cache->sectors_per_block_shift) |
			(bi_sector & (cache->sectors_per_block - 1));
}

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	spin_lock_irqsave(&cache->lock, flags);
	if (cache->need_tick_bio &&
	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}

static int bio_triggers_commit(struct cache *cache, struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}
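
/*
 * Illustrative example of the remapping arithmetic above (values chosen
 * for the example only): with sectors_per_block = 64 (a power of two,
 * sectors_per_block_shift = 6), a bio at sector 200 maps to oblock 3 in
 * get_bio_block(), and remap_to_cache() of that bio onto cblock 5 gives
 * sector (5 << 6) | (200 & 63) = 328 on the cache device.
 */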

/*
 * You must increment the deferred set whilst the prison cell is held.  To
 * encourage this, we ask for 'cell' to be passed in.
 */
static void inc_ds(struct cache *cache, struct bio *bio,
		   struct dm_bio_prison_cell *cell)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(!cell);
	BUG_ON(pb->all_io_entry);

	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}

static void issue(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	if (!bio_triggers_commit(cache, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in do_worker().
	 */
	spin_lock_irqsave(&cache->lock, flags);
	cache->commit_requested = true;
	bio_list_add(&cache->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
{
	inc_ds(cache, bio, cell);
	issue(cache, bio);
}

static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_writethrough_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void writethrough_endio(struct bio *bio, int err)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	dm_unhook_bio(&pb->hook_info, bio);

	if (err) {
		bio_endio(bio, err);
		return;
	}

	dm_bio_restore(&pb->bio_details, bio);
	remap_to_cache(pb->cache, bio, pb->cblock);

	/*
	 * We can't issue this bio directly, since we're in interrupt
	 * context.  So it gets put on a bio list for processing by the
	 * worker thread.
	 */
	defer_writethrough_bio(pb->cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices.  In future we'd like to clone the
 * bio and send them in parallel, but for now we're doing them in
 * series as this is easier.
 */
static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
				       dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	pb->cache = cache;
	pb->cblock = cblock;
	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
	dm_bio_record(&pb->bio_details, bio);

	remap_to_origin_clear_discard(pb->cache, bio, oblock);
}

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
static void free_migration(struct dm_cache_migration *mg)
{
	mempool_free(mg, mg->cache->migration_pool);
}

static void inc_nr_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_migrations);
}

static void dec_nr_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_migrations);

	/*
	 * Wake the worker in case we're suspending the target.
	 */
	wake_up(&cache->migration_wait);
}
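
/*
 * Note on throttling (numbers below are only an example): nr_migrations
 * feeds spare_migration_bandwidth(), which compares the volume of
 * in-flight copies against cache->migration_threshold.  With 512-sector
 * cache blocks and the default threshold of 2048 sectors, the worker
 * stops scheduling new background copies once three are in flight.
 */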

static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
			 bool holder)
{
	(holder ? dm_cell_release : dm_cell_release_no_holder)
		(cache->prison, cell, &cache->deferred_bios);
	free_prison_cell(cache, cell);
}

static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
		       bool holder)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	__cell_defer(cache, cell, holder);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void cleanup_migration(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;
	free_migration(mg);
	dec_nr_migrations(cache);
}

static void migration_failure(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN_LIMIT("writeback failed; couldn't copy block");
		set_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);

	} else if (mg->demote) {
		DMWARN_LIMIT("demotion failed; couldn't copy block");
		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);

		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
		if (mg->promote)
			cell_defer(cache, mg->new_ocell, true);
	} else {
		DMWARN_LIMIT("promotion failed; couldn't copy block");
		policy_remove_mapping(cache->policy, mg->new_oblock);
		cell_defer(cache, mg->new_ocell, true);
	}

	cleanup_migration(mg);
}

static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		clear_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);
		cleanup_migration(mg);
		return;

	} else if (mg->demote) {
		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
			policy_force_mapping(cache->policy, mg->new_oblock,
					     mg->old_oblock);
			if (mg->promote)
				cell_defer(cache, mg->new_ocell, true);
			cleanup_migration(mg);
			return;
		}
	} else {
		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
			policy_remove_mapping(cache->policy, mg->new_oblock);
			cleanup_migration(mg);
			return;
		}
	}

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->need_commit_migrations);
	cache->commit_requested = true;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void migration_success_post_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN("writeback unexpectedly triggered commit");
		return;

	} else if (mg->demote) {
		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);

		if (mg->promote) {
			mg->demote = false;

			spin_lock_irqsave(&cache->lock, flags);
			list_add_tail(&mg->list, &cache->quiesced_migrations);
			spin_unlock_irqrestore(&cache->lock, flags);

		} else {
			if (mg->invalidate)
				policy_remove_mapping(cache->policy, mg->old_oblock);
			cleanup_migration(mg);
		}

	} else {
		if (mg->requeue_holder) {
			clear_dirty(cache, mg->new_oblock, mg->cblock);
			cell_defer(cache, mg->new_ocell, true);
		} else {
			/*
			 * The block was promoted via an overwrite, so it's dirty.
			 */
			set_dirty(cache, mg->new_oblock, mg->cblock);
			bio_endio(mg->new_ocell->holder, 0);
			cell_defer(cache, mg->new_ocell, false);
		}
		cleanup_migration(mg);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
	struct cache *cache = mg->cache;

	if (read_err || write_err)
		mg->err = true;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_copy(struct dm_cache_migration *mg)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;
	sector_t cblock = from_cblock(mg->cblock);

	o_region.bdev = cache->origin_dev->bdev;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = cblock * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (mg->writeback || mg->demote) {
		/* demote */
		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
	} else {
		/* promote */
		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
	}

	if (r < 0) {
		DMERR_LIMIT("issuing migration failed");
		migration_failure(mg);
	}
}

static void overwrite_endio(struct bio *bio, int err)
{
	struct dm_cache_migration *mg = bio->bi_private;
	struct cache *cache = mg->cache;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
	unsigned long flags;

	dm_unhook_bio(&pb->hook_info, bio);

	if (err)
		mg->err = true;

	mg->requeue_holder = false;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(mg->cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);

	/*
	 * No need to inc_ds() here, since the cell will be held for the
	 * duration of the io.
	 */
	generic_make_request(bio);
}

static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}

static void avoid_copy(struct dm_cache_migration *mg)
{
	atomic_inc(&mg->cache->stats.copies_avoided);
	migration_success_pre_commit(mg);
}

static void calc_discard_block_range(struct cache *cache, struct bio *bio,
				     dm_dblock_t *b, dm_dblock_t *e)
{
	sector_t sb = bio->bi_iter.bi_sector;
	sector_t se = bio_end_sector(bio);

	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));

	if (se - sb < cache->discard_block_size)
		*e = *b;
	else
		*e = to_dblock(block_div(se, cache->discard_block_size));
}

static void issue_discard(struct dm_cache_migration *mg)
{
	dm_dblock_t b, e;
	struct bio *bio = mg->new_ocell->holder;

	calc_discard_block_range(mg->cache, bio, &b, &e);
	while (b != e) {
		set_discard(mg->cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	bio_endio(bio, 0);
	cell_defer(mg->cache, mg->new_ocell, false);
	free_migration(mg);
}

static void issue_copy_or_discard(struct dm_cache_migration *mg)
{
	bool avoid;
	struct cache *cache = mg->cache;

	if (mg->discard) {
		issue_discard(mg);
		return;
	}

	if (mg->writeback || mg->demote)
		avoid = !is_dirty(cache, mg->cblock) ||
			is_discarded_oblock(cache, mg->old_oblock);
	else {
		struct bio *bio = mg->new_ocell->holder;

		avoid = is_discarded_oblock(cache, mg->new_oblock);

		if (writeback_mode(&cache->features) &&
		    !avoid && bio_writes_complete_block(cache, bio)) {
			issue_overwrite(mg, bio);
			return;
		}
	}

	avoid ? avoid_copy(mg) : issue_copy(mg);
}
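
/*
 * In other words (summary of issue_copy_or_discard() above): the data
 * copy can be skipped when a writeback/demotion targets a block that is
 * no longer dirty or whose origin region is discarded, or when a
 * promotion targets a discarded origin block; only the mapping update is
 * needed, which avoid_copy() accounts for in stats.copies_avoided.
 */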

static void complete_migration(struct dm_cache_migration *mg)
{
	if (mg->err)
		migration_failure(mg);
	else
		migration_success_pre_commit(mg);
}

static void process_migrations(struct cache *cache, struct list_head *head,
			       void (*fn)(struct dm_cache_migration *))
{
	unsigned long flags;
	struct list_head list;
	struct dm_cache_migration *mg, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(head, &list);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(mg, tmp, &list, list)
		fn(mg);
}

static void __queue_quiesced_migration(struct dm_cache_migration *mg)
{
	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
}

static void queue_quiesced_migration(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	spin_lock_irqsave(&cache->lock, flags);
	__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
{
	unsigned long flags;
	struct dm_cache_migration *mg, *tmp;

	spin_lock_irqsave(&cache->lock, flags);
	list_for_each_entry_safe(mg, tmp, work, list)
		__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void check_for_quiesced_migrations(struct cache *cache,
					  struct per_bio_data *pb)
{
	struct list_head work;

	if (!pb->all_io_entry)
		return;

	INIT_LIST_HEAD(&work);
	dm_deferred_entry_dec(pb->all_io_entry, &work);

	if (!list_empty(&work))
		queue_quiesced_migrations(cache, &work);
}

static void quiesce_migration(struct dm_cache_migration *mg)
{
	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
		queue_quiesced_migration(mg);
}

static void promote(struct cache *cache, struct prealloc *structs,
		    dm_oblock_t oblock, dm_cblock_t cblock,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->new_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

static void writeback(struct cache *cache, struct prealloc *structs,
		      dm_oblock_t oblock, dm_cblock_t cblock,
		      struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = true;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				dm_cblock_t cblock,
				struct dm_bio_prison_cell *old_ocell,
				struct dm_bio_prison_cell *new_ocell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

/*
 * Invalidate a cache entry.  No writeback occurs; any changes in the cache
 * block are thrown away.
 */
static void invalidate(struct cache *cache, struct prealloc *structs,
		       dm_oblock_t oblock, dm_cblock_t cblock,
		       struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = true;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

static void discard(struct cache *cache, struct prealloc *structs,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = true;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = false;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(bio->bi_iter.bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	/*
	 * REQ_FLUSH is not directed at any particular block so we don't
	 * need to inc_ds().  REQ_FUA's are split into a write + REQ_FLUSH
	 * by dm-core.
	 */
	issue(cache, bio);
}
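
/*
 * For reference: the target requests two flush bios per flush
 * (ti->num_flush_bios = 2 in cache_create()), so req_nr 0 is remapped to
 * the origin device and req_nr 1 to the cache device, ensuring both
 * devices see the flush.
 */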

static void process_discard_bio(struct cache *cache, struct prealloc *structs,
				struct bio *bio)
{
	int r;
	dm_dblock_t b, e;
	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;

	calc_discard_block_range(cache, bio, &b, &e);
	if (b == e) {
		bio_endio(bio, 0);
		return;
	}

	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
			     (cell_free_fn) prealloc_put_cell,
			     structs, &new_ocell);
	if (r > 0)
		return;

	discard(cache, structs, new_ocell);
}

static bool spare_migration_bandwidth(struct cache *cache)
{
	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
		cache->sectors_per_block;
	return current_volume < cache->migration_threshold;
}

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_miss : &cache->stats.write_miss);
}

static void process_bio(struct cache *cache, struct prealloc *structs,
			struct bio *bio)
{
	int r;
	bool release_cell = true;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
	struct policy_result lookup_result;
	bool passthrough = passthrough_mode(&cache->features);
	bool discarded_block, can_migrate;

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain(cache, block, bio, cell_prealloc,
		       (cell_free_fn) prealloc_put_cell,
		       structs, &new_ocell);
	if (r > 0)
		return;

	discarded_block = is_discarded_oblock(cache, block);
	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));

	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
		       bio, &lookup_result);

	if (r == -EWOULDBLOCK)
		/* migration has been denied */
		lookup_result.op = POLICY_MISS;

	switch (lookup_result.op) {
	case POLICY_HIT:
		if (passthrough) {
			inc_miss_counter(cache, bio);

			/*
			 * Passthrough always maps to the origin,
			 * invalidating any cache blocks that are written
			 * to.
			 */

			if (bio_data_dir(bio) == WRITE) {
				atomic_inc(&cache->stats.demotion);
				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
				release_cell = false;

			} else {
				/* FIXME: factor out issue_origin() */
				remap_to_origin_clear_discard(cache, bio, block);
				inc_and_issue(cache, bio, new_ocell);
			}
		} else {
			inc_hit_counter(cache, bio);

			if (bio_data_dir(bio) == WRITE &&
			    writethrough_mode(&cache->features) &&
			    !is_dirty(cache, lookup_result.cblock)) {
				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
				inc_and_issue(cache, bio, new_ocell);

			} else {
				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
				inc_and_issue(cache, bio, new_ocell);
			}
		}

		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		remap_to_origin_clear_discard(cache, bio, block);
		inc_and_issue(cache, bio, new_ocell);
		break;

	case POLICY_NEW:
		atomic_inc(&cache->stats.promotion);
		promote(cache, structs, block, lookup_result.cblock, new_ocell);
		release_cell = false;
		break;

	case POLICY_REPLACE:
		cell_prealloc = prealloc_get_cell(structs);
		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
			       (cell_free_fn) prealloc_put_cell,
			       structs, &old_ocell);
		if (r > 0) {
			/*
			 * We have to be careful to avoid lock inversion of
			 * the cells.  So we back off, and wait for the
			 * old_ocell to become free.
			 */
			policy_force_mapping(cache->policy, block,
					     lookup_result.old_oblock);
			atomic_inc(&cache->stats.cache_cell_clash);
			break;
		}
		atomic_inc(&cache->stats.demotion);
		atomic_inc(&cache->stats.promotion);

		demote_then_promote(cache, structs, lookup_result.old_oblock,
				    block, lookup_result.cblock,
				    old_ocell, new_ocell);
		release_cell = false;
		break;

	default:
		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
	}

	if (release_cell)
		cell_defer(cache, new_ocell, false);
}

static int need_commit_due_to_time(struct cache *cache)
{
	return jiffies < cache->last_commit_jiffies ||
	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
}

static int commit_if_needed(struct cache *cache)
{
	int r = 0;

	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
	    dm_cache_changed_this_transaction(cache->cmd)) {
		atomic_inc(&cache->stats.commit_count);
		cache->commit_requested = false;
		r = dm_cache_commit(cache->cmd, false);
		cache->last_commit_jiffies = jiffies;
	}

	return r;
}

static void process_deferred_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;
	struct prealloc structs;

	memset(&structs, 0, sizeof(structs));
	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while (!bio_list_empty(&bios)) {
		/*
		 * If we've got no free migration structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (prealloc_data_structs(cache, &structs)) {
			spin_lock_irqsave(&cache->lock, flags);
			bio_list_merge(&cache->deferred_bios, &bios);
			spin_unlock_irqrestore(&cache->lock, flags);
			break;
		}

		bio = bio_list_pop(&bios);

		if (bio->bi_rw & REQ_FLUSH)
			process_flush_bio(cache, bio);
		else if (bio->bi_rw & REQ_DISCARD)
			process_discard_bio(cache, &structs, bio);
		else
			process_bio(cache, &structs, bio);
	}

	prealloc_free_structs(cache, &structs);
}

static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_flush_bios);
	bio_list_init(&cache->deferred_flush_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	/*
	 * These bios have already been through inc_ds()
	 */
	while ((bio = bio_list_pop(&bios)))
		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}

static void process_deferred_writethrough_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
	bio_list_init(&cache->deferred_writethrough_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	/*
	 * These bios have already been through inc_ds()
	 */
	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void writeback_some_dirty_blocks(struct cache *cache)
{
	int r = 0;
	dm_oblock_t oblock;
	dm_cblock_t cblock;
	struct prealloc structs;
	struct dm_bio_prison_cell *old_ocell;

	memset(&structs, 0, sizeof(structs));

	while (spare_migration_bandwidth(cache)) {
		if (prealloc_data_structs(cache, &structs))
			break;

		r = policy_writeback_work(cache->policy, &oblock, &cblock);
		if (r)
			break;

		r = get_cell(cache, oblock, &structs, &old_ocell);
		if (r) {
			policy_set_dirty(cache->policy, oblock);
			break;
		}

		writeback(cache, &structs, oblock, cblock, old_ocell);
	}

	prealloc_free_structs(cache, &structs);
}

/*----------------------------------------------------------------
 * Invalidations.
 * Dropping something from the cache *without* writing back.
 *--------------------------------------------------------------*/
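
/*
 * Each request carries a half-open cblock range: for example, a
 * cblock_range of begin = 0, end = 4 asks for cblocks 0-3 to be dropped
 * from the policy and the on-disk metadata.  The worker completes the
 * request and wakes any waiter on result_wait.
 */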
1683 *--------------------------------------------------------------*/ 1684 1685 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 1686 { 1687 int r = 0; 1688 uint64_t begin = from_cblock(req->cblocks->begin); 1689 uint64_t end = from_cblock(req->cblocks->end); 1690 1691 while (begin != end) { 1692 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 1693 if (!r) { 1694 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 1695 if (r) 1696 break; 1697 1698 } else if (r == -ENODATA) { 1699 /* harmless, already unmapped */ 1700 r = 0; 1701 1702 } else { 1703 DMERR("policy_remove_cblock failed"); 1704 break; 1705 } 1706 1707 begin++; 1708 } 1709 1710 cache->commit_requested = true; 1711 1712 req->err = r; 1713 atomic_set(&req->complete, 1); 1714 1715 wake_up(&req->result_wait); 1716 } 1717 1718 static void process_invalidation_requests(struct cache *cache) 1719 { 1720 struct list_head list; 1721 struct invalidation_request *req, *tmp; 1722 1723 INIT_LIST_HEAD(&list); 1724 spin_lock(&cache->invalidation_lock); 1725 list_splice_init(&cache->invalidation_requests, &list); 1726 spin_unlock(&cache->invalidation_lock); 1727 1728 list_for_each_entry_safe (req, tmp, &list, list) 1729 process_invalidation_request(cache, req); 1730 } 1731 1732 /*---------------------------------------------------------------- 1733 * Main worker loop 1734 *--------------------------------------------------------------*/ 1735 static bool is_quiescing(struct cache *cache) 1736 { 1737 return atomic_read(&cache->quiescing); 1738 } 1739 1740 static void ack_quiescing(struct cache *cache) 1741 { 1742 if (is_quiescing(cache)) { 1743 atomic_inc(&cache->quiescing_ack); 1744 wake_up(&cache->quiescing_wait); 1745 } 1746 } 1747 1748 static void wait_for_quiescing_ack(struct cache *cache) 1749 { 1750 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 1751 } 1752 1753 static void start_quiescing(struct cache *cache) 1754 { 1755 atomic_inc(&cache->quiescing); 1756 wait_for_quiescing_ack(cache); 1757 } 1758 1759 static void stop_quiescing(struct cache *cache) 1760 { 1761 atomic_set(&cache->quiescing, 0); 1762 atomic_set(&cache->quiescing_ack, 0); 1763 } 1764 1765 static void wait_for_migrations(struct cache *cache) 1766 { 1767 wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); 1768 } 1769 1770 static void stop_worker(struct cache *cache) 1771 { 1772 cancel_delayed_work(&cache->waker); 1773 flush_workqueue(cache->wq); 1774 } 1775 1776 static void requeue_deferred_io(struct cache *cache) 1777 { 1778 struct bio *bio; 1779 struct bio_list bios; 1780 1781 bio_list_init(&bios); 1782 bio_list_merge(&bios, &cache->deferred_bios); 1783 bio_list_init(&cache->deferred_bios); 1784 1785 while ((bio = bio_list_pop(&bios))) 1786 bio_endio(bio, DM_ENDIO_REQUEUE); 1787 } 1788 1789 static int more_work(struct cache *cache) 1790 { 1791 if (is_quiescing(cache)) 1792 return !list_empty(&cache->quiesced_migrations) || 1793 !list_empty(&cache->completed_migrations) || 1794 !list_empty(&cache->need_commit_migrations); 1795 else 1796 return !bio_list_empty(&cache->deferred_bios) || 1797 !bio_list_empty(&cache->deferred_flush_bios) || 1798 !bio_list_empty(&cache->deferred_writethrough_bios) || 1799 !list_empty(&cache->quiesced_migrations) || 1800 !list_empty(&cache->completed_migrations) || 1801 !list_empty(&cache->need_commit_migrations) || 1802 cache->invalidate; 1803 } 1804 1805 static void do_worker(struct work_struct *ws) 1806 { 1807 struct cache *cache = 

	do {
		if (!is_quiescing(cache)) {
			writeback_some_dirty_blocks(cache);
			process_deferred_writethrough_bios(cache);
			process_deferred_bios(cache);
			process_invalidation_requests(cache);
		}

		process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
		process_migrations(cache, &cache->completed_migrations, complete_migration);

		if (commit_if_needed(cache)) {
			process_deferred_flush_bios(cache, false);
			process_migrations(cache, &cache->need_commit_migrations, migration_failure);

			/*
			 * FIXME: rollback metadata or just go into a
			 * failure mode and error everything
			 */
		} else {
			process_deferred_flush_bios(cache, true);
			process_migrations(cache, &cache->need_commit_migrations,
					   migration_success_post_commit);
		}

		ack_quiescing(cache);

	} while (more_work(cache));
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
	policy_tick(cache->policy);
	wake_worker(cache);
	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static int is_congested(struct dm_dev *dev, int bdi_bits)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);
	return bdi_congested(&q->backing_dev_info, bdi_bits);
}

static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct cache *cache = container_of(cb, struct cache, callbacks);

	return is_congested(cache->origin_dev, bdi_bits) ||
		is_congested(cache->cache_dev, bdi_bits);
}

/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/

/*
 * This function gets called on the error paths of the constructor, so we
 * have to cope with a partially initialised struct.
 */
static void destroy(struct cache *cache)
{
	unsigned i;

	if (cache->next_migration)
		mempool_free(cache->next_migration, cache->migration_pool);

	if (cache->migration_pool)
		mempool_destroy(cache->migration_pool);

	if (cache->all_io_ds)
		dm_deferred_set_destroy(cache->all_io_ds);

	if (cache->prison)
		dm_bio_prison_destroy(cache->prison);

	if (cache->wq)
		destroy_workqueue(cache->wq);

	if (cache->dirty_bitset)
		free_bitset(cache->dirty_bitset);

	if (cache->discard_bitset)
		free_bitset(cache->discard_bitset);

	if (cache->copier)
		dm_kcopyd_client_destroy(cache->copier);

	if (cache->cmd)
		dm_cache_metadata_close(cache->cmd);

	if (cache->metadata_dev)
		dm_put_device(cache->ti, cache->metadata_dev);

	if (cache->origin_dev)
		dm_put_device(cache->ti, cache->origin_dev);

	if (cache->cache_dev)
		dm_put_device(cache->ti, cache->cache_dev);

	if (cache->policy)
		dm_cache_policy_destroy(cache->policy);

	for (i = 0; i < cache->nr_ctr_args ; i++)
		kfree(cache->ctr_args[i]);
	kfree(cache->ctr_args);

	kfree(cache);
}

static void cache_dtr(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	destroy(cache);
}

static sector_t get_dev_size(struct dm_dev *dev)
{
	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
}

/*----------------------------------------------------------------*/

/*
 * Construct a cache device mapping.
 *
 * cache <metadata dev> <cache dev> <origin dev> <block size>
 *       <#feature args> [<feature arg>]*
 *       <policy> <#policy args> [<policy arg>]*
 *
 * metadata dev    : fast device holding the persistent metadata
 * cache dev       : fast device holding cached data blocks
 * origin dev      : slow device holding original data blocks
 * block size      : cache unit size in sectors
 *
 * #feature args   : number of feature arguments passed
 * feature args    : writethrough.  (The default is writeback.)
 *
 * policy          : the replacement policy to use
 * #policy args    : an even number of policy arguments corresponding
 *                   to key/value pairs passed to the policy
 * policy args     : key/value pairs passed to the policy
 *                   E.g. 'sequential_threshold 1024'
 *                   See cache-policies.txt for details.
 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *                   content from being different from origin block content.
 *                   Without this argument, the default behaviour is to write
 *                   back cache block contents later for performance reasons,
 *                   so they may differ from the corresponding origin blocks.
 */
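
/*
 * Example target line (illustrative only; the device names and the
 * 'default' policy name are assumptions, not taken from this file):
 *
 *   cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow 512 \
 *         1 writethrough default 0
 *
 * i.e. 512-sector (256KB) cache blocks, one feature argument
 * (writethrough) and a policy given no key/value arguments.
 */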
struct cache_args {
	struct dm_target *ti;

	struct dm_dev *metadata_dev;

	struct dm_dev *cache_dev;
	sector_t cache_sectors;

	struct dm_dev *origin_dev;
	sector_t origin_sectors;

	uint32_t block_size;

	const char *policy_name;
	int policy_argc;
	const char **policy_argv;

	struct cache_features features;
};

static void destroy_cache_args(struct cache_args *ca)
{
	if (ca->metadata_dev)
		dm_put_device(ca->ti, ca->metadata_dev);

	if (ca->cache_dev)
		dm_put_device(ca->ti, ca->cache_dev);

	if (ca->origin_dev)
		dm_put_device(ca->ti, ca->origin_dev);

	kfree(ca);
}

static bool at_least_one_arg(struct dm_arg_set *as, char **error)
{
	if (!as->argc) {
		*error = "Insufficient args";
		return false;
	}

	return true;
}

static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
			      char **error)
{
	int r;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->metadata_dev);
	if (r) {
		*error = "Error opening metadata device";
		return r;
	}

	metadata_dev_size = get_dev_size(ca->metadata_dev);
	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS);

	return 0;
}

static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
			   char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->cache_dev);
	if (r) {
		*error = "Error opening cache device";
		return r;
	}
	ca->cache_sectors = get_dev_size(ca->cache_dev);

	return 0;
}

static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->origin_dev);
	if (r) {
		*error = "Error opening origin device";
		return r;
	}

	ca->origin_sectors = get_dev_size(ca->origin_dev);
	if (ca->ti->len > ca->origin_sectors) {
		*error = "Device size larger than cached device";
		return -EINVAL;
	}

	return 0;
}

static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	unsigned long block_size;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		*error = "Invalid data block size";
		return -EINVAL;
	}

	if (block_size > ca->cache_sectors) {
		*error = "Data block size is larger than the cache device";
		return -EINVAL;
	}

	ca->block_size = block_size;

	return 0;
}

static void init_features(struct cache_features *cf)
{
	cf->mode = CM_WRITE;
	cf->io_mode = CM_IO_WRITEBACK;
CM_IO_WRITEBACK; 2110 } 2111 2112 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2113 char **error) 2114 { 2115 static struct dm_arg _args[] = { 2116 {0, 1, "Invalid number of cache feature arguments"}, 2117 }; 2118 2119 int r; 2120 unsigned argc; 2121 const char *arg; 2122 struct cache_features *cf = &ca->features; 2123 2124 init_features(cf); 2125 2126 r = dm_read_arg_group(_args, as, &argc, error); 2127 if (r) 2128 return -EINVAL; 2129 2130 while (argc--) { 2131 arg = dm_shift_arg(as); 2132 2133 if (!strcasecmp(arg, "writeback")) 2134 cf->io_mode = CM_IO_WRITEBACK; 2135 2136 else if (!strcasecmp(arg, "writethrough")) 2137 cf->io_mode = CM_IO_WRITETHROUGH; 2138 2139 else if (!strcasecmp(arg, "passthrough")) 2140 cf->io_mode = CM_IO_PASSTHROUGH; 2141 2142 else { 2143 *error = "Unrecognised cache feature requested"; 2144 return -EINVAL; 2145 } 2146 } 2147 2148 return 0; 2149 } 2150 2151 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2152 char **error) 2153 { 2154 static struct dm_arg _args[] = { 2155 {0, 1024, "Invalid number of policy arguments"}, 2156 }; 2157 2158 int r; 2159 2160 if (!at_least_one_arg(as, error)) 2161 return -EINVAL; 2162 2163 ca->policy_name = dm_shift_arg(as); 2164 2165 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2166 if (r) 2167 return -EINVAL; 2168 2169 ca->policy_argv = (const char **)as->argv; 2170 dm_consume_args(as, ca->policy_argc); 2171 2172 return 0; 2173 } 2174 2175 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2176 char **error) 2177 { 2178 int r; 2179 struct dm_arg_set as; 2180 2181 as.argc = argc; 2182 as.argv = argv; 2183 2184 r = parse_metadata_dev(ca, &as, error); 2185 if (r) 2186 return r; 2187 2188 r = parse_cache_dev(ca, &as, error); 2189 if (r) 2190 return r; 2191 2192 r = parse_origin_dev(ca, &as, error); 2193 if (r) 2194 return r; 2195 2196 r = parse_block_size(ca, &as, error); 2197 if (r) 2198 return r; 2199 2200 r = parse_features(ca, &as, error); 2201 if (r) 2202 return r; 2203 2204 r = parse_policy(ca, &as, error); 2205 if (r) 2206 return r; 2207 2208 return 0; 2209 } 2210 2211 /*----------------------------------------------------------------*/ 2212 2213 static struct kmem_cache *migration_cache; 2214 2215 #define NOT_CORE_OPTION 1 2216 2217 static int process_config_option(struct cache *cache, const char *key, const char *value) 2218 { 2219 unsigned long tmp; 2220 2221 if (!strcasecmp(key, "migration_threshold")) { 2222 if (kstrtoul(value, 10, &tmp)) 2223 return -EINVAL; 2224 2225 cache->migration_threshold = tmp; 2226 return 0; 2227 } 2228 2229 return NOT_CORE_OPTION; 2230 } 2231 2232 static int set_config_value(struct cache *cache, const char *key, const char *value) 2233 { 2234 int r = process_config_option(cache, key, value); 2235 2236 if (r == NOT_CORE_OPTION) 2237 r = policy_set_config_value(cache->policy, key, value); 2238 2239 if (r) 2240 DMWARN("bad config value for %s: %s", key, value); 2241 2242 return r; 2243 } 2244 2245 static int set_config_values(struct cache *cache, int argc, const char **argv) 2246 { 2247 int r = 0; 2248 2249 if (argc & 1) { 2250 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2251 return -EINVAL; 2252 } 2253 2254 while (argc) { 2255 r = set_config_value(cache, argv[0], argv[1]); 2256 if (r) 2257 break; 2258 2259 argc -= 2; 2260 argv += 2; 2261 } 2262 2263 return r; 2264 } 2265 2266 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2267 char **error) 
2268 { 2269 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2270 cache->cache_size, 2271 cache->origin_sectors, 2272 cache->sectors_per_block); 2273 if (IS_ERR(p)) { 2274 *error = "Error creating cache's policy"; 2275 return PTR_ERR(p); 2276 } 2277 cache->policy = p; 2278 2279 return 0; 2280 } 2281 2282 /* 2283 * We want the discard block size to be at least the size of the cache 2284 * block size and have no more than 2^14 discard blocks across the origin. 2285 */ 2286 #define MAX_DISCARD_BLOCKS (1 << 14) 2287 2288 static bool too_many_discard_blocks(sector_t discard_block_size, 2289 sector_t origin_size) 2290 { 2291 (void) sector_div(origin_size, discard_block_size); 2292 2293 return origin_size > MAX_DISCARD_BLOCKS; 2294 } 2295 2296 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2297 sector_t origin_size) 2298 { 2299 sector_t discard_block_size = cache_block_size; 2300 2301 if (origin_size) 2302 while (too_many_discard_blocks(discard_block_size, origin_size)) 2303 discard_block_size *= 2; 2304 2305 return discard_block_size; 2306 } 2307 2308 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2309 { 2310 dm_block_t nr_blocks = from_cblock(size); 2311 2312 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2313 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2314 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2315 "Please consider increasing the cache block size to reduce the overall cache block count.", 2316 (unsigned long long) nr_blocks); 2317 2318 cache->cache_size = size; 2319 } 2320 2321 #define DEFAULT_MIGRATION_THRESHOLD 2048 2322 2323 static int cache_create(struct cache_args *ca, struct cache **result) 2324 { 2325 int r = 0; 2326 char **error = &ca->ti->error; 2327 struct cache *cache; 2328 struct dm_target *ti = ca->ti; 2329 dm_block_t origin_blocks; 2330 struct dm_cache_metadata *cmd; 2331 bool may_format = ca->features.mode == CM_WRITE; 2332 2333 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2334 if (!cache) 2335 return -ENOMEM; 2336 2337 cache->ti = ca->ti; 2338 ti->private = cache; 2339 ti->num_flush_bios = 2; 2340 ti->flush_supported = true; 2341 2342 ti->num_discard_bios = 1; 2343 ti->discards_supported = true; 2344 ti->discard_zeroes_data_unsupported = true; 2345 ti->split_discard_bios = false; 2346 2347 cache->features = ca->features; 2348 ti->per_bio_data_size = get_per_bio_data_size(cache); 2349 2350 cache->callbacks.congested_fn = cache_is_congested; 2351 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2352 2353 cache->metadata_dev = ca->metadata_dev; 2354 cache->origin_dev = ca->origin_dev; 2355 cache->cache_dev = ca->cache_dev; 2356 2357 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2358 2359 /* FIXME: factor out this whole section */ 2360 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2361 origin_blocks = block_div(origin_blocks, ca->block_size); 2362 cache->origin_blocks = to_oblock(origin_blocks); 2363 2364 cache->sectors_per_block = ca->block_size; 2365 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2366 r = -EINVAL; 2367 goto bad; 2368 } 2369 2370 if (ca->block_size & (ca->block_size - 1)) { 2371 dm_block_t cache_size = ca->cache_sectors; 2372 2373 cache->sectors_per_block_shift = -1; 2374 cache_size = block_div(cache_size, ca->block_size); 2375 set_cache_size(cache, to_cblock(cache_size)); 2376 } else { 2377 cache->sectors_per_block_shift = 
__ffs(ca->block_size); 2378 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2379 } 2380 2381 r = create_cache_policy(cache, ca, error); 2382 if (r) 2383 goto bad; 2384 2385 cache->policy_nr_args = ca->policy_argc; 2386 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2387 2388 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2389 if (r) { 2390 *error = "Error setting cache policy's config values"; 2391 goto bad; 2392 } 2393 2394 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2395 ca->block_size, may_format, 2396 dm_cache_policy_get_hint_size(cache->policy)); 2397 if (IS_ERR(cmd)) { 2398 *error = "Error creating metadata object"; 2399 r = PTR_ERR(cmd); 2400 goto bad; 2401 } 2402 cache->cmd = cmd; 2403 2404 if (passthrough_mode(&cache->features)) { 2405 bool all_clean; 2406 2407 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2408 if (r) { 2409 *error = "dm_cache_metadata_all_clean() failed"; 2410 goto bad; 2411 } 2412 2413 if (!all_clean) { 2414 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2415 r = -EINVAL; 2416 goto bad; 2417 } 2418 } 2419 2420 spin_lock_init(&cache->lock); 2421 bio_list_init(&cache->deferred_bios); 2422 bio_list_init(&cache->deferred_flush_bios); 2423 bio_list_init(&cache->deferred_writethrough_bios); 2424 INIT_LIST_HEAD(&cache->quiesced_migrations); 2425 INIT_LIST_HEAD(&cache->completed_migrations); 2426 INIT_LIST_HEAD(&cache->need_commit_migrations); 2427 atomic_set(&cache->nr_migrations, 0); 2428 init_waitqueue_head(&cache->migration_wait); 2429 2430 init_waitqueue_head(&cache->quiescing_wait); 2431 atomic_set(&cache->quiescing, 0); 2432 atomic_set(&cache->quiescing_ack, 0); 2433 2434 r = -ENOMEM; 2435 atomic_set(&cache->nr_dirty, 0); 2436 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2437 if (!cache->dirty_bitset) { 2438 *error = "could not allocate dirty bitset"; 2439 goto bad; 2440 } 2441 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2442 2443 cache->discard_block_size = 2444 calculate_discard_block_size(cache->sectors_per_block, 2445 cache->origin_sectors); 2446 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2447 cache->discard_block_size)); 2448 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2449 if (!cache->discard_bitset) { 2450 *error = "could not allocate discard bitset"; 2451 goto bad; 2452 } 2453 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2454 2455 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2456 if (IS_ERR(cache->copier)) { 2457 *error = "could not create kcopyd client"; 2458 r = PTR_ERR(cache->copier); 2459 goto bad; 2460 } 2461 2462 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2463 if (!cache->wq) { 2464 *error = "could not create workqueue for metadata object"; 2465 goto bad; 2466 } 2467 INIT_WORK(&cache->worker, do_worker); 2468 INIT_DELAYED_WORK(&cache->waker, do_waker); 2469 cache->last_commit_jiffies = jiffies; 2470 2471 cache->prison = dm_bio_prison_create(); 2472 if (!cache->prison) { 2473 *error = "could not create bio prison"; 2474 goto bad; 2475 } 2476 2477 cache->all_io_ds = dm_deferred_set_create(); 2478 if (!cache->all_io_ds) { 2479 *error = "could not create all_io deferred set"; 2480 goto bad; 2481 } 2482 2483 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2484 migration_cache); 2485 if (!cache->migration_pool) { 2486 *error = "Error 
creating cache's migration mempool"; 2487 goto bad; 2488 } 2489 2490 cache->next_migration = NULL; 2491 2492 cache->need_tick_bio = true; 2493 cache->sized = false; 2494 cache->invalidate = false; 2495 cache->commit_requested = false; 2496 cache->loaded_mappings = false; 2497 cache->loaded_discards = false; 2498 2499 load_stats(cache); 2500 2501 atomic_set(&cache->stats.demotion, 0); 2502 atomic_set(&cache->stats.promotion, 0); 2503 atomic_set(&cache->stats.copies_avoided, 0); 2504 atomic_set(&cache->stats.cache_cell_clash, 0); 2505 atomic_set(&cache->stats.commit_count, 0); 2506 atomic_set(&cache->stats.discard_count, 0); 2507 2508 spin_lock_init(&cache->invalidation_lock); 2509 INIT_LIST_HEAD(&cache->invalidation_requests); 2510 2511 *result = cache; 2512 return 0; 2513 2514 bad: 2515 destroy(cache); 2516 return r; 2517 } 2518 2519 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2520 { 2521 unsigned i; 2522 const char **copy; 2523 2524 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2525 if (!copy) 2526 return -ENOMEM; 2527 for (i = 0; i < argc; i++) { 2528 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2529 if (!copy[i]) { 2530 while (i--) 2531 kfree(copy[i]); 2532 kfree(copy); 2533 return -ENOMEM; 2534 } 2535 } 2536 2537 cache->nr_ctr_args = argc; 2538 cache->ctr_args = copy; 2539 2540 return 0; 2541 } 2542 2543 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2544 { 2545 int r = -EINVAL; 2546 struct cache_args *ca; 2547 struct cache *cache = NULL; 2548 2549 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2550 if (!ca) { 2551 ti->error = "Error allocating memory for cache"; 2552 return -ENOMEM; 2553 } 2554 ca->ti = ti; 2555 2556 r = parse_cache_args(ca, argc, argv, &ti->error); 2557 if (r) 2558 goto out; 2559 2560 r = cache_create(ca, &cache); 2561 if (r) 2562 goto out; 2563 2564 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2565 if (r) { 2566 destroy(cache); 2567 goto out; 2568 } 2569 2570 ti->private = cache; 2571 2572 out: 2573 destroy_cache_args(ca); 2574 return r; 2575 } 2576 2577 static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell) 2578 { 2579 int r; 2580 dm_oblock_t block = get_bio_block(cache, bio); 2581 size_t pb_data_size = get_per_bio_data_size(cache); 2582 bool can_migrate = false; 2583 bool discarded_block; 2584 struct policy_result lookup_result; 2585 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 2586 2587 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 2588 /* 2589 * This can only occur if the io goes to a partial block at 2590 * the end of the origin device. We don't cache these. 2591 * Just remap to the origin and carry on. 2592 */ 2593 remap_to_origin(cache, bio); 2594 return DM_MAPIO_REMAPPED; 2595 } 2596 2597 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { 2598 defer_bio(cache, bio); 2599 return DM_MAPIO_SUBMITTED; 2600 } 2601 2602 /* 2603 * Check to see if that block is currently migrating. 
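 *
 * The prison cell acts as a per-block lock: if bio_detain() reports
 * that the block is already held (for example by an in-flight
 * migration), the bio cannot be remapped now, so it is either parked
 * in the existing cell or deferred to the worker, and we return
 * DM_MAPIO_SUBMITTED.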
2604 */ 2605 *cell = alloc_prison_cell(cache); 2606 if (!*cell) { 2607 defer_bio(cache, bio); 2608 return DM_MAPIO_SUBMITTED; 2609 } 2610 2611 r = bio_detain(cache, block, bio, *cell, 2612 (cell_free_fn) free_prison_cell, 2613 cache, cell); 2614 if (r) { 2615 if (r < 0) 2616 defer_bio(cache, bio); 2617 2618 return DM_MAPIO_SUBMITTED; 2619 } 2620 2621 discarded_block = is_discarded_oblock(cache, block); 2622 2623 r = policy_map(cache->policy, block, false, can_migrate, discarded_block, 2624 bio, &lookup_result); 2625 if (r == -EWOULDBLOCK) { 2626 cell_defer(cache, *cell, true); 2627 return DM_MAPIO_SUBMITTED; 2628 2629 } else if (r) { 2630 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); 2631 cell_defer(cache, *cell, false); 2632 bio_io_error(bio); 2633 return DM_MAPIO_SUBMITTED; 2634 } 2635 2636 r = DM_MAPIO_REMAPPED; 2637 switch (lookup_result.op) { 2638 case POLICY_HIT: 2639 if (passthrough_mode(&cache->features)) { 2640 if (bio_data_dir(bio) == WRITE) { 2641 /* 2642 * We need to invalidate this block, so 2643 * defer for the worker thread. 2644 */ 2645 cell_defer(cache, *cell, true); 2646 r = DM_MAPIO_SUBMITTED; 2647 2648 } else { 2649 inc_miss_counter(cache, bio); 2650 remap_to_origin_clear_discard(cache, bio, block); 2651 } 2652 2653 } else { 2654 inc_hit_counter(cache, bio); 2655 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 2656 !is_dirty(cache, lookup_result.cblock)) 2657 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2658 else 2659 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 2660 } 2661 break; 2662 2663 case POLICY_MISS: 2664 inc_miss_counter(cache, bio); 2665 if (pb->req_nr != 0) { 2666 /* 2667 * This is a duplicate writethrough io that is no 2668 * longer needed because the block has been demoted. 
2669 */ 2670 bio_endio(bio, 0); 2671 cell_defer(cache, *cell, false); 2672 r = DM_MAPIO_SUBMITTED; 2673 2674 } else 2675 remap_to_origin_clear_discard(cache, bio, block); 2676 2677 break; 2678 2679 default: 2680 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, 2681 (unsigned) lookup_result.op); 2682 cell_defer(cache, *cell, false); 2683 bio_io_error(bio); 2684 r = DM_MAPIO_SUBMITTED; 2685 } 2686 2687 return r; 2688 } 2689 2690 static int cache_map(struct dm_target *ti, struct bio *bio) 2691 { 2692 int r; 2693 struct dm_bio_prison_cell *cell = NULL; 2694 struct cache *cache = ti->private; 2695 2696 r = __cache_map(cache, bio, &cell); 2697 if (r == DM_MAPIO_REMAPPED && cell) { 2698 inc_ds(cache, bio, cell); 2699 cell_defer(cache, cell, false); 2700 } 2701 2702 return r; 2703 } 2704 2705 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 2706 { 2707 struct cache *cache = ti->private; 2708 unsigned long flags; 2709 size_t pb_data_size = get_per_bio_data_size(cache); 2710 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 2711 2712 if (pb->tick) { 2713 policy_tick(cache->policy); 2714 2715 spin_lock_irqsave(&cache->lock, flags); 2716 cache->need_tick_bio = true; 2717 spin_unlock_irqrestore(&cache->lock, flags); 2718 } 2719 2720 check_for_quiesced_migrations(cache, pb); 2721 2722 return 0; 2723 } 2724 2725 static int write_dirty_bitset(struct cache *cache) 2726 { 2727 unsigned i, r; 2728 2729 for (i = 0; i < from_cblock(cache->cache_size); i++) { 2730 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 2731 is_dirty(cache, to_cblock(i))); 2732 if (r) 2733 return r; 2734 } 2735 2736 return 0; 2737 } 2738 2739 static int write_discard_bitset(struct cache *cache) 2740 { 2741 unsigned i, r; 2742 2743 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2744 cache->discard_nr_blocks); 2745 if (r) { 2746 DMERR("could not resize on-disk discard bitset"); 2747 return r; 2748 } 2749 2750 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2751 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2752 is_discarded(cache, to_dblock(i))); 2753 if (r) 2754 return r; 2755 } 2756 2757 return 0; 2758 } 2759 2760 /* 2761 * returns true on success 2762 */ 2763 static bool sync_metadata(struct cache *cache) 2764 { 2765 int r1, r2, r3, r4; 2766 2767 r1 = write_dirty_bitset(cache); 2768 if (r1) 2769 DMERR("could not write dirty bitset"); 2770 2771 r2 = write_discard_bitset(cache); 2772 if (r2) 2773 DMERR("could not write discard bitset"); 2774 2775 save_stats(cache); 2776 2777 r3 = dm_cache_write_hints(cache->cmd, cache->policy); 2778 if (r3) 2779 DMERR("could not write hints"); 2780 2781 /* 2782 * If writing the above metadata failed, we still commit, but don't 2783 * set the clean shutdown flag. This will effectively force every 2784 * dirty bit to be set on reload. 2785 */ 2786 r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3); 2787 if (r4) 2788 DMERR("could not write cache metadata. 
Data loss may occur."); 2789 2790 return !r1 && !r2 && !r3 && !r4; 2791 } 2792 2793 static void cache_postsuspend(struct dm_target *ti) 2794 { 2795 struct cache *cache = ti->private; 2796 2797 start_quiescing(cache); 2798 wait_for_migrations(cache); 2799 stop_worker(cache); 2800 requeue_deferred_io(cache); 2801 stop_quiescing(cache); 2802 2803 (void) sync_metadata(cache); 2804 } 2805 2806 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2807 bool dirty, uint32_t hint, bool hint_valid) 2808 { 2809 int r; 2810 struct cache *cache = context; 2811 2812 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 2813 if (r) 2814 return r; 2815 2816 if (dirty) 2817 set_dirty(cache, oblock, cblock); 2818 else 2819 clear_dirty(cache, oblock, cblock); 2820 2821 return 0; 2822 } 2823 2824 /* 2825 * The discard block size in the on disk metadata is not 2826 * neccessarily the same as we're currently using. So we have to 2827 * be careful to only set the discarded attribute if we know it 2828 * covers a complete block of the new size. 2829 */ 2830 struct discard_load_info { 2831 struct cache *cache; 2832 2833 /* 2834 * These blocks are sized using the on disk dblock size, rather 2835 * than the current one. 2836 */ 2837 dm_block_t block_size; 2838 dm_block_t discard_begin, discard_end; 2839 }; 2840 2841 static void discard_load_info_init(struct cache *cache, 2842 struct discard_load_info *li) 2843 { 2844 li->cache = cache; 2845 li->discard_begin = li->discard_end = 0; 2846 } 2847 2848 static void set_discard_range(struct discard_load_info *li) 2849 { 2850 sector_t b, e; 2851 2852 if (li->discard_begin == li->discard_end) 2853 return; 2854 2855 /* 2856 * Convert to sectors. 2857 */ 2858 b = li->discard_begin * li->block_size; 2859 e = li->discard_end * li->block_size; 2860 2861 /* 2862 * Then convert back to the current dblock size. 2863 */ 2864 b = dm_sector_div_up(b, li->cache->discard_block_size); 2865 sector_div(e, li->cache->discard_block_size); 2866 2867 /* 2868 * The origin may have shrunk, so we need to check we're still in 2869 * bounds. 2870 */ 2871 if (e > from_dblock(li->cache->discard_nr_blocks)) 2872 e = from_dblock(li->cache->discard_nr_blocks); 2873 2874 for (; b < e; b++) 2875 set_discard(li->cache, to_dblock(b)); 2876 } 2877 2878 static int load_discard(void *context, sector_t discard_block_size, 2879 dm_dblock_t dblock, bool discard) 2880 { 2881 struct discard_load_info *li = context; 2882 2883 li->block_size = discard_block_size; 2884 2885 if (discard) { 2886 if (from_dblock(dblock) == li->discard_end) 2887 /* 2888 * We're already in a discard range, just extend it. 2889 */ 2890 li->discard_end = li->discard_end + 1ULL; 2891 2892 else { 2893 /* 2894 * Emit the old range and start a new one. 2895 */ 2896 set_discard_range(li); 2897 li->discard_begin = from_dblock(dblock); 2898 li->discard_end = li->discard_begin + 1ULL; 2899 } 2900 } else { 2901 set_discard_range(li); 2902 li->discard_begin = li->discard_end = 0; 2903 } 2904 2905 return 0; 2906 } 2907 2908 static dm_cblock_t get_cache_dev_size(struct cache *cache) 2909 { 2910 sector_t size = get_dev_size(cache->cache_dev); 2911 (void) sector_div(size, cache->sectors_per_block); 2912 return to_cblock(size); 2913 } 2914 2915 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 2916 { 2917 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 2918 return true; 2919 2920 /* 2921 * We can't drop a dirty block when shrinking the cache. 
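 *
 * For example (hypothetical numbers): shrinking from 100 cache blocks
 * to 80 is only allowed if every cblock in the tail being dropped is
 * clean; a single dirty block there holds data that exists nowhere
 * else, so the resize is refused.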
2922 */ 2923 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 2924 new_size = to_cblock(from_cblock(new_size) + 1); 2925 if (is_dirty(cache, new_size)) { 2926 DMERR("unable to shrink cache; cache block %llu is dirty", 2927 (unsigned long long) from_cblock(new_size)); 2928 return false; 2929 } 2930 } 2931 2932 return true; 2933 } 2934 2935 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 2936 { 2937 int r; 2938 2939 r = dm_cache_resize(cache->cmd, new_size); 2940 if (r) { 2941 DMERR("could not resize cache metadata"); 2942 return r; 2943 } 2944 2945 set_cache_size(cache, new_size); 2946 2947 return 0; 2948 } 2949 2950 static int cache_preresume(struct dm_target *ti) 2951 { 2952 int r = 0; 2953 struct cache *cache = ti->private; 2954 dm_cblock_t csize = get_cache_dev_size(cache); 2955 2956 /* 2957 * Check to see if the cache has resized. 2958 */ 2959 if (!cache->sized) { 2960 r = resize_cache_dev(cache, csize); 2961 if (r) 2962 return r; 2963 2964 cache->sized = true; 2965 2966 } else if (csize != cache->cache_size) { 2967 if (!can_resize(cache, csize)) 2968 return -EINVAL; 2969 2970 r = resize_cache_dev(cache, csize); 2971 if (r) 2972 return r; 2973 } 2974 2975 if (!cache->loaded_mappings) { 2976 r = dm_cache_load_mappings(cache->cmd, cache->policy, 2977 load_mapping, cache); 2978 if (r) { 2979 DMERR("could not load cache mappings"); 2980 return r; 2981 } 2982 2983 cache->loaded_mappings = true; 2984 } 2985 2986 if (!cache->loaded_discards) { 2987 struct discard_load_info li; 2988 2989 /* 2990 * The discard bitset could have been resized, or the 2991 * discard block size changed. To be safe we start by 2992 * setting every dblock to not discarded. 2993 */ 2994 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2995 2996 discard_load_info_init(cache, &li); 2997 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 2998 if (r) { 2999 DMERR("could not load origin discards"); 3000 return r; 3001 } 3002 set_discard_range(&li); 3003 3004 cache->loaded_discards = true; 3005 } 3006 3007 return r; 3008 } 3009 3010 static void cache_resume(struct dm_target *ti) 3011 { 3012 struct cache *cache = ti->private; 3013 3014 cache->need_tick_bio = true; 3015 do_waker(&cache->waker.work); 3016 } 3017 3018 /* 3019 * Status format: 3020 * 3021 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3022 * <cache block size> <#used cache blocks>/<#total cache blocks> 3023 * <#read hits> <#read misses> <#write hits> <#write misses> 3024 * <#demotions> <#promotions> <#dirty> 3025 * <#features> <features>* 3026 * <#core args> <core args> 3027 * <policy name> <#policy args> <policy args>* 3028 */ 3029 static void cache_status(struct dm_target *ti, status_type_t type, 3030 unsigned status_flags, char *result, unsigned maxlen) 3031 { 3032 int r = 0; 3033 unsigned i; 3034 ssize_t sz = 0; 3035 dm_block_t nr_free_blocks_metadata = 0; 3036 dm_block_t nr_blocks_metadata = 0; 3037 char buf[BDEVNAME_SIZE]; 3038 struct cache *cache = ti->private; 3039 dm_cblock_t residency; 3040 3041 switch (type) { 3042 case STATUSTYPE_INFO: 3043 /* Commit to ensure statistics aren't out-of-date */ 3044 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) { 3045 r = dm_cache_commit(cache->cmd, false); 3046 if (r) 3047 DMERR("could not commit metadata for accurate status"); 3048 } 3049 3050 r = dm_cache_get_free_metadata_block_count(cache->cmd, 3051 &nr_free_blocks_metadata); 3052 if (r) { 3053 DMERR("could not get metadata free block count"); 
3054 goto err; 3055 } 3056 3057 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3058 if (r) { 3059 DMERR("could not get metadata device size"); 3060 goto err; 3061 } 3062 3063 residency = policy_residency(cache->policy); 3064 3065 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3066 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3067 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3068 (unsigned long long)nr_blocks_metadata, 3069 cache->sectors_per_block, 3070 (unsigned long long) from_cblock(residency), 3071 (unsigned long long) from_cblock(cache->cache_size), 3072 (unsigned) atomic_read(&cache->stats.read_hit), 3073 (unsigned) atomic_read(&cache->stats.read_miss), 3074 (unsigned) atomic_read(&cache->stats.write_hit), 3075 (unsigned) atomic_read(&cache->stats.write_miss), 3076 (unsigned) atomic_read(&cache->stats.demotion), 3077 (unsigned) atomic_read(&cache->stats.promotion), 3078 (unsigned long) atomic_read(&cache->nr_dirty)); 3079 3080 if (writethrough_mode(&cache->features)) 3081 DMEMIT("1 writethrough "); 3082 3083 else if (passthrough_mode(&cache->features)) 3084 DMEMIT("1 passthrough "); 3085 3086 else if (writeback_mode(&cache->features)) 3087 DMEMIT("1 writeback "); 3088 3089 else { 3090 DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode); 3091 goto err; 3092 } 3093 3094 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3095 3096 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3097 if (sz < maxlen) { 3098 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); 3099 if (r) 3100 DMERR("policy_emit_config_values returned %d", r); 3101 } 3102 3103 break; 3104 3105 case STATUSTYPE_TABLE: 3106 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3107 DMEMIT("%s ", buf); 3108 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3109 DMEMIT("%s ", buf); 3110 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3111 DMEMIT("%s", buf); 3112 3113 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3114 DMEMIT(" %s", cache->ctr_args[i]); 3115 if (cache->nr_ctr_args) 3116 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3117 } 3118 3119 return; 3120 3121 err: 3122 DMEMIT("Error"); 3123 } 3124 3125 /* 3126 * A cache block range can take two forms: 3127 * 3128 * i) A single cblock, eg. '3456' 3129 * ii) A begin and end cblock with dots between, eg. 123-234 3130 */ 3131 static int parse_cblock_range(struct cache *cache, const char *str, 3132 struct cblock_range *result) 3133 { 3134 char dummy; 3135 uint64_t b, e; 3136 int r; 3137 3138 /* 3139 * Try and parse form (ii) first. 3140 */ 3141 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3142 if (r < 0) 3143 return r; 3144 3145 if (r == 2) { 3146 result->begin = to_cblock(b); 3147 result->end = to_cblock(e); 3148 return 0; 3149 } 3150 3151 /* 3152 * That didn't work, try form (i). 
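 *
 * A single cblock such as '3456' becomes the half-open range
 * [3456, 3457): as elsewhere in struct cblock_range, 'end' is one
 * past the last cblock in the range.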
3153 */ 3154 r = sscanf(str, "%llu%c", &b, &dummy); 3155 if (r < 0) 3156 return r; 3157 3158 if (r == 1) { 3159 result->begin = to_cblock(b); 3160 result->end = to_cblock(from_cblock(result->begin) + 1u); 3161 return 0; 3162 } 3163 3164 DMERR("invalid cblock range '%s'", str); 3165 return -EINVAL; 3166 } 3167 3168 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3169 { 3170 uint64_t b = from_cblock(range->begin); 3171 uint64_t e = from_cblock(range->end); 3172 uint64_t n = from_cblock(cache->cache_size); 3173 3174 if (b >= n) { 3175 DMERR("begin cblock out of range: %llu >= %llu", b, n); 3176 return -EINVAL; 3177 } 3178 3179 if (e > n) { 3180 DMERR("end cblock out of range: %llu > %llu", e, n); 3181 return -EINVAL; 3182 } 3183 3184 if (b >= e) { 3185 DMERR("invalid cblock range: %llu >= %llu", b, e); 3186 return -EINVAL; 3187 } 3188 3189 return 0; 3190 } 3191 3192 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3193 { 3194 struct invalidation_request req; 3195 3196 INIT_LIST_HEAD(&req.list); 3197 req.cblocks = range; 3198 atomic_set(&req.complete, 0); 3199 req.err = 0; 3200 init_waitqueue_head(&req.result_wait); 3201 3202 spin_lock(&cache->invalidation_lock); 3203 list_add(&req.list, &cache->invalidation_requests); 3204 spin_unlock(&cache->invalidation_lock); 3205 wake_worker(cache); 3206 3207 wait_event(req.result_wait, atomic_read(&req.complete)); 3208 return req.err; 3209 } 3210 3211 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3212 const char **cblock_ranges) 3213 { 3214 int r = 0; 3215 unsigned i; 3216 struct cblock_range range; 3217 3218 if (!passthrough_mode(&cache->features)) { 3219 DMERR("cache has to be in passthrough mode for invalidation"); 3220 return -EPERM; 3221 } 3222 3223 for (i = 0; i < count; i++) { 3224 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3225 if (r) 3226 break; 3227 3228 r = validate_cblock_range(cache, &range); 3229 if (r) 3230 break; 3231 3232 /* 3233 * Pass begin and end origin blocks to the worker and wake it. 3234 */ 3235 r = request_invalidation(cache, &range); 3236 if (r) 3237 break; 3238 } 3239 3240 return r; 3241 } 3242 3243 /* 3244 * Supports 3245 * "<key> <value>" 3246 * and 3247 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3248 * 3249 * The key migration_threshold is supported by the cache target core. 3250 */ 3251 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3252 { 3253 struct cache *cache = ti->private; 3254 3255 if (!argc) 3256 return -EINVAL; 3257 3258 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3259 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3260 3261 if (argc != 2) 3262 return -EINVAL; 3263 3264 return set_config_value(cache, argv[0], argv[1]); 3265 } 3266 3267 static int cache_iterate_devices(struct dm_target *ti, 3268 iterate_devices_callout_fn fn, void *data) 3269 { 3270 int r = 0; 3271 struct cache *cache = ti->private; 3272 3273 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3274 if (!r) 3275 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3276 3277 return r; 3278 } 3279 3280 /* 3281 * We assume I/O is going to the origin (which is the volume 3282 * more likely to have restrictions e.g. by being striped). 3283 * (Looking up the exact location of the data would be expensive 3284 * and could always be out of date by the time the bio is submitted.) 
3285 */ 3286 static int cache_bvec_merge(struct dm_target *ti, 3287 struct bvec_merge_data *bvm, 3288 struct bio_vec *biovec, int max_size) 3289 { 3290 struct cache *cache = ti->private; 3291 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); 3292 3293 if (!q->merge_bvec_fn) 3294 return max_size; 3295 3296 bvm->bi_bdev = cache->origin_dev->bdev; 3297 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 3298 } 3299 3300 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3301 { 3302 /* 3303 * FIXME: these limits may be incompatible with the cache device 3304 */ 3305 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3306 cache->origin_sectors); 3307 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3308 } 3309 3310 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3311 { 3312 struct cache *cache = ti->private; 3313 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3314 3315 /* 3316 * If the system-determined stacked limits are compatible with the 3317 * cache's blocksize (io_opt is a factor) do not override them. 3318 */ 3319 if (io_opt_sectors < cache->sectors_per_block || 3320 do_div(io_opt_sectors, cache->sectors_per_block)) { 3321 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3322 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3323 } 3324 set_discard_limits(cache, limits); 3325 } 3326 3327 /*----------------------------------------------------------------*/ 3328 3329 static struct target_type cache_target = { 3330 .name = "cache", 3331 .version = {1, 6, 0}, 3332 .module = THIS_MODULE, 3333 .ctr = cache_ctr, 3334 .dtr = cache_dtr, 3335 .map = cache_map, 3336 .end_io = cache_end_io, 3337 .postsuspend = cache_postsuspend, 3338 .preresume = cache_preresume, 3339 .resume = cache_resume, 3340 .status = cache_status, 3341 .message = cache_message, 3342 .iterate_devices = cache_iterate_devices, 3343 .merge = cache_bvec_merge, 3344 .io_hints = cache_io_hints, 3345 }; 3346 3347 static int __init dm_cache_init(void) 3348 { 3349 int r; 3350 3351 r = dm_register_target(&cache_target); 3352 if (r) { 3353 DMERR("cache target registration failed: %d", r); 3354 return r; 3355 } 3356 3357 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3358 if (!migration_cache) { 3359 dm_unregister_target(&cache_target); 3360 return -ENOMEM; 3361 } 3362 3363 return 0; 3364 } 3365 3366 static void __exit dm_cache_exit(void) 3367 { 3368 dm_unregister_target(&cache_target); 3369 kmem_cache_destroy(migration_cache); 3370 } 3371 3372 module_init(dm_cache_init); 3373 module_exit(dm_cache_exit); 3374 3375 MODULE_DESCRIPTION(DM_NAME " cache target"); 3376 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3377 MODULE_LICENSE("GPL"); 3378
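
/*
 * Example of driving the message interface from userspace.  The
 * device name, threshold and cblock ranges below are made-up,
 * illustrative values:
 *
 *    dmsetup message my-cache 0 migration_threshold 4096
 *    dmsetup message my-cache 0 invalidate_cblocks 2345 3000-3100
 *
 * The first form sets a core key/value pair (unrecognised keys are
 * passed on to the policy); invalidate_cblocks is only accepted while
 * the target is in passthrough mode.
 */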