1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/slab.h> 19 #include <linux/vmalloc.h> 20 21 #define DM_MSG_PREFIX "cache" 22 23 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 24 "A percentage of time allocated for copying to and/or from cache"); 25 26 /*----------------------------------------------------------------*/ 27 28 /* 29 * Glossary: 30 * 31 * oblock: index of an origin block 32 * cblock: index of a cache block 33 * promotion: movement of a block from origin to cache 34 * demotion: movement of a block from cache to origin 35 * migration: movement of a block between the origin and cache device, 36 * either direction 37 */ 38 39 /*----------------------------------------------------------------*/ 40 41 static size_t bitset_size_in_bytes(unsigned nr_entries) 42 { 43 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); 44 } 45 46 static unsigned long *alloc_bitset(unsigned nr_entries) 47 { 48 size_t s = bitset_size_in_bytes(nr_entries); 49 return vzalloc(s); 50 } 51 52 static void clear_bitset(void *bitset, unsigned nr_entries) 53 { 54 size_t s = bitset_size_in_bytes(nr_entries); 55 memset(bitset, 0, s); 56 } 57 58 static void free_bitset(unsigned long *bits) 59 { 60 vfree(bits); 61 } 62 63 /*----------------------------------------------------------------*/ 64 65 /* 66 * There are a couple of places where we let a bio run, but want to do some 67 * work before calling its endio function. We do this by temporarily 68 * changing the endio fn. 69 */ 70 struct dm_hook_info { 71 bio_end_io_t *bi_end_io; 72 void *bi_private; 73 }; 74 75 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 76 bio_end_io_t *bi_end_io, void *bi_private) 77 { 78 h->bi_end_io = bio->bi_end_io; 79 h->bi_private = bio->bi_private; 80 81 bio->bi_end_io = bi_end_io; 82 bio->bi_private = bi_private; 83 } 84 85 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 86 { 87 bio->bi_end_io = h->bi_end_io; 88 bio->bi_private = h->bi_private; 89 90 /* 91 * Must bump bi_remaining to allow bio to complete with 92 * restored bi_end_io. 93 */ 94 atomic_inc(&bio->bi_remaining); 95 } 96 97 /*----------------------------------------------------------------*/ 98 99 #define MIGRATION_POOL_SIZE 128 100 #define COMMIT_PERIOD HZ 101 #define MIGRATION_COUNT_WINDOW 10 102 103 /* 104 * The block size of the device holding cache data must be 105 * between 32KB and 1GB. 106 */ 107 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 108 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 109 110 /* 111 * FIXME: the cache is read/write for the time being. 112 */ 113 enum cache_metadata_mode { 114 CM_WRITE, /* metadata may be changed */ 115 CM_READ_ONLY, /* metadata may not be changed */ 116 }; 117 118 enum cache_io_mode { 119 /* 120 * Data is written to cached blocks only. These blocks are marked 121 * dirty. If you lose the cache device you will lose data. 122 * Potential performance increase for both reads and writes. 123 */ 124 CM_IO_WRITEBACK, 125 126 /* 127 * Data is written to both cache and origin. Blocks are never 128 * dirty. Potential performance benfit for reads only. 
129 */ 130 CM_IO_WRITETHROUGH, 131 132 /* 133 * A degraded mode useful for various cache coherency situations 134 * (eg, rolling back snapshots). Reads and writes always go to the 135 * origin. If a write goes to a cached oblock, then the cache 136 * block is invalidated. 137 */ 138 CM_IO_PASSTHROUGH 139 }; 140 141 struct cache_features { 142 enum cache_metadata_mode mode; 143 enum cache_io_mode io_mode; 144 }; 145 146 struct cache_stats { 147 atomic_t read_hit; 148 atomic_t read_miss; 149 atomic_t write_hit; 150 atomic_t write_miss; 151 atomic_t demotion; 152 atomic_t promotion; 153 atomic_t copies_avoided; 154 atomic_t cache_cell_clash; 155 atomic_t commit_count; 156 atomic_t discard_count; 157 }; 158 159 /* 160 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 161 * the one-past-the-end value. 162 */ 163 struct cblock_range { 164 dm_cblock_t begin; 165 dm_cblock_t end; 166 }; 167 168 struct invalidation_request { 169 struct list_head list; 170 struct cblock_range *cblocks; 171 172 atomic_t complete; 173 int err; 174 175 wait_queue_head_t result_wait; 176 }; 177 178 struct cache { 179 struct dm_target *ti; 180 struct dm_target_callbacks callbacks; 181 182 struct dm_cache_metadata *cmd; 183 184 /* 185 * Metadata is written to this device. 186 */ 187 struct dm_dev *metadata_dev; 188 189 /* 190 * The slower of the two data devices. Typically a spindle. 191 */ 192 struct dm_dev *origin_dev; 193 194 /* 195 * The faster of the two data devices. Typically an SSD. 196 */ 197 struct dm_dev *cache_dev; 198 199 /* 200 * Size of the origin device in _complete_ blocks and native sectors. 201 */ 202 dm_oblock_t origin_blocks; 203 sector_t origin_sectors; 204 205 /* 206 * Size of the cache device in blocks. 207 */ 208 dm_cblock_t cache_size; 209 210 /* 211 * Fields for converting from sectors to blocks. 212 */ 213 uint32_t sectors_per_block; 214 int sectors_per_block_shift; 215 216 spinlock_t lock; 217 struct bio_list deferred_bios; 218 struct bio_list deferred_flush_bios; 219 struct bio_list deferred_writethrough_bios; 220 struct list_head quiesced_migrations; 221 struct list_head completed_migrations; 222 struct list_head need_commit_migrations; 223 sector_t migration_threshold; 224 wait_queue_head_t migration_wait; 225 atomic_t nr_allocated_migrations; 226 227 /* 228 * The number of in flight migrations that are performing 229 * background io. eg, promotion, writeback. 230 */ 231 atomic_t nr_io_migrations; 232 233 wait_queue_head_t quiescing_wait; 234 atomic_t quiescing; 235 atomic_t quiescing_ack; 236 237 /* 238 * cache_size entries, dirty if set 239 */ 240 atomic_t nr_dirty; 241 unsigned long *dirty_bitset; 242 243 /* 244 * origin_blocks entries, discarded if set. 245 */ 246 dm_dblock_t discard_nr_blocks; 247 unsigned long *discard_bitset; 248 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 249 250 /* 251 * Rather than reconstructing the table line for the status we just 252 * save it and regurgitate. 
253 */ 254 unsigned nr_ctr_args; 255 const char **ctr_args; 256 257 struct dm_kcopyd_client *copier; 258 struct workqueue_struct *wq; 259 struct work_struct worker; 260 261 struct delayed_work waker; 262 unsigned long last_commit_jiffies; 263 264 struct dm_bio_prison *prison; 265 struct dm_deferred_set *all_io_ds; 266 267 mempool_t *migration_pool; 268 269 struct dm_cache_policy *policy; 270 unsigned policy_nr_args; 271 272 bool need_tick_bio:1; 273 bool sized:1; 274 bool invalidate:1; 275 bool commit_requested:1; 276 bool loaded_mappings:1; 277 bool loaded_discards:1; 278 279 /* 280 * Cache features such as write-through. 281 */ 282 struct cache_features features; 283 284 struct cache_stats stats; 285 286 /* 287 * Invalidation fields. 288 */ 289 spinlock_t invalidation_lock; 290 struct list_head invalidation_requests; 291 }; 292 293 struct per_bio_data { 294 bool tick:1; 295 unsigned req_nr:2; 296 struct dm_deferred_entry *all_io_entry; 297 struct dm_hook_info hook_info; 298 299 /* 300 * writethrough fields. These MUST remain at the end of this 301 * structure and the 'cache' member must be the first as it 302 * is used to determine the offset of the writethrough fields. 303 */ 304 struct cache *cache; 305 dm_cblock_t cblock; 306 struct dm_bio_details bio_details; 307 }; 308 309 struct dm_cache_migration { 310 struct list_head list; 311 struct cache *cache; 312 313 unsigned long start_jiffies; 314 dm_oblock_t old_oblock; 315 dm_oblock_t new_oblock; 316 dm_cblock_t cblock; 317 318 bool err:1; 319 bool discard:1; 320 bool writeback:1; 321 bool demote:1; 322 bool promote:1; 323 bool requeue_holder:1; 324 bool invalidate:1; 325 326 struct dm_bio_prison_cell *old_ocell; 327 struct dm_bio_prison_cell *new_ocell; 328 }; 329 330 /* 331 * Processing a bio in the worker thread may require these memory 332 * allocations. We prealloc to avoid deadlocks (the same worker thread 333 * frees them back to the mempool). 334 */ 335 struct prealloc { 336 struct dm_cache_migration *mg; 337 struct dm_bio_prison_cell *cell1; 338 struct dm_bio_prison_cell *cell2; 339 }; 340 341 static void wake_worker(struct cache *cache) 342 { 343 queue_work(cache->wq, &cache->worker); 344 } 345 346 /*----------------------------------------------------------------*/ 347 348 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 349 { 350 /* FIXME: change to use a local slab. 
*/ 351 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 352 } 353 354 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 355 { 356 dm_bio_prison_free_cell(cache->prison, cell); 357 } 358 359 static struct dm_cache_migration *alloc_migration(struct cache *cache) 360 { 361 struct dm_cache_migration *mg; 362 363 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 364 if (mg) { 365 mg->cache = cache; 366 atomic_inc(&mg->cache->nr_allocated_migrations); 367 } 368 369 return mg; 370 } 371 372 static void free_migration(struct dm_cache_migration *mg) 373 { 374 if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations)) 375 wake_up(&mg->cache->migration_wait); 376 377 mempool_free(mg, mg->cache->migration_pool); 378 } 379 380 static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 381 { 382 if (!p->mg) { 383 p->mg = alloc_migration(cache); 384 if (!p->mg) 385 return -ENOMEM; 386 } 387 388 if (!p->cell1) { 389 p->cell1 = alloc_prison_cell(cache); 390 if (!p->cell1) 391 return -ENOMEM; 392 } 393 394 if (!p->cell2) { 395 p->cell2 = alloc_prison_cell(cache); 396 if (!p->cell2) 397 return -ENOMEM; 398 } 399 400 return 0; 401 } 402 403 static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 404 { 405 if (p->cell2) 406 free_prison_cell(cache, p->cell2); 407 408 if (p->cell1) 409 free_prison_cell(cache, p->cell1); 410 411 if (p->mg) 412 free_migration(p->mg); 413 } 414 415 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 416 { 417 struct dm_cache_migration *mg = p->mg; 418 419 BUG_ON(!mg); 420 p->mg = NULL; 421 422 return mg; 423 } 424 425 /* 426 * You must have a cell within the prealloc struct to return. If not this 427 * function will BUG() rather than returning NULL. 428 */ 429 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 430 { 431 struct dm_bio_prison_cell *r = NULL; 432 433 if (p->cell1) { 434 r = p->cell1; 435 p->cell1 = NULL; 436 437 } else if (p->cell2) { 438 r = p->cell2; 439 p->cell2 = NULL; 440 } else 441 BUG(); 442 443 return r; 444 } 445 446 /* 447 * You can't have more than two cells in a prealloc struct. BUG() will be 448 * called if you try and overfill. 449 */ 450 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 451 { 452 if (!p->cell2) 453 p->cell2 = cell; 454 455 else if (!p->cell1) 456 p->cell1 = cell; 457 458 else 459 BUG(); 460 } 461 462 /*----------------------------------------------------------------*/ 463 464 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 465 { 466 key->virtual = 0; 467 key->dev = 0; 468 key->block_begin = from_oblock(begin); 469 key->block_end = from_oblock(end); 470 } 471 472 /* 473 * The caller hands in a preallocated cell, and a free function for it. 474 * The cell will be freed if there's an error, or if it wasn't used because 475 * a cell with that key already exists. 
476 */ 477 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 478 479 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 480 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 481 cell_free_fn free_fn, void *free_context, 482 struct dm_bio_prison_cell **cell_result) 483 { 484 int r; 485 struct dm_cell_key key; 486 487 build_key(oblock_begin, oblock_end, &key); 488 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 489 if (r) 490 free_fn(free_context, cell_prealloc); 491 492 return r; 493 } 494 495 static int bio_detain(struct cache *cache, dm_oblock_t oblock, 496 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 497 cell_free_fn free_fn, void *free_context, 498 struct dm_bio_prison_cell **cell_result) 499 { 500 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 501 return bio_detain_range(cache, oblock, end, bio, 502 cell_prealloc, free_fn, free_context, cell_result); 503 } 504 505 static int get_cell(struct cache *cache, 506 dm_oblock_t oblock, 507 struct prealloc *structs, 508 struct dm_bio_prison_cell **cell_result) 509 { 510 int r; 511 struct dm_cell_key key; 512 struct dm_bio_prison_cell *cell_prealloc; 513 514 cell_prealloc = prealloc_get_cell(structs); 515 516 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 517 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 518 if (r) 519 prealloc_put_cell(structs, cell_prealloc); 520 521 return r; 522 } 523 524 /*----------------------------------------------------------------*/ 525 526 static bool is_dirty(struct cache *cache, dm_cblock_t b) 527 { 528 return test_bit(from_cblock(b), cache->dirty_bitset); 529 } 530 531 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 532 { 533 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 534 atomic_inc(&cache->nr_dirty); 535 policy_set_dirty(cache->policy, oblock); 536 } 537 } 538 539 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 540 { 541 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 542 policy_clear_dirty(cache->policy, oblock); 543 if (atomic_dec_return(&cache->nr_dirty) == 0) 544 dm_table_event(cache->ti->table); 545 } 546 } 547 548 /*----------------------------------------------------------------*/ 549 550 static bool block_size_is_power_of_two(struct cache *cache) 551 { 552 return cache->sectors_per_block_shift >= 0; 553 } 554 555 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 556 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 557 __always_inline 558 #endif 559 static dm_block_t block_div(dm_block_t b, uint32_t n) 560 { 561 do_div(b, n); 562 563 return b; 564 } 565 566 static dm_block_t oblocks_per_dblock(struct cache *cache) 567 { 568 dm_block_t oblocks = cache->discard_block_size; 569 570 if (block_size_is_power_of_two(cache)) 571 oblocks >>= cache->sectors_per_block_shift; 572 else 573 oblocks = block_div(oblocks, cache->sectors_per_block); 574 575 return oblocks; 576 } 577 578 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 579 { 580 return to_dblock(block_div(from_oblock(oblock), 581 oblocks_per_dblock(cache))); 582 } 583 584 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) 585 { 586 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); 587 } 588 589 static void set_discard(struct cache *cache, dm_dblock_t b) 590 { 591 
unsigned long flags; 592 593 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 594 atomic_inc(&cache->stats.discard_count); 595 596 spin_lock_irqsave(&cache->lock, flags); 597 set_bit(from_dblock(b), cache->discard_bitset); 598 spin_unlock_irqrestore(&cache->lock, flags); 599 } 600 601 static void clear_discard(struct cache *cache, dm_dblock_t b) 602 { 603 unsigned long flags; 604 605 spin_lock_irqsave(&cache->lock, flags); 606 clear_bit(from_dblock(b), cache->discard_bitset); 607 spin_unlock_irqrestore(&cache->lock, flags); 608 } 609 610 static bool is_discarded(struct cache *cache, dm_dblock_t b) 611 { 612 int r; 613 unsigned long flags; 614 615 spin_lock_irqsave(&cache->lock, flags); 616 r = test_bit(from_dblock(b), cache->discard_bitset); 617 spin_unlock_irqrestore(&cache->lock, flags); 618 619 return r; 620 } 621 622 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 623 { 624 int r; 625 unsigned long flags; 626 627 spin_lock_irqsave(&cache->lock, flags); 628 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 629 cache->discard_bitset); 630 spin_unlock_irqrestore(&cache->lock, flags); 631 632 return r; 633 } 634 635 /*----------------------------------------------------------------*/ 636 637 static void load_stats(struct cache *cache) 638 { 639 struct dm_cache_statistics stats; 640 641 dm_cache_metadata_get_stats(cache->cmd, &stats); 642 atomic_set(&cache->stats.read_hit, stats.read_hits); 643 atomic_set(&cache->stats.read_miss, stats.read_misses); 644 atomic_set(&cache->stats.write_hit, stats.write_hits); 645 atomic_set(&cache->stats.write_miss, stats.write_misses); 646 } 647 648 static void save_stats(struct cache *cache) 649 { 650 struct dm_cache_statistics stats; 651 652 stats.read_hits = atomic_read(&cache->stats.read_hit); 653 stats.read_misses = atomic_read(&cache->stats.read_miss); 654 stats.write_hits = atomic_read(&cache->stats.write_hit); 655 stats.write_misses = atomic_read(&cache->stats.write_miss); 656 657 dm_cache_metadata_set_stats(cache->cmd, &stats); 658 } 659 660 /*---------------------------------------------------------------- 661 * Per bio data 662 *--------------------------------------------------------------*/ 663 664 /* 665 * If using writeback, leave out struct per_bio_data's writethrough fields. 666 */ 667 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 668 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 669 670 static bool writethrough_mode(struct cache_features *f) 671 { 672 return f->io_mode == CM_IO_WRITETHROUGH; 673 } 674 675 static bool writeback_mode(struct cache_features *f) 676 { 677 return f->io_mode == CM_IO_WRITEBACK; 678 } 679 680 static bool passthrough_mode(struct cache_features *f) 681 { 682 return f->io_mode == CM_IO_PASSTHROUGH; 683 } 684 685 static size_t get_per_bio_data_size(struct cache *cache) 686 { 687 return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 688 } 689 690 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 691 { 692 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 693 BUG_ON(!pb); 694 return pb; 695 } 696 697 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 698 { 699 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 700 701 pb->tick = false; 702 pb->req_nr = dm_bio_get_target_bio_nr(bio); 703 pb->all_io_entry = NULL; 704 705 return pb; 706 } 707 708 /*---------------------------------------------------------------- 709 * Remapping 710 *--------------------------------------------------------------*/ 711 static void remap_to_origin(struct cache *cache, struct bio *bio) 712 { 713 bio->bi_bdev = cache->origin_dev->bdev; 714 } 715 716 static void remap_to_cache(struct cache *cache, struct bio *bio, 717 dm_cblock_t cblock) 718 { 719 sector_t bi_sector = bio->bi_iter.bi_sector; 720 sector_t block = from_cblock(cblock); 721 722 bio->bi_bdev = cache->cache_dev->bdev; 723 if (!block_size_is_power_of_two(cache)) 724 bio->bi_iter.bi_sector = 725 (block * cache->sectors_per_block) + 726 sector_div(bi_sector, cache->sectors_per_block); 727 else 728 bio->bi_iter.bi_sector = 729 (block << cache->sectors_per_block_shift) | 730 (bi_sector & (cache->sectors_per_block - 1)); 731 } 732 733 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 734 { 735 unsigned long flags; 736 size_t pb_data_size = get_per_bio_data_size(cache); 737 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 738 739 spin_lock_irqsave(&cache->lock, flags); 740 if (cache->need_tick_bio && 741 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) { 742 pb->tick = true; 743 cache->need_tick_bio = false; 744 } 745 spin_unlock_irqrestore(&cache->lock, flags); 746 } 747 748 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 749 dm_oblock_t oblock) 750 { 751 check_if_tick_bio_needed(cache, bio); 752 remap_to_origin(cache, bio); 753 if (bio_data_dir(bio) == WRITE) 754 clear_discard(cache, oblock_to_dblock(cache, oblock)); 755 } 756 757 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 758 dm_oblock_t oblock, dm_cblock_t cblock) 759 { 760 check_if_tick_bio_needed(cache, bio); 761 remap_to_cache(cache, bio, cblock); 762 if (bio_data_dir(bio) == WRITE) { 763 set_dirty(cache, oblock, cblock); 764 clear_discard(cache, oblock_to_dblock(cache, oblock)); 765 } 766 } 767 768 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 769 { 770 sector_t block_nr = bio->bi_iter.bi_sector; 771 772 if (!block_size_is_power_of_two(cache)) 773 (void) sector_div(block_nr, cache->sectors_per_block); 774 else 775 block_nr >>= cache->sectors_per_block_shift; 776 777 return to_oblock(block_nr); 778 } 779 780 static int bio_triggers_commit(struct cache *cache, struct bio *bio) 781 { 782 return bio->bi_rw & (REQ_FLUSH | REQ_FUA); 783 } 784 785 /* 786 * You must increment the deferred set whilst the prison cell is held. To 787 * encourage this, we ask for 'cell' to be passed in. 
788 */ 789 static void inc_ds(struct cache *cache, struct bio *bio, 790 struct dm_bio_prison_cell *cell) 791 { 792 size_t pb_data_size = get_per_bio_data_size(cache); 793 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 794 795 BUG_ON(!cell); 796 BUG_ON(pb->all_io_entry); 797 798 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 799 } 800 801 static void issue(struct cache *cache, struct bio *bio) 802 { 803 unsigned long flags; 804 805 if (!bio_triggers_commit(cache, bio)) { 806 generic_make_request(bio); 807 return; 808 } 809 810 /* 811 * Batch together any bios that trigger commits and then issue a 812 * single commit for them in do_worker(). 813 */ 814 spin_lock_irqsave(&cache->lock, flags); 815 cache->commit_requested = true; 816 bio_list_add(&cache->deferred_flush_bios, bio); 817 spin_unlock_irqrestore(&cache->lock, flags); 818 } 819 820 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) 821 { 822 inc_ds(cache, bio, cell); 823 issue(cache, bio); 824 } 825 826 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 827 { 828 unsigned long flags; 829 830 spin_lock_irqsave(&cache->lock, flags); 831 bio_list_add(&cache->deferred_writethrough_bios, bio); 832 spin_unlock_irqrestore(&cache->lock, flags); 833 834 wake_worker(cache); 835 } 836 837 static void writethrough_endio(struct bio *bio, int err) 838 { 839 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 840 841 dm_unhook_bio(&pb->hook_info, bio); 842 843 if (err) { 844 bio_endio(bio, err); 845 return; 846 } 847 848 dm_bio_restore(&pb->bio_details, bio); 849 remap_to_cache(pb->cache, bio, pb->cblock); 850 851 /* 852 * We can't issue this bio directly, since we're in interrupt 853 * context. So it gets put on a bio list for processing by the 854 * worker thread. 855 */ 856 defer_writethrough_bio(pb->cache, bio); 857 } 858 859 /* 860 * When running in writethrough mode we need to send writes to clean blocks 861 * to both the cache and origin devices. In future we'd like to clone the 862 * bio and send them in parallel, but for now we're doing them in 863 * series as this is easier. 864 */ 865 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 866 dm_oblock_t oblock, dm_cblock_t cblock) 867 { 868 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 869 870 pb->cache = cache; 871 pb->cblock = cblock; 872 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); 873 dm_bio_record(&pb->bio_details, bio); 874 875 remap_to_origin_clear_discard(pb->cache, bio, oblock); 876 } 877 878 /*---------------------------------------------------------------- 879 * Migration processing 880 * 881 * Migration covers moving data from the origin device to the cache, or 882 * vice versa. 883 *--------------------------------------------------------------*/ 884 static void inc_io_migrations(struct cache *cache) 885 { 886 atomic_inc(&cache->nr_io_migrations); 887 } 888 889 static void dec_io_migrations(struct cache *cache) 890 { 891 atomic_dec(&cache->nr_io_migrations); 892 } 893 894 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, 895 bool holder) 896 { 897 (holder ? 
dm_cell_release : dm_cell_release_no_holder) 898 (cache->prison, cell, &cache->deferred_bios); 899 free_prison_cell(cache, cell); 900 } 901 902 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, 903 bool holder) 904 { 905 unsigned long flags; 906 907 spin_lock_irqsave(&cache->lock, flags); 908 __cell_defer(cache, cell, holder); 909 spin_unlock_irqrestore(&cache->lock, flags); 910 911 wake_worker(cache); 912 } 913 914 static void free_io_migration(struct dm_cache_migration *mg) 915 { 916 dec_io_migrations(mg->cache); 917 free_migration(mg); 918 } 919 920 static void migration_failure(struct dm_cache_migration *mg) 921 { 922 struct cache *cache = mg->cache; 923 924 if (mg->writeback) { 925 DMWARN_LIMIT("writeback failed; couldn't copy block"); 926 set_dirty(cache, mg->old_oblock, mg->cblock); 927 cell_defer(cache, mg->old_ocell, false); 928 929 } else if (mg->demote) { 930 DMWARN_LIMIT("demotion failed; couldn't copy block"); 931 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 932 933 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 934 if (mg->promote) 935 cell_defer(cache, mg->new_ocell, true); 936 } else { 937 DMWARN_LIMIT("promotion failed; couldn't copy block"); 938 policy_remove_mapping(cache->policy, mg->new_oblock); 939 cell_defer(cache, mg->new_ocell, true); 940 } 941 942 free_io_migration(mg); 943 } 944 945 static void migration_success_pre_commit(struct dm_cache_migration *mg) 946 { 947 unsigned long flags; 948 struct cache *cache = mg->cache; 949 950 if (mg->writeback) { 951 clear_dirty(cache, mg->old_oblock, mg->cblock); 952 cell_defer(cache, mg->old_ocell, false); 953 free_io_migration(mg); 954 return; 955 956 } else if (mg->demote) { 957 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) { 958 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata"); 959 policy_force_mapping(cache->policy, mg->new_oblock, 960 mg->old_oblock); 961 if (mg->promote) 962 cell_defer(cache, mg->new_ocell, true); 963 free_io_migration(mg); 964 return; 965 } 966 } else { 967 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { 968 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); 969 policy_remove_mapping(cache->policy, mg->new_oblock); 970 free_io_migration(mg); 971 return; 972 } 973 } 974 975 spin_lock_irqsave(&cache->lock, flags); 976 list_add_tail(&mg->list, &cache->need_commit_migrations); 977 cache->commit_requested = true; 978 spin_unlock_irqrestore(&cache->lock, flags); 979 } 980 981 static void migration_success_post_commit(struct dm_cache_migration *mg) 982 { 983 unsigned long flags; 984 struct cache *cache = mg->cache; 985 986 if (mg->writeback) { 987 DMWARN("writeback unexpectedly triggered commit"); 988 return; 989 990 } else if (mg->demote) { 991 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 992 993 if (mg->promote) { 994 mg->demote = false; 995 996 spin_lock_irqsave(&cache->lock, flags); 997 list_add_tail(&mg->list, &cache->quiesced_migrations); 998 spin_unlock_irqrestore(&cache->lock, flags); 999 1000 } else { 1001 if (mg->invalidate) 1002 policy_remove_mapping(cache->policy, mg->old_oblock); 1003 free_io_migration(mg); 1004 } 1005 1006 } else { 1007 if (mg->requeue_holder) { 1008 clear_dirty(cache, mg->new_oblock, mg->cblock); 1009 cell_defer(cache, mg->new_ocell, true); 1010 } else { 1011 /* 1012 * The block was promoted via an overwrite, so it's dirty. 
1013 */ 1014 set_dirty(cache, mg->new_oblock, mg->cblock); 1015 bio_endio(mg->new_ocell->holder, 0); 1016 cell_defer(cache, mg->new_ocell, false); 1017 } 1018 free_io_migration(mg); 1019 } 1020 } 1021 1022 static void copy_complete(int read_err, unsigned long write_err, void *context) 1023 { 1024 unsigned long flags; 1025 struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 1026 struct cache *cache = mg->cache; 1027 1028 if (read_err || write_err) 1029 mg->err = true; 1030 1031 spin_lock_irqsave(&cache->lock, flags); 1032 list_add_tail(&mg->list, &cache->completed_migrations); 1033 spin_unlock_irqrestore(&cache->lock, flags); 1034 1035 wake_worker(cache); 1036 } 1037 1038 static void issue_copy(struct dm_cache_migration *mg) 1039 { 1040 int r; 1041 struct dm_io_region o_region, c_region; 1042 struct cache *cache = mg->cache; 1043 sector_t cblock = from_cblock(mg->cblock); 1044 1045 o_region.bdev = cache->origin_dev->bdev; 1046 o_region.count = cache->sectors_per_block; 1047 1048 c_region.bdev = cache->cache_dev->bdev; 1049 c_region.sector = cblock * cache->sectors_per_block; 1050 c_region.count = cache->sectors_per_block; 1051 1052 if (mg->writeback || mg->demote) { 1053 /* demote */ 1054 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1055 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1056 } else { 1057 /* promote */ 1058 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 1059 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 1060 } 1061 1062 if (r < 0) { 1063 DMERR_LIMIT("issuing migration failed"); 1064 migration_failure(mg); 1065 } 1066 } 1067 1068 static void overwrite_endio(struct bio *bio, int err) 1069 { 1070 struct dm_cache_migration *mg = bio->bi_private; 1071 struct cache *cache = mg->cache; 1072 size_t pb_data_size = get_per_bio_data_size(cache); 1073 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1074 unsigned long flags; 1075 1076 dm_unhook_bio(&pb->hook_info, bio); 1077 1078 if (err) 1079 mg->err = true; 1080 1081 mg->requeue_holder = false; 1082 1083 spin_lock_irqsave(&cache->lock, flags); 1084 list_add_tail(&mg->list, &cache->completed_migrations); 1085 spin_unlock_irqrestore(&cache->lock, flags); 1086 1087 wake_worker(cache); 1088 } 1089 1090 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1091 { 1092 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1093 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1094 1095 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1096 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); 1097 1098 /* 1099 * No need to inc_ds() here, since the cell will be held for the 1100 * duration of the io. 
1101 */ 1102 generic_make_request(bio); 1103 } 1104 1105 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1106 { 1107 return (bio_data_dir(bio) == WRITE) && 1108 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1109 } 1110 1111 static void avoid_copy(struct dm_cache_migration *mg) 1112 { 1113 atomic_inc(&mg->cache->stats.copies_avoided); 1114 migration_success_pre_commit(mg); 1115 } 1116 1117 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1118 dm_dblock_t *b, dm_dblock_t *e) 1119 { 1120 sector_t sb = bio->bi_iter.bi_sector; 1121 sector_t se = bio_end_sector(bio); 1122 1123 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1124 1125 if (se - sb < cache->discard_block_size) 1126 *e = *b; 1127 else 1128 *e = to_dblock(block_div(se, cache->discard_block_size)); 1129 } 1130 1131 static void issue_discard(struct dm_cache_migration *mg) 1132 { 1133 dm_dblock_t b, e; 1134 struct bio *bio = mg->new_ocell->holder; 1135 1136 calc_discard_block_range(mg->cache, bio, &b, &e); 1137 while (b != e) { 1138 set_discard(mg->cache, b); 1139 b = to_dblock(from_dblock(b) + 1); 1140 } 1141 1142 bio_endio(bio, 0); 1143 cell_defer(mg->cache, mg->new_ocell, false); 1144 free_migration(mg); 1145 } 1146 1147 static void issue_copy_or_discard(struct dm_cache_migration *mg) 1148 { 1149 bool avoid; 1150 struct cache *cache = mg->cache; 1151 1152 if (mg->discard) { 1153 issue_discard(mg); 1154 return; 1155 } 1156 1157 if (mg->writeback || mg->demote) 1158 avoid = !is_dirty(cache, mg->cblock) || 1159 is_discarded_oblock(cache, mg->old_oblock); 1160 else { 1161 struct bio *bio = mg->new_ocell->holder; 1162 1163 avoid = is_discarded_oblock(cache, mg->new_oblock); 1164 1165 if (writeback_mode(&cache->features) && 1166 !avoid && bio_writes_complete_block(cache, bio)) { 1167 issue_overwrite(mg, bio); 1168 return; 1169 } 1170 } 1171 1172 avoid ? 
avoid_copy(mg) : issue_copy(mg); 1173 } 1174 1175 static void complete_migration(struct dm_cache_migration *mg) 1176 { 1177 if (mg->err) 1178 migration_failure(mg); 1179 else 1180 migration_success_pre_commit(mg); 1181 } 1182 1183 static void process_migrations(struct cache *cache, struct list_head *head, 1184 void (*fn)(struct dm_cache_migration *)) 1185 { 1186 unsigned long flags; 1187 struct list_head list; 1188 struct dm_cache_migration *mg, *tmp; 1189 1190 INIT_LIST_HEAD(&list); 1191 spin_lock_irqsave(&cache->lock, flags); 1192 list_splice_init(head, &list); 1193 spin_unlock_irqrestore(&cache->lock, flags); 1194 1195 list_for_each_entry_safe(mg, tmp, &list, list) 1196 fn(mg); 1197 } 1198 1199 static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1200 { 1201 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1202 } 1203 1204 static void queue_quiesced_migration(struct dm_cache_migration *mg) 1205 { 1206 unsigned long flags; 1207 struct cache *cache = mg->cache; 1208 1209 spin_lock_irqsave(&cache->lock, flags); 1210 __queue_quiesced_migration(mg); 1211 spin_unlock_irqrestore(&cache->lock, flags); 1212 1213 wake_worker(cache); 1214 } 1215 1216 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1217 { 1218 unsigned long flags; 1219 struct dm_cache_migration *mg, *tmp; 1220 1221 spin_lock_irqsave(&cache->lock, flags); 1222 list_for_each_entry_safe(mg, tmp, work, list) 1223 __queue_quiesced_migration(mg); 1224 spin_unlock_irqrestore(&cache->lock, flags); 1225 1226 wake_worker(cache); 1227 } 1228 1229 static void check_for_quiesced_migrations(struct cache *cache, 1230 struct per_bio_data *pb) 1231 { 1232 struct list_head work; 1233 1234 if (!pb->all_io_entry) 1235 return; 1236 1237 INIT_LIST_HEAD(&work); 1238 dm_deferred_entry_dec(pb->all_io_entry, &work); 1239 1240 if (!list_empty(&work)) 1241 queue_quiesced_migrations(cache, &work); 1242 } 1243 1244 static void quiesce_migration(struct dm_cache_migration *mg) 1245 { 1246 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 1247 queue_quiesced_migration(mg); 1248 } 1249 1250 static void promote(struct cache *cache, struct prealloc *structs, 1251 dm_oblock_t oblock, dm_cblock_t cblock, 1252 struct dm_bio_prison_cell *cell) 1253 { 1254 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1255 1256 mg->err = false; 1257 mg->discard = false; 1258 mg->writeback = false; 1259 mg->demote = false; 1260 mg->promote = true; 1261 mg->requeue_holder = true; 1262 mg->invalidate = false; 1263 mg->cache = cache; 1264 mg->new_oblock = oblock; 1265 mg->cblock = cblock; 1266 mg->old_ocell = NULL; 1267 mg->new_ocell = cell; 1268 mg->start_jiffies = jiffies; 1269 1270 inc_io_migrations(cache); 1271 quiesce_migration(mg); 1272 } 1273 1274 static void writeback(struct cache *cache, struct prealloc *structs, 1275 dm_oblock_t oblock, dm_cblock_t cblock, 1276 struct dm_bio_prison_cell *cell) 1277 { 1278 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1279 1280 mg->err = false; 1281 mg->discard = false; 1282 mg->writeback = true; 1283 mg->demote = false; 1284 mg->promote = false; 1285 mg->requeue_holder = true; 1286 mg->invalidate = false; 1287 mg->cache = cache; 1288 mg->old_oblock = oblock; 1289 mg->cblock = cblock; 1290 mg->old_ocell = cell; 1291 mg->new_ocell = NULL; 1292 mg->start_jiffies = jiffies; 1293 1294 inc_io_migrations(cache); 1295 quiesce_migration(mg); 1296 } 1297 1298 static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1299 
dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1300 dm_cblock_t cblock, 1301 struct dm_bio_prison_cell *old_ocell, 1302 struct dm_bio_prison_cell *new_ocell) 1303 { 1304 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1305 1306 mg->err = false; 1307 mg->discard = false; 1308 mg->writeback = false; 1309 mg->demote = true; 1310 mg->promote = true; 1311 mg->requeue_holder = true; 1312 mg->invalidate = false; 1313 mg->cache = cache; 1314 mg->old_oblock = old_oblock; 1315 mg->new_oblock = new_oblock; 1316 mg->cblock = cblock; 1317 mg->old_ocell = old_ocell; 1318 mg->new_ocell = new_ocell; 1319 mg->start_jiffies = jiffies; 1320 1321 inc_io_migrations(cache); 1322 quiesce_migration(mg); 1323 } 1324 1325 /* 1326 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1327 * block are thrown away. 1328 */ 1329 static void invalidate(struct cache *cache, struct prealloc *structs, 1330 dm_oblock_t oblock, dm_cblock_t cblock, 1331 struct dm_bio_prison_cell *cell) 1332 { 1333 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1334 1335 mg->err = false; 1336 mg->discard = false; 1337 mg->writeback = false; 1338 mg->demote = true; 1339 mg->promote = false; 1340 mg->requeue_holder = true; 1341 mg->invalidate = true; 1342 mg->cache = cache; 1343 mg->old_oblock = oblock; 1344 mg->cblock = cblock; 1345 mg->old_ocell = cell; 1346 mg->new_ocell = NULL; 1347 mg->start_jiffies = jiffies; 1348 1349 inc_io_migrations(cache); 1350 quiesce_migration(mg); 1351 } 1352 1353 static void discard(struct cache *cache, struct prealloc *structs, 1354 struct dm_bio_prison_cell *cell) 1355 { 1356 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1357 1358 mg->err = false; 1359 mg->discard = true; 1360 mg->writeback = false; 1361 mg->demote = false; 1362 mg->promote = false; 1363 mg->requeue_holder = false; 1364 mg->invalidate = false; 1365 mg->cache = cache; 1366 mg->old_ocell = NULL; 1367 mg->new_ocell = cell; 1368 mg->start_jiffies = jiffies; 1369 1370 quiesce_migration(mg); 1371 } 1372 1373 /*---------------------------------------------------------------- 1374 * bio processing 1375 *--------------------------------------------------------------*/ 1376 static void defer_bio(struct cache *cache, struct bio *bio) 1377 { 1378 unsigned long flags; 1379 1380 spin_lock_irqsave(&cache->lock, flags); 1381 bio_list_add(&cache->deferred_bios, bio); 1382 spin_unlock_irqrestore(&cache->lock, flags); 1383 1384 wake_worker(cache); 1385 } 1386 1387 static void process_flush_bio(struct cache *cache, struct bio *bio) 1388 { 1389 size_t pb_data_size = get_per_bio_data_size(cache); 1390 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1391 1392 BUG_ON(bio->bi_iter.bi_size); 1393 if (!pb->req_nr) 1394 remap_to_origin(cache, bio); 1395 else 1396 remap_to_cache(cache, bio, 0); 1397 1398 /* 1399 * REQ_FLUSH is not directed at any particular block so we don't 1400 * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH 1401 * by dm-core. 
1402 */ 1403 issue(cache, bio); 1404 } 1405 1406 static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1407 struct bio *bio) 1408 { 1409 int r; 1410 dm_dblock_t b, e; 1411 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1412 1413 calc_discard_block_range(cache, bio, &b, &e); 1414 if (b == e) { 1415 bio_endio(bio, 0); 1416 return; 1417 } 1418 1419 cell_prealloc = prealloc_get_cell(structs); 1420 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1421 (cell_free_fn) prealloc_put_cell, 1422 structs, &new_ocell); 1423 if (r > 0) 1424 return; 1425 1426 discard(cache, structs, new_ocell); 1427 } 1428 1429 static bool spare_migration_bandwidth(struct cache *cache) 1430 { 1431 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1432 cache->sectors_per_block; 1433 return current_volume < cache->migration_threshold; 1434 } 1435 1436 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1437 { 1438 atomic_inc(bio_data_dir(bio) == READ ? 1439 &cache->stats.read_hit : &cache->stats.write_hit); 1440 } 1441 1442 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1443 { 1444 atomic_inc(bio_data_dir(bio) == READ ? 1445 &cache->stats.read_miss : &cache->stats.write_miss); 1446 } 1447 1448 static void process_bio(struct cache *cache, struct prealloc *structs, 1449 struct bio *bio) 1450 { 1451 int r; 1452 bool release_cell = true; 1453 dm_oblock_t block = get_bio_block(cache, bio); 1454 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; 1455 struct policy_result lookup_result; 1456 bool passthrough = passthrough_mode(&cache->features); 1457 bool discarded_block, can_migrate; 1458 1459 /* 1460 * Check to see if that block is currently migrating. 1461 */ 1462 cell_prealloc = prealloc_get_cell(structs); 1463 r = bio_detain(cache, block, bio, cell_prealloc, 1464 (cell_free_fn) prealloc_put_cell, 1465 structs, &new_ocell); 1466 if (r > 0) 1467 return; 1468 1469 discarded_block = is_discarded_oblock(cache, block); 1470 can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); 1471 1472 r = policy_map(cache->policy, block, true, can_migrate, discarded_block, 1473 bio, &lookup_result); 1474 1475 if (r == -EWOULDBLOCK) 1476 /* migration has been denied */ 1477 lookup_result.op = POLICY_MISS; 1478 1479 switch (lookup_result.op) { 1480 case POLICY_HIT: 1481 if (passthrough) { 1482 inc_miss_counter(cache, bio); 1483 1484 /* 1485 * Passthrough always maps to the origin, 1486 * invalidating any cache blocks that are written 1487 * to. 
1488 */ 1489 1490 if (bio_data_dir(bio) == WRITE) { 1491 atomic_inc(&cache->stats.demotion); 1492 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1493 release_cell = false; 1494 1495 } else { 1496 /* FIXME: factor out issue_origin() */ 1497 remap_to_origin_clear_discard(cache, bio, block); 1498 inc_and_issue(cache, bio, new_ocell); 1499 } 1500 } else { 1501 inc_hit_counter(cache, bio); 1502 1503 if (bio_data_dir(bio) == WRITE && 1504 writethrough_mode(&cache->features) && 1505 !is_dirty(cache, lookup_result.cblock)) { 1506 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1507 inc_and_issue(cache, bio, new_ocell); 1508 1509 } else { 1510 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 1511 inc_and_issue(cache, bio, new_ocell); 1512 } 1513 } 1514 1515 break; 1516 1517 case POLICY_MISS: 1518 inc_miss_counter(cache, bio); 1519 remap_to_origin_clear_discard(cache, bio, block); 1520 inc_and_issue(cache, bio, new_ocell); 1521 break; 1522 1523 case POLICY_NEW: 1524 atomic_inc(&cache->stats.promotion); 1525 promote(cache, structs, block, lookup_result.cblock, new_ocell); 1526 release_cell = false; 1527 break; 1528 1529 case POLICY_REPLACE: 1530 cell_prealloc = prealloc_get_cell(structs); 1531 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc, 1532 (cell_free_fn) prealloc_put_cell, 1533 structs, &old_ocell); 1534 if (r > 0) { 1535 /* 1536 * We have to be careful to avoid lock inversion of 1537 * the cells. So we back off, and wait for the 1538 * old_ocell to become free. 1539 */ 1540 policy_force_mapping(cache->policy, block, 1541 lookup_result.old_oblock); 1542 atomic_inc(&cache->stats.cache_cell_clash); 1543 break; 1544 } 1545 atomic_inc(&cache->stats.demotion); 1546 atomic_inc(&cache->stats.promotion); 1547 1548 demote_then_promote(cache, structs, lookup_result.old_oblock, 1549 block, lookup_result.cblock, 1550 old_ocell, new_ocell); 1551 release_cell = false; 1552 break; 1553 1554 default: 1555 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__, 1556 (unsigned) lookup_result.op); 1557 bio_io_error(bio); 1558 } 1559 1560 if (release_cell) 1561 cell_defer(cache, new_ocell, false); 1562 } 1563 1564 static int need_commit_due_to_time(struct cache *cache) 1565 { 1566 return !time_in_range(jiffies, cache->last_commit_jiffies, 1567 cache->last_commit_jiffies + COMMIT_PERIOD); 1568 } 1569 1570 static int commit_if_needed(struct cache *cache) 1571 { 1572 int r = 0; 1573 1574 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1575 dm_cache_changed_this_transaction(cache->cmd)) { 1576 atomic_inc(&cache->stats.commit_count); 1577 cache->commit_requested = false; 1578 r = dm_cache_commit(cache->cmd, false); 1579 cache->last_commit_jiffies = jiffies; 1580 } 1581 1582 return r; 1583 } 1584 1585 static void process_deferred_bios(struct cache *cache) 1586 { 1587 unsigned long flags; 1588 struct bio_list bios; 1589 struct bio *bio; 1590 struct prealloc structs; 1591 1592 memset(&structs, 0, sizeof(structs)); 1593 bio_list_init(&bios); 1594 1595 spin_lock_irqsave(&cache->lock, flags); 1596 bio_list_merge(&bios, &cache->deferred_bios); 1597 bio_list_init(&cache->deferred_bios); 1598 spin_unlock_irqrestore(&cache->lock, flags); 1599 1600 while (!bio_list_empty(&bios)) { 1601 /* 1602 * If we've got no free migration structs, and processing 1603 * this bio might require one, we pause until there are some 1604 * prepared mappings to process. 
1605 */ 1606 if (prealloc_data_structs(cache, &structs)) { 1607 spin_lock_irqsave(&cache->lock, flags); 1608 bio_list_merge(&cache->deferred_bios, &bios); 1609 spin_unlock_irqrestore(&cache->lock, flags); 1610 break; 1611 } 1612 1613 bio = bio_list_pop(&bios); 1614 1615 if (bio->bi_rw & REQ_FLUSH) 1616 process_flush_bio(cache, bio); 1617 else if (bio->bi_rw & REQ_DISCARD) 1618 process_discard_bio(cache, &structs, bio); 1619 else 1620 process_bio(cache, &structs, bio); 1621 } 1622 1623 prealloc_free_structs(cache, &structs); 1624 } 1625 1626 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 1627 { 1628 unsigned long flags; 1629 struct bio_list bios; 1630 struct bio *bio; 1631 1632 bio_list_init(&bios); 1633 1634 spin_lock_irqsave(&cache->lock, flags); 1635 bio_list_merge(&bios, &cache->deferred_flush_bios); 1636 bio_list_init(&cache->deferred_flush_bios); 1637 spin_unlock_irqrestore(&cache->lock, flags); 1638 1639 /* 1640 * These bios have already been through inc_ds() 1641 */ 1642 while ((bio = bio_list_pop(&bios))) 1643 submit_bios ? generic_make_request(bio) : bio_io_error(bio); 1644 } 1645 1646 static void process_deferred_writethrough_bios(struct cache *cache) 1647 { 1648 unsigned long flags; 1649 struct bio_list bios; 1650 struct bio *bio; 1651 1652 bio_list_init(&bios); 1653 1654 spin_lock_irqsave(&cache->lock, flags); 1655 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 1656 bio_list_init(&cache->deferred_writethrough_bios); 1657 spin_unlock_irqrestore(&cache->lock, flags); 1658 1659 /* 1660 * These bios have already been through inc_ds() 1661 */ 1662 while ((bio = bio_list_pop(&bios))) 1663 generic_make_request(bio); 1664 } 1665 1666 static void writeback_some_dirty_blocks(struct cache *cache) 1667 { 1668 int r = 0; 1669 dm_oblock_t oblock; 1670 dm_cblock_t cblock; 1671 struct prealloc structs; 1672 struct dm_bio_prison_cell *old_ocell; 1673 1674 memset(&structs, 0, sizeof(structs)); 1675 1676 while (spare_migration_bandwidth(cache)) { 1677 if (prealloc_data_structs(cache, &structs)) 1678 break; 1679 1680 r = policy_writeback_work(cache->policy, &oblock, &cblock); 1681 if (r) 1682 break; 1683 1684 r = get_cell(cache, oblock, &structs, &old_ocell); 1685 if (r) { 1686 policy_set_dirty(cache->policy, oblock); 1687 break; 1688 } 1689 1690 writeback(cache, &structs, oblock, cblock, old_ocell); 1691 } 1692 1693 prealloc_free_structs(cache, &structs); 1694 } 1695 1696 /*---------------------------------------------------------------- 1697 * Invalidations. 1698 * Dropping something from the cache *without* writing back. 
1699 *--------------------------------------------------------------*/ 1700 1701 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 1702 { 1703 int r = 0; 1704 uint64_t begin = from_cblock(req->cblocks->begin); 1705 uint64_t end = from_cblock(req->cblocks->end); 1706 1707 while (begin != end) { 1708 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 1709 if (!r) { 1710 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 1711 if (r) 1712 break; 1713 1714 } else if (r == -ENODATA) { 1715 /* harmless, already unmapped */ 1716 r = 0; 1717 1718 } else { 1719 DMERR("policy_remove_cblock failed"); 1720 break; 1721 } 1722 1723 begin++; 1724 } 1725 1726 cache->commit_requested = true; 1727 1728 req->err = r; 1729 atomic_set(&req->complete, 1); 1730 1731 wake_up(&req->result_wait); 1732 } 1733 1734 static void process_invalidation_requests(struct cache *cache) 1735 { 1736 struct list_head list; 1737 struct invalidation_request *req, *tmp; 1738 1739 INIT_LIST_HEAD(&list); 1740 spin_lock(&cache->invalidation_lock); 1741 list_splice_init(&cache->invalidation_requests, &list); 1742 spin_unlock(&cache->invalidation_lock); 1743 1744 list_for_each_entry_safe (req, tmp, &list, list) 1745 process_invalidation_request(cache, req); 1746 } 1747 1748 /*---------------------------------------------------------------- 1749 * Main worker loop 1750 *--------------------------------------------------------------*/ 1751 static bool is_quiescing(struct cache *cache) 1752 { 1753 return atomic_read(&cache->quiescing); 1754 } 1755 1756 static void ack_quiescing(struct cache *cache) 1757 { 1758 if (is_quiescing(cache)) { 1759 atomic_inc(&cache->quiescing_ack); 1760 wake_up(&cache->quiescing_wait); 1761 } 1762 } 1763 1764 static void wait_for_quiescing_ack(struct cache *cache) 1765 { 1766 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 1767 } 1768 1769 static void start_quiescing(struct cache *cache) 1770 { 1771 atomic_inc(&cache->quiescing); 1772 wait_for_quiescing_ack(cache); 1773 } 1774 1775 static void stop_quiescing(struct cache *cache) 1776 { 1777 atomic_set(&cache->quiescing, 0); 1778 atomic_set(&cache->quiescing_ack, 0); 1779 } 1780 1781 static void wait_for_migrations(struct cache *cache) 1782 { 1783 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 1784 } 1785 1786 static void stop_worker(struct cache *cache) 1787 { 1788 cancel_delayed_work(&cache->waker); 1789 flush_workqueue(cache->wq); 1790 } 1791 1792 static void requeue_deferred_io(struct cache *cache) 1793 { 1794 struct bio *bio; 1795 struct bio_list bios; 1796 1797 bio_list_init(&bios); 1798 bio_list_merge(&bios, &cache->deferred_bios); 1799 bio_list_init(&cache->deferred_bios); 1800 1801 while ((bio = bio_list_pop(&bios))) 1802 bio_endio(bio, DM_ENDIO_REQUEUE); 1803 } 1804 1805 static int more_work(struct cache *cache) 1806 { 1807 if (is_quiescing(cache)) 1808 return !list_empty(&cache->quiesced_migrations) || 1809 !list_empty(&cache->completed_migrations) || 1810 !list_empty(&cache->need_commit_migrations); 1811 else 1812 return !bio_list_empty(&cache->deferred_bios) || 1813 !bio_list_empty(&cache->deferred_flush_bios) || 1814 !bio_list_empty(&cache->deferred_writethrough_bios) || 1815 !list_empty(&cache->quiesced_migrations) || 1816 !list_empty(&cache->completed_migrations) || 1817 !list_empty(&cache->need_commit_migrations) || 1818 cache->invalidate; 1819 } 1820 1821 static void do_worker(struct work_struct *ws) 1822 { 1823 struct cache 
*cache = container_of(ws, struct cache, worker); 1824 1825 do { 1826 if (!is_quiescing(cache)) { 1827 writeback_some_dirty_blocks(cache); 1828 process_deferred_writethrough_bios(cache); 1829 process_deferred_bios(cache); 1830 process_invalidation_requests(cache); 1831 } 1832 1833 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 1834 process_migrations(cache, &cache->completed_migrations, complete_migration); 1835 1836 if (commit_if_needed(cache)) { 1837 process_deferred_flush_bios(cache, false); 1838 process_migrations(cache, &cache->need_commit_migrations, migration_failure); 1839 1840 /* 1841 * FIXME: rollback metadata or just go into a 1842 * failure mode and error everything 1843 */ 1844 } else { 1845 process_deferred_flush_bios(cache, true); 1846 process_migrations(cache, &cache->need_commit_migrations, 1847 migration_success_post_commit); 1848 } 1849 1850 ack_quiescing(cache); 1851 1852 } while (more_work(cache)); 1853 } 1854 1855 /* 1856 * We want to commit periodically so that not too much 1857 * unwritten metadata builds up. 1858 */ 1859 static void do_waker(struct work_struct *ws) 1860 { 1861 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1862 policy_tick(cache->policy); 1863 wake_worker(cache); 1864 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1865 } 1866 1867 /*----------------------------------------------------------------*/ 1868 1869 static int is_congested(struct dm_dev *dev, int bdi_bits) 1870 { 1871 struct request_queue *q = bdev_get_queue(dev->bdev); 1872 return bdi_congested(&q->backing_dev_info, bdi_bits); 1873 } 1874 1875 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1876 { 1877 struct cache *cache = container_of(cb, struct cache, callbacks); 1878 1879 return is_congested(cache->origin_dev, bdi_bits) || 1880 is_congested(cache->cache_dev, bdi_bits); 1881 } 1882 1883 /*---------------------------------------------------------------- 1884 * Target methods 1885 *--------------------------------------------------------------*/ 1886 1887 /* 1888 * This function gets called on the error paths of the constructor, so we 1889 * have to cope with a partially initialised struct. 
1890 */ 1891 static void destroy(struct cache *cache) 1892 { 1893 unsigned i; 1894 1895 if (cache->migration_pool) 1896 mempool_destroy(cache->migration_pool); 1897 1898 if (cache->all_io_ds) 1899 dm_deferred_set_destroy(cache->all_io_ds); 1900 1901 if (cache->prison) 1902 dm_bio_prison_destroy(cache->prison); 1903 1904 if (cache->wq) 1905 destroy_workqueue(cache->wq); 1906 1907 if (cache->dirty_bitset) 1908 free_bitset(cache->dirty_bitset); 1909 1910 if (cache->discard_bitset) 1911 free_bitset(cache->discard_bitset); 1912 1913 if (cache->copier) 1914 dm_kcopyd_client_destroy(cache->copier); 1915 1916 if (cache->cmd) 1917 dm_cache_metadata_close(cache->cmd); 1918 1919 if (cache->metadata_dev) 1920 dm_put_device(cache->ti, cache->metadata_dev); 1921 1922 if (cache->origin_dev) 1923 dm_put_device(cache->ti, cache->origin_dev); 1924 1925 if (cache->cache_dev) 1926 dm_put_device(cache->ti, cache->cache_dev); 1927 1928 if (cache->policy) 1929 dm_cache_policy_destroy(cache->policy); 1930 1931 for (i = 0; i < cache->nr_ctr_args ; i++) 1932 kfree(cache->ctr_args[i]); 1933 kfree(cache->ctr_args); 1934 1935 kfree(cache); 1936 } 1937 1938 static void cache_dtr(struct dm_target *ti) 1939 { 1940 struct cache *cache = ti->private; 1941 1942 destroy(cache); 1943 } 1944 1945 static sector_t get_dev_size(struct dm_dev *dev) 1946 { 1947 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 1948 } 1949 1950 /*----------------------------------------------------------------*/ 1951 1952 /* 1953 * Construct a cache device mapping. 1954 * 1955 * cache <metadata dev> <cache dev> <origin dev> <block size> 1956 * <#feature args> [<feature arg>]* 1957 * <policy> <#policy args> [<policy arg>]* 1958 * 1959 * metadata dev : fast device holding the persistent metadata 1960 * cache dev : fast device holding cached data blocks 1961 * origin dev : slow device holding original data blocks 1962 * block size : cache unit size in sectors 1963 * 1964 * #feature args : number of feature arguments passed 1965 * feature args : writethrough. (The default is writeback.) 1966 * 1967 * policy : the replacement policy to use 1968 * #policy args : an even number of policy arguments corresponding 1969 * to key/value pairs passed to the policy 1970 * policy args : key/value pairs passed to the policy 1971 * E.g. 'sequential_threshold 1024' 1972 * See cache-policies.txt for details. 1973 * 1974 * Optional feature arguments are: 1975 * writethrough : write through caching that prohibits cache block 1976 * content from being different from origin block content. 1977 * Without this argument, the default behaviour is to write 1978 * back cache block contents later for performance reasons, 1979 * so they may differ from the corresponding origin blocks. 
1980 */ 1981 struct cache_args { 1982 struct dm_target *ti; 1983 1984 struct dm_dev *metadata_dev; 1985 1986 struct dm_dev *cache_dev; 1987 sector_t cache_sectors; 1988 1989 struct dm_dev *origin_dev; 1990 sector_t origin_sectors; 1991 1992 uint32_t block_size; 1993 1994 const char *policy_name; 1995 int policy_argc; 1996 const char **policy_argv; 1997 1998 struct cache_features features; 1999 }; 2000 2001 static void destroy_cache_args(struct cache_args *ca) 2002 { 2003 if (ca->metadata_dev) 2004 dm_put_device(ca->ti, ca->metadata_dev); 2005 2006 if (ca->cache_dev) 2007 dm_put_device(ca->ti, ca->cache_dev); 2008 2009 if (ca->origin_dev) 2010 dm_put_device(ca->ti, ca->origin_dev); 2011 2012 kfree(ca); 2013 } 2014 2015 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2016 { 2017 if (!as->argc) { 2018 *error = "Insufficient args"; 2019 return false; 2020 } 2021 2022 return true; 2023 } 2024 2025 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2026 char **error) 2027 { 2028 int r; 2029 sector_t metadata_dev_size; 2030 char b[BDEVNAME_SIZE]; 2031 2032 if (!at_least_one_arg(as, error)) 2033 return -EINVAL; 2034 2035 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2036 &ca->metadata_dev); 2037 if (r) { 2038 *error = "Error opening metadata device"; 2039 return r; 2040 } 2041 2042 metadata_dev_size = get_dev_size(ca->metadata_dev); 2043 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2044 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2045 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2046 2047 return 0; 2048 } 2049 2050 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2051 char **error) 2052 { 2053 int r; 2054 2055 if (!at_least_one_arg(as, error)) 2056 return -EINVAL; 2057 2058 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2059 &ca->cache_dev); 2060 if (r) { 2061 *error = "Error opening cache device"; 2062 return r; 2063 } 2064 ca->cache_sectors = get_dev_size(ca->cache_dev); 2065 2066 return 0; 2067 } 2068 2069 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2070 char **error) 2071 { 2072 int r; 2073 2074 if (!at_least_one_arg(as, error)) 2075 return -EINVAL; 2076 2077 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2078 &ca->origin_dev); 2079 if (r) { 2080 *error = "Error opening origin device"; 2081 return r; 2082 } 2083 2084 ca->origin_sectors = get_dev_size(ca->origin_dev); 2085 if (ca->ti->len > ca->origin_sectors) { 2086 *error = "Device size larger than cached device"; 2087 return -EINVAL; 2088 } 2089 2090 return 0; 2091 } 2092 2093 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2094 char **error) 2095 { 2096 unsigned long block_size; 2097 2098 if (!at_least_one_arg(as, error)) 2099 return -EINVAL; 2100 2101 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2102 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2103 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2104 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2105 *error = "Invalid data block size"; 2106 return -EINVAL; 2107 } 2108 2109 if (block_size > ca->cache_sectors) { 2110 *error = "Data block size is larger than the cache device"; 2111 return -EINVAL; 2112 } 2113 2114 ca->block_size = block_size; 2115 2116 return 0; 2117 } 2118 2119 static void init_features(struct cache_features *cf) 2120 { 2121 cf->mode = CM_WRITE; 2122 cf->io_mode = 
static void init_features(struct cache_features *cf)
{
	cf->mode = CM_WRITE;
	cf->io_mode = CM_IO_WRITEBACK;
}

static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
			  char **error)
{
	static struct dm_arg _args[] = {
		{0, 1, "Invalid number of cache feature arguments"},
	};

	int r;
	unsigned argc;
	const char *arg;
	struct cache_features *cf = &ca->features;

	init_features(cf);

	r = dm_read_arg_group(_args, as, &argc, error);
	if (r)
		return -EINVAL;

	while (argc--) {
		arg = dm_shift_arg(as);

		if (!strcasecmp(arg, "writeback"))
			cf->io_mode = CM_IO_WRITEBACK;

		else if (!strcasecmp(arg, "writethrough"))
			cf->io_mode = CM_IO_WRITETHROUGH;

		else if (!strcasecmp(arg, "passthrough"))
			cf->io_mode = CM_IO_PASSTHROUGH;

		else {
			*error = "Unrecognised cache feature requested";
			return -EINVAL;
		}
	}

	return 0;
}

static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
			char **error)
{
	static struct dm_arg _args[] = {
		{0, 1024, "Invalid number of policy arguments"},
	};

	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	ca->policy_name = dm_shift_arg(as);

	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
	if (r)
		return -EINVAL;

	ca->policy_argv = (const char **)as->argv;
	dm_consume_args(as, ca->policy_argc);

	return 0;
}

static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
			    char **error)
{
	int r;
	struct dm_arg_set as;

	as.argc = argc;
	as.argv = argv;

	r = parse_metadata_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_cache_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_origin_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_block_size(ca, &as, error);
	if (r)
		return r;

	r = parse_features(ca, &as, error);
	if (r)
		return r;

	r = parse_policy(ca, &as, error);
	if (r)
		return r;

	return 0;
}

/*----------------------------------------------------------------*/

static struct kmem_cache *migration_cache;

#define NOT_CORE_OPTION 1

static int process_config_option(struct cache *cache, const char *key, const char *value)
{
	unsigned long tmp;

	if (!strcasecmp(key, "migration_threshold")) {
		if (kstrtoul(value, 10, &tmp))
			return -EINVAL;

		cache->migration_threshold = tmp;
		return 0;
	}

	return NOT_CORE_OPTION;
}

static int set_config_value(struct cache *cache, const char *key, const char *value)
{
	int r = process_config_option(cache, key, value);

	if (r == NOT_CORE_OPTION)
		r = policy_set_config_value(cache->policy, key, value);

	if (r)
		DMWARN("bad config value for %s: %s", key, value);

	return r;
}

static int set_config_values(struct cache *cache, int argc, const char **argv)
{
	int r = 0;

	if (argc & 1) {
		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
		return -EINVAL;
	}

	while (argc) {
		r = set_config_value(cache, argv[0], argv[1]);
		if (r)
			break;

		argc -= 2;
		argv += 2;
	}

	return r;
}
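/*
 * For example (the key names here are only illustrative and depend on the
 * chosen policy), the policy arguments
 * "sequential_threshold 1024 random_threshold 8" result in the calls
 *
 *   set_config_value(cache, "sequential_threshold", "1024");
 *   set_config_value(cache, "random_threshold", "8");
 *
 * migration_threshold is the one key handled by the core target; anything
 * else is passed through to the policy.
 */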
static int create_cache_policy(struct cache *cache, struct cache_args *ca,
			       char **error)
{
	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
							   cache->cache_size,
							   cache->origin_sectors,
							   cache->sectors_per_block);
	if (IS_ERR(p)) {
		*error = "Error creating cache's policy";
		return PTR_ERR(p);
	}
	cache->policy = p;

	return 0;
}

/*
 * We want the discard block size to be at least the size of the cache
 * block size and have no more than 2^14 discard blocks across the origin.
 */
#define MAX_DISCARD_BLOCKS (1 << 14)

static bool too_many_discard_blocks(sector_t discard_block_size,
				    sector_t origin_size)
{
	(void) sector_div(origin_size, discard_block_size);

	return origin_size > MAX_DISCARD_BLOCKS;
}

static sector_t calculate_discard_block_size(sector_t cache_block_size,
					     sector_t origin_size)
{
	sector_t discard_block_size = cache_block_size;

	if (origin_size)
		while (too_many_discard_blocks(discard_block_size, origin_size))
			discard_block_size *= 2;

	return discard_block_size;
}
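/*
 * A worked example of the sizing above (figures are illustrative only):
 * with 512 sector cache blocks and a 1TB origin (2147483648 sectors),
 * 2147483648 / 512 is well over 2^14, so the discard block size doubles
 * until it reaches 131072 sectors (64MB), at which point the origin is
 * covered by exactly 2^14 discard blocks and the loop stops.
 */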
static void set_cache_size(struct cache *cache, dm_cblock_t size)
{
	dm_block_t nr_blocks = from_cblock(size);

	if (nr_blocks > (1 << 20) && cache->cache_size != size)
		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
			     "Please consider increasing the cache block size to reduce the overall cache block count.",
			     (unsigned long long) nr_blocks);

	cache->cache_size = size;
}

#define DEFAULT_MIGRATION_THRESHOLD 2048

static int cache_create(struct cache_args *ca, struct cache **result)
{
	int r = 0;
	char **error = &ca->ti->error;
	struct cache *cache;
	struct dm_target *ti = ca->ti;
	dm_block_t origin_blocks;
	struct dm_cache_metadata *cmd;
	bool may_format = ca->features.mode == CM_WRITE;

	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
	if (!cache)
		return -ENOMEM;

	cache->ti = ca->ti;
	ti->private = cache;
	ti->num_flush_bios = 2;
	ti->flush_supported = true;

	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->discard_zeroes_data_unsupported = true;
	ti->split_discard_bios = false;

	cache->features = ca->features;
	ti->per_bio_data_size = get_per_bio_data_size(cache);

	cache->callbacks.congested_fn = cache_is_congested;
	dm_table_add_target_callbacks(ti->table, &cache->callbacks);

	cache->metadata_dev = ca->metadata_dev;
	cache->origin_dev = ca->origin_dev;
	cache->cache_dev = ca->cache_dev;

	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;

	/* FIXME: factor out this whole section */
	origin_blocks = cache->origin_sectors = ca->origin_sectors;
	origin_blocks = block_div(origin_blocks, ca->block_size);
	cache->origin_blocks = to_oblock(origin_blocks);

	cache->sectors_per_block = ca->block_size;
	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
		r = -EINVAL;
		goto bad;
	}

	if (ca->block_size & (ca->block_size - 1)) {
		dm_block_t cache_size = ca->cache_sectors;

		cache->sectors_per_block_shift = -1;
		cache_size = block_div(cache_size, ca->block_size);
		set_cache_size(cache, to_cblock(cache_size));
	} else {
		cache->sectors_per_block_shift = __ffs(ca->block_size);
		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
	}

	r = create_cache_policy(cache, ca, error);
	if (r)
		goto bad;

	cache->policy_nr_args = ca->policy_argc;
	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;

	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
	if (r) {
		*error = "Error setting cache policy's config values";
		goto bad;
	}

	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				     ca->block_size, may_format,
				     dm_cache_policy_get_hint_size(cache->policy));
	if (IS_ERR(cmd)) {
		*error = "Error creating metadata object";
		r = PTR_ERR(cmd);
		goto bad;
	}
	cache->cmd = cmd;

	if (passthrough_mode(&cache->features)) {
		bool all_clean;

		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
		if (r) {
			*error = "dm_cache_metadata_all_clean() failed";
			goto bad;
		}

		if (!all_clean) {
			*error = "Cannot enter passthrough mode unless all blocks are clean";
			r = -EINVAL;
			goto bad;
		}
	}

	spin_lock_init(&cache->lock);
	bio_list_init(&cache->deferred_bios);
	bio_list_init(&cache->deferred_flush_bios);
	bio_list_init(&cache->deferred_writethrough_bios);
	INIT_LIST_HEAD(&cache->quiesced_migrations);
	INIT_LIST_HEAD(&cache->completed_migrations);
	INIT_LIST_HEAD(&cache->need_commit_migrations);
	atomic_set(&cache->nr_allocated_migrations, 0);
	atomic_set(&cache->nr_io_migrations, 0);
	init_waitqueue_head(&cache->migration_wait);

	init_waitqueue_head(&cache->quiescing_wait);
	atomic_set(&cache->quiescing, 0);
	atomic_set(&cache->quiescing_ack, 0);

	r = -ENOMEM;
	atomic_set(&cache->nr_dirty, 0);
	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
	if (!cache->dirty_bitset) {
		*error = "could not allocate dirty bitset";
		goto bad;
	}
	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));

	cache->discard_block_size =
		calculate_discard_block_size(cache->sectors_per_block,
					     cache->origin_sectors);
	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
							      cache->discard_block_size));
	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
	if (!cache->discard_bitset) {
		*error = "could not allocate discard bitset";
		goto bad;
	}
	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));

	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(cache->copier)) {
		*error = "could not create kcopyd client";
		r = PTR_ERR(cache->copier);
		goto bad;
	}

	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!cache->wq) {
		*error = "could not create workqueue for metadata object";
		goto bad;
	}
	INIT_WORK(&cache->worker, do_worker);
	INIT_DELAYED_WORK(&cache->waker, do_waker);
	cache->last_commit_jiffies = jiffies;

	cache->prison = dm_bio_prison_create();
	if (!cache->prison) {
		*error = "could not create bio prison";
		goto bad;
	}

	cache->all_io_ds = dm_deferred_set_create();
	if (!cache->all_io_ds) {
		*error = "could not create all_io deferred set";
		goto bad;
	}

	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
							 migration_cache);
	if (!cache->migration_pool) {
		*error = "Error creating cache's migration mempool";
		goto bad;
	}

	cache->need_tick_bio = true;
	cache->sized = false;
	cache->invalidate = false;
	cache->commit_requested = false;
	cache->loaded_mappings = false;
	cache->loaded_discards = false;

	load_stats(cache);

	atomic_set(&cache->stats.demotion, 0);
	atomic_set(&cache->stats.promotion, 0);
	atomic_set(&cache->stats.copies_avoided, 0);
	atomic_set(&cache->stats.cache_cell_clash, 0);
	atomic_set(&cache->stats.commit_count, 0);
	atomic_set(&cache->stats.discard_count, 0);

	spin_lock_init(&cache->invalidation_lock);
	INIT_LIST_HEAD(&cache->invalidation_requests);

	*result = cache;
	return 0;

bad:
	destroy(cache);
	return r;
}

static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
{
	unsigned i;
	const char **copy;

	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
	if (!copy)
		return -ENOMEM;
	for (i = 0; i < argc; i++) {
		copy[i] = kstrdup(argv[i], GFP_KERNEL);
		if (!copy[i]) {
			while (i--)
				kfree(copy[i]);
			kfree(copy);
			return -ENOMEM;
		}
	}

	cache->nr_ctr_args = argc;
	cache->ctr_args = copy;

	return 0;
}

static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct cache_args *ca;
	struct cache *cache = NULL;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca) {
		ti->error = "Error allocating memory for cache";
		return -ENOMEM;
	}
	ca->ti = ti;

	r = parse_cache_args(ca, argc, argv, &ti->error);
	if (r)
		goto out;

	r = cache_create(ca, &cache);
	if (r)
		goto out;

	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
	if (r) {
		destroy(cache);
		goto out;
	}

	ti->private = cache;

out:
	destroy_cache_args(ca);
	return r;
}

static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
{
	int r;
	dm_oblock_t block = get_bio_block(cache, bio);
	size_t pb_data_size = get_per_bio_data_size(cache);
	bool can_migrate = false;
	bool discarded_block;
	struct policy_result lookup_result;
	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);

	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
		/*
		 * This can only occur if the io goes to a partial block at
		 * the end of the origin device.  We don't cache these.
		 * Just remap to the origin and carry on.
		 */
		remap_to_origin(cache, bio);
		return DM_MAPIO_REMAPPED;
	}

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * Check to see if that block is currently migrating.
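	 *
	 * bio_detain() below attempts to lock the block in the bio prison: a
	 * negative return means no cell could be obtained and the bio is
	 * simply deferred, while a positive return means the block is already
	 * locked and the bio has been queued on the existing cell, to be
	 * reissued once that cell is released.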
	 */
	*cell = alloc_prison_cell(cache);
	if (!*cell) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = bio_detain(cache, block, bio, *cell,
		       (cell_free_fn) free_prison_cell,
		       cache, cell);
	if (r) {
		if (r < 0)
			defer_bio(cache, bio);

		return DM_MAPIO_SUBMITTED;
	}

	discarded_block = is_discarded_oblock(cache, block);

	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
		       bio, &lookup_result);
	if (r == -EWOULDBLOCK) {
		cell_defer(cache, *cell, true);
		return DM_MAPIO_SUBMITTED;

	} else if (r) {
		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
		cell_defer(cache, *cell, false);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = DM_MAPIO_REMAPPED;
	switch (lookup_result.op) {
	case POLICY_HIT:
		if (passthrough_mode(&cache->features)) {
			if (bio_data_dir(bio) == WRITE) {
				/*
				 * We need to invalidate this block, so
				 * defer for the worker thread.
				 */
				cell_defer(cache, *cell, true);
				r = DM_MAPIO_SUBMITTED;

			} else {
				inc_miss_counter(cache, bio);
				remap_to_origin_clear_discard(cache, bio, block);
			}

		} else {
			inc_hit_counter(cache, bio);
			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
			    !is_dirty(cache, lookup_result.cblock))
				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
			else
				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
		}
		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		if (pb->req_nr != 0) {
			/*
			 * This is a duplicate writethrough io that is no
			 * longer needed because the block has been demoted.
			 */
			bio_endio(bio, 0);
			cell_defer(cache, *cell, false);
			r = DM_MAPIO_SUBMITTED;

		} else
			remap_to_origin_clear_discard(cache, bio, block);

		break;

	default:
		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		cell_defer(cache, *cell, false);
		bio_io_error(bio);
		r = DM_MAPIO_SUBMITTED;
	}

	return r;
}

static int cache_map(struct dm_target *ti, struct bio *bio)
{
	int r;
	struct dm_bio_prison_cell *cell = NULL;
	struct cache *cache = ti->private;

	r = __cache_map(cache, bio, &cell);
	if (r == DM_MAPIO_REMAPPED && cell) {
		inc_ds(cache, bio, cell);
		cell_defer(cache, cell, false);
	}

	return r;
}

static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
	struct cache *cache = ti->private;
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (pb->tick) {
		policy_tick(cache->policy);

		spin_lock_irqsave(&cache->lock, flags);
		cache->need_tick_bio = true;
		spin_unlock_irqrestore(&cache->lock, flags);
	}

	check_for_quiesced_migrations(cache, pb);

	return 0;
}

static int write_dirty_bitset(struct cache *cache)
{
	unsigned i, r;

	for (i = 0; i < from_cblock(cache->cache_size); i++) {
		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				       is_dirty(cache, to_cblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int write_discard_bitset(struct cache *cache)
{
	unsigned i, r;

	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
					   cache->discard_nr_blocks);
	if (r) {
		DMERR("could not resize on-disk discard bitset");
		return r;
	}

	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
					 is_discarded(cache, to_dblock(i)));
		if (r)
			return r;
	}

	return 0;
}

/*
 * returns true on success
 */
static bool sync_metadata(struct cache *cache)
{
	int r1, r2, r3, r4;

	r1 = write_dirty_bitset(cache);
	if (r1)
		DMERR("could not write dirty bitset");

	r2 = write_discard_bitset(cache);
	if (r2)
		DMERR("could not write discard bitset");

	save_stats(cache);

	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
	if (r3)
		DMERR("could not write hints");

	/*
	 * If writing the above metadata failed, we still commit, but don't
	 * set the clean shutdown flag.  This will effectively force every
	 * dirty bit to be set on reload.
	 */
	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
	if (r4)
		DMERR("could not write cache metadata.  Data loss may occur.");
	return !r1 && !r2 && !r3 && !r4;
}

static void cache_postsuspend(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	start_quiescing(cache);
	wait_for_migrations(cache);
	stop_worker(cache);
	requeue_deferred_io(cache);
	stop_quiescing(cache);

	(void) sync_metadata(cache);
}

static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
			bool dirty, uint32_t hint, bool hint_valid)
{
	int r;
	struct cache *cache = context;

	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
	if (r)
		return r;

	if (dirty)
		set_dirty(cache, oblock, cblock);
	else
		clear_dirty(cache, oblock, cblock);

	return 0;
}

/*
 * The discard block size in the on disk metadata is not
 * necessarily the same as we're currently using.  So we have to
 * be careful to only set the discarded attribute if we know it
 * covers a complete block of the new size.
 */
struct discard_load_info {
	struct cache *cache;

	/*
	 * These blocks are sized using the on disk dblock size, rather
	 * than the current one.
	 */
	dm_block_t block_size;
	dm_block_t discard_begin, discard_end;
};

static void discard_load_info_init(struct cache *cache,
				   struct discard_load_info *li)
{
	li->cache = cache;
	li->discard_begin = li->discard_end = 0;
}

static void set_discard_range(struct discard_load_info *li)
{
	sector_t b, e;

	if (li->discard_begin == li->discard_end)
		return;

	/*
	 * Convert to sectors.
	 */
	b = li->discard_begin * li->block_size;
	e = li->discard_end * li->block_size;

	/*
	 * Then convert back to the current dblock size.
	 */
	b = dm_sector_div_up(b, li->cache->discard_block_size);
	sector_div(e, li->cache->discard_block_size);

	/*
	 * The origin may have shrunk, so we need to check we're still in
	 * bounds.
	 */
	if (e > from_dblock(li->cache->discard_nr_blocks))
		e = from_dblock(li->cache->discard_nr_blocks);

	for (; b < e; b++)
		set_discard(li->cache, to_dblock(b));
}

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct discard_load_info *li = context;

	li->block_size = discard_block_size;

	if (discard) {
		if (from_dblock(dblock) == li->discard_end)
			/*
			 * We're already in a discard range, just extend it.
			 */
			li->discard_end = li->discard_end + 1ULL;

		else {
			/*
			 * Emit the old range and start a new one.
			 */
			set_discard_range(li);
			li->discard_begin = from_dblock(dblock);
			li->discard_end = li->discard_begin + 1ULL;
		}
	} else {
		set_discard_range(li);
		li->discard_begin = li->discard_end = 0;
	}

	return 0;
}

static dm_cblock_t get_cache_dev_size(struct cache *cache)
{
	sector_t size = get_dev_size(cache->cache_dev);
	(void) sector_div(size, cache->sectors_per_block);
	return to_cblock(size);
}

static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
	if (from_cblock(new_size) > from_cblock(cache->cache_size))
		return true;

	/*
	 * We can't drop a dirty block when shrinking the cache.
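	 *
	 * Every cache block that the shrink would drop, i.e. new_size up to
	 * the current cache_size - 1, is checked below and the resize is
	 * refused if any of them is still dirty.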
	 */
	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
		if (is_dirty(cache, new_size)) {
			DMERR("unable to shrink cache; cache block %llu is dirty",
			      (unsigned long long) from_cblock(new_size));
			return false;
		}
		new_size = to_cblock(from_cblock(new_size) + 1);
	}

	return true;
}

static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
{
	int r;

	r = dm_cache_resize(cache->cmd, new_size);
	if (r) {
		DMERR("could not resize cache metadata");
		return r;
	}

	set_cache_size(cache, new_size);

	return 0;
}

static int cache_preresume(struct dm_target *ti)
{
	int r = 0;
	struct cache *cache = ti->private;
	dm_cblock_t csize = get_cache_dev_size(cache);

	/*
	 * Check to see if the cache has resized.
	 */
	if (!cache->sized) {
		r = resize_cache_dev(cache, csize);
		if (r)
			return r;

		cache->sized = true;

	} else if (csize != cache->cache_size) {
		if (!can_resize(cache, csize))
			return -EINVAL;

		r = resize_cache_dev(cache, csize);
		if (r)
			return r;
	}

	if (!cache->loaded_mappings) {
		r = dm_cache_load_mappings(cache->cmd, cache->policy,
					   load_mapping, cache);
		if (r) {
			DMERR("could not load cache mappings");
			return r;
		}

		cache->loaded_mappings = true;
	}

	if (!cache->loaded_discards) {
		struct discard_load_info li;

		/*
		 * The discard bitset could have been resized, or the
		 * discard block size changed.  To be safe we start by
		 * setting every dblock to not discarded.
		 */
		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));

		discard_load_info_init(cache, &li);
		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
		if (r) {
			DMERR("could not load origin discards");
			return r;
		}
		set_discard_range(&li);

		cache->loaded_discards = true;
	}

	return r;
}

static void cache_resume(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	cache->need_tick_bio = true;
	do_waker(&cache->waker.work);
}

/*
 * Status format:
 *
 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 * <cache block size> <#used cache blocks>/<#total cache blocks>
 * <#read hits> <#read misses> <#write hits> <#write misses>
 * <#demotions> <#promotions> <#dirty>
 * <#features> <features>*
 * <#core args> <core args>
 * <policy name> <#policy args> <policy args>*
 */
static void cache_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	int r = 0;
	unsigned i;
	ssize_t sz = 0;
	dm_block_t nr_free_blocks_metadata = 0;
	dm_block_t nr_blocks_metadata = 0;
	char buf[BDEVNAME_SIZE];
	struct cache *cache = ti->private;
	dm_cblock_t residency;

	switch (type) {
	case STATUSTYPE_INFO:
		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
			r = dm_cache_commit(cache->cmd, false);
			if (r)
				DMERR("could not commit metadata for accurate status");
		}

		r = dm_cache_get_free_metadata_block_count(cache->cmd,
							   &nr_free_blocks_metadata);
		if (r) {
			DMERR("could not get metadata free block count");
			goto err;
		}

		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
		if (r) {
			DMERR("could not get metadata device size");
			goto err;
		}

		residency = policy_residency(cache->policy);

		DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       cache->sectors_per_block,
		       (unsigned long long) from_cblock(residency),
		       (unsigned long long) from_cblock(cache->cache_size),
		       (unsigned) atomic_read(&cache->stats.read_hit),
		       (unsigned) atomic_read(&cache->stats.read_miss),
		       (unsigned) atomic_read(&cache->stats.write_hit),
		       (unsigned) atomic_read(&cache->stats.write_miss),
		       (unsigned) atomic_read(&cache->stats.demotion),
		       (unsigned) atomic_read(&cache->stats.promotion),
		       (unsigned long) atomic_read(&cache->nr_dirty));

		if (writethrough_mode(&cache->features))
			DMEMIT("1 writethrough ");

		else if (passthrough_mode(&cache->features))
			DMEMIT("1 passthrough ");

		else if (writeback_mode(&cache->features))
			DMEMIT("1 writeback ");

		else {
			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
			goto err;
		}

		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);

		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
		if (sz < maxlen) {
			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
			if (r)
				DMERR("policy_emit_config_values returned %d", r);
		}

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < cache->nr_ctr_args - 1; i++)
			DMEMIT(" %s", cache->ctr_args[i]);
		if (cache->nr_ctr_args)
			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
	}

	return;

err:
	DMEMIT("Error");
}

/*
 * A cache block range can take two forms:
 *
 * i) A single cblock, eg. '3456'
 * ii) A begin and end cblock with a dash between, eg. 123-234
 */
static int parse_cblock_range(struct cache *cache, const char *str,
			      struct cblock_range *result)
{
	char dummy;
	uint64_t b, e;
	int r;

	/*
	 * Try and parse form (ii) first.
	 */
	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
	if (r < 0)
		return r;

	if (r == 2) {
		result->begin = to_cblock(b);
		result->end = to_cblock(e);
		return 0;
	}

	/*
	 * That didn't work, try form (i).
	 */
	r = sscanf(str, "%llu%c", &b, &dummy);
	if (r < 0)
		return r;

	if (r == 1) {
		result->begin = to_cblock(b);
		result->end = to_cblock(from_cblock(result->begin) + 1u);
		return 0;
	}

	DMERR("invalid cblock range '%s'", str);
	return -EINVAL;
}
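/*
 * For example (values are illustrative only): the string "3456" yields the
 * half-open range [3456, 3457), i.e. just cblock 3456, while "123-234"
 * yields [123, 234), covering cblocks 123 to 233 inclusive (the end value
 * is one-past-the-end, matching struct cblock_range).
 */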
static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
{
	uint64_t b = from_cblock(range->begin);
	uint64_t e = from_cblock(range->end);
	uint64_t n = from_cblock(cache->cache_size);

	if (b >= n) {
		DMERR("begin cblock out of range: %llu >= %llu", b, n);
		return -EINVAL;
	}

	if (e > n) {
		DMERR("end cblock out of range: %llu > %llu", e, n);
		return -EINVAL;
	}

	if (b >= e) {
		DMERR("invalid cblock range: %llu >= %llu", b, e);
		return -EINVAL;
	}

	return 0;
}

static int request_invalidation(struct cache *cache, struct cblock_range *range)
{
	struct invalidation_request req;

	INIT_LIST_HEAD(&req.list);
	req.cblocks = range;
	atomic_set(&req.complete, 0);
	req.err = 0;
	init_waitqueue_head(&req.result_wait);

	spin_lock(&cache->invalidation_lock);
	list_add(&req.list, &cache->invalidation_requests);
	spin_unlock(&cache->invalidation_lock);
	wake_worker(cache);

	wait_event(req.result_wait, atomic_read(&req.complete));
	return req.err;
}

static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
					      const char **cblock_ranges)
{
	int r = 0;
	unsigned i;
	struct cblock_range range;

	if (!passthrough_mode(&cache->features)) {
		DMERR("cache has to be in passthrough mode for invalidation");
		return -EPERM;
	}

	for (i = 0; i < count; i++) {
		r = parse_cblock_range(cache, cblock_ranges[i], &range);
		if (r)
			break;

		r = validate_cblock_range(cache, &range);
		if (r)
			break;

		/*
		 * Pass the begin and end cache blocks to the worker and wake it.
		 */
		r = request_invalidation(cache, &range);
		if (r)
			break;
	}

	return r;
}

/*
 * Supports
 *	"<key> <value>"
 * and
 *	"invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
 *
 * The key migration_threshold is supported by the cache target core.
 */
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct cache *cache = ti->private;

	if (!argc)
		return -EINVAL;

	if (!strcasecmp(argv[0], "invalidate_cblocks"))
		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);

	if (argc != 2)
		return -EINVAL;

	return set_config_value(cache, argv[0], argv[1]);
}
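/*
 * Example messages (the device name is hypothetical):
 *
 *   dmsetup message my-cache 0 migration_threshold 4096
 *   dmsetup message my-cache 0 invalidate_cblocks 2345 3000-3100
 *
 * The first adjusts the core migration_threshold setting; the second asks
 * for cblock 2345 and cblocks 3000-3099 to be invalidated (passthrough
 * mode only).
 */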
static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

/*
 * We assume I/O is going to the origin (which is the volume
 * more likely to have restrictions e.g. by being striped).
 * (Looking up the exact location of the data would be expensive
 * and could always be out of date by the time the bio is submitted.)
 */
static int cache_bvec_merge(struct dm_target *ti,
			    struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct cache *cache = ti->private;
	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = cache->origin_dev->bdev;
	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
					    cache->origin_sectors);
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;
	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with the
	 * cache's blocksize (io_opt is a factor) do not override them.
	 */
	if (io_opt_sectors < cache->sectors_per_block ||
	    do_div(io_opt_sectors, cache->sectors_per_block)) {
		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
	}
	set_discard_limits(cache, limits);
}

/*----------------------------------------------------------------*/

static struct target_type cache_target = {
	.name = "cache",
	.version = {1, 6, 0},
	.module = THIS_MODULE,
	.ctr = cache_ctr,
	.dtr = cache_dtr,
	.map = cache_map,
	.end_io = cache_end_io,
	.postsuspend = cache_postsuspend,
	.preresume = cache_preresume,
	.resume = cache_resume,
	.status = cache_status,
	.message = cache_message,
	.iterate_devices = cache_iterate_devices,
	.merge = cache_bvec_merge,
	.io_hints = cache_io_hints,
};

static int __init dm_cache_init(void)
{
	int r;

	r = dm_register_target(&cache_target);
	if (r) {
		DMERR("cache target registration failed: %d", r);
		return r;
	}

	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
	if (!migration_cache) {
		dm_unregister_target(&cache_target);
		return -ENOMEM;
	}

	return 0;
}

static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}

module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");