/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
        "A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

#define IOT_RESOLUTION 4

struct io_tracker {
        spinlock_t lock;

        /*
         * Sectors of in-flight IO.
         */
        sector_t in_flight;

        /*
         * The time, in jiffies, when this device became idle (if it is
         * indeed idle).
         */
        unsigned long idle_time;
        unsigned long last_update_time;
};

static void iot_init(struct io_tracker *iot)
{
        spin_lock_init(&iot->lock);
        iot->in_flight = 0ul;
        iot->idle_time = 0ul;
        iot->last_update_time = jiffies;
}

static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
        if (iot->in_flight)
                return false;

        return time_after(jiffies, iot->idle_time + jifs);
}

static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
        bool r;
        unsigned long flags;

        spin_lock_irqsave(&iot->lock, flags);
        r = __iot_idle_for(iot, jifs);
        spin_unlock_irqrestore(&iot->lock, flags);

        return r;
}

static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
        unsigned long flags;

        spin_lock_irqsave(&iot->lock, flags);
        iot->in_flight += len;
        spin_unlock_irqrestore(&iot->lock, flags);
}

static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
        iot->in_flight -= len;
        if (!iot->in_flight)
                iot->idle_time = jiffies;
}

static void iot_io_end(struct io_tracker *iot, sector_t len)
{
        unsigned long flags;

        spin_lock_irqsave(&iot->lock, flags);
        __iot_io_end(iot, len);
        spin_unlock_irqrestore(&iot->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *            either direction
 */

/*----------------------------------------------------------------*/

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function. We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
        bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
                        bio_end_io_t *bi_end_io, void *bi_private)
{
        h->bi_end_io = bio->bi_end_io;

        bio->bi_end_io = bi_end_io;
        bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
        bio->bi_end_io = h->bi_end_io;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
        CM_WRITE,               /* metadata may be changed */
        CM_READ_ONLY,           /* metadata may not be changed */
        CM_FAIL
};

enum cache_io_mode {
        /*
         * Data is written to cached blocks only. These blocks are marked
         * dirty. If you lose the cache device you will lose data.
         * Potential performance increase for both reads and writes.
         */
        CM_IO_WRITEBACK,

        /*
         * Data is written to both cache and origin. Blocks are never
         * dirty. Potential performance benefit for reads only.
         */
        CM_IO_WRITETHROUGH,

        /*
         * A degraded mode useful for various cache coherency situations
         * (eg, rolling back snapshots). Reads and writes always go to the
         * origin. If a write goes to a cached oblock, then the cache
         * block is invalidated.
         */
        CM_IO_PASSTHROUGH
};

struct cache_features {
        enum cache_metadata_mode mode;
        enum cache_io_mode io_mode;
        unsigned metadata_version;
};

struct cache_stats {
        atomic_t read_hit;
        atomic_t read_miss;
        atomic_t write_hit;
        atomic_t write_miss;
        atomic_t demotion;
        atomic_t promotion;
        atomic_t copies_avoided;
        atomic_t cache_cell_clash;
        atomic_t commit_count;
        atomic_t discard_count;
};

/*
 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
 * the one-past-the-end value.
 */
struct cblock_range {
        dm_cblock_t begin;
        dm_cblock_t end;
};

struct invalidation_request {
        struct list_head list;
        struct cblock_range *cblocks;

        atomic_t complete;
        int err;

        wait_queue_head_t result_wait;
};

struct cache {
        struct dm_target *ti;
        struct dm_target_callbacks callbacks;

        struct dm_cache_metadata *cmd;

        /*
         * Metadata is written to this device.
         */
        struct dm_dev *metadata_dev;

        /*
         * The slower of the two data devices. Typically a spindle.
         */
        struct dm_dev *origin_dev;

        /*
         * The faster of the two data devices. Typically an SSD.
         */
        struct dm_dev *cache_dev;

        /*
         * Size of the origin device in _complete_ blocks and native sectors.
         */
        dm_oblock_t origin_blocks;
        sector_t origin_sectors;

        /*
         * Size of the cache device in blocks.
         */
        dm_cblock_t cache_size;

        /*
         * Fields for converting from sectors to blocks.
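         *
         * For example, 64KiB cache blocks give sectors_per_block = 128 and
         * sectors_per_block_shift = 7. A block size that is not a power of
         * two leaves sectors_per_block_shift negative, forcing the slower
         * division path in the remapping helpers below.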
         */
        sector_t sectors_per_block;
        int sectors_per_block_shift;

        spinlock_t lock;
        struct list_head deferred_cells;
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct bio_list deferred_writethrough_bios;
        struct list_head quiesced_migrations;
        struct list_head completed_migrations;
        struct list_head need_commit_migrations;
        sector_t migration_threshold;
        wait_queue_head_t migration_wait;
        atomic_t nr_allocated_migrations;

        /*
         * The number of in flight migrations that are performing
         * background io. eg, promotion, writeback.
         */
        atomic_t nr_io_migrations;

        wait_queue_head_t quiescing_wait;
        atomic_t quiescing;
        atomic_t quiescing_ack;

        /*
         * cache_size entries, dirty if set
         */
        atomic_t nr_dirty;
        unsigned long *dirty_bitset;

        /*
         * origin_blocks entries, discarded if set.
         */
        dm_dblock_t discard_nr_blocks;
        unsigned long *discard_bitset;
        uint32_t discard_block_size; /* a power of 2 times sectors per block */

        /*
         * Rather than reconstructing the table line for the status we just
         * save it and regurgitate.
         */
        unsigned nr_ctr_args;
        const char **ctr_args;

        struct dm_kcopyd_client *copier;
        struct workqueue_struct *wq;
        struct work_struct worker;

        struct delayed_work waker;
        unsigned long last_commit_jiffies;

        struct dm_bio_prison *prison;
        struct dm_deferred_set *all_io_ds;

        mempool_t *migration_pool;

        struct dm_cache_policy *policy;
        unsigned policy_nr_args;

        bool need_tick_bio:1;
        bool sized:1;
        bool invalidate:1;
        bool commit_requested:1;
        bool loaded_mappings:1;
        bool loaded_discards:1;

        /*
         * Cache features such as write-through.
         */
        struct cache_features features;

        struct cache_stats stats;

        /*
         * Invalidation fields.
         */
        spinlock_t invalidation_lock;
        struct list_head invalidation_requests;

        struct io_tracker origin_tracker;
};

struct per_bio_data {
        bool tick:1;
        unsigned req_nr:2;
        struct dm_deferred_entry *all_io_entry;
        struct dm_hook_info hook_info;
        sector_t len;

        /*
         * writethrough fields. These MUST remain at the end of this
         * structure and the 'cache' member must be the first as it
         * is used to determine the offset of the writethrough fields.
         */
        struct cache *cache;
        dm_cblock_t cblock;
        struct dm_bio_details bio_details;
};

struct dm_cache_migration {
        struct list_head list;
        struct cache *cache;

        unsigned long start_jiffies;
        dm_oblock_t old_oblock;
        dm_oblock_t new_oblock;
        dm_cblock_t cblock;

        bool err:1;
        bool discard:1;
        bool writeback:1;
        bool demote:1;
        bool promote:1;
        bool requeue_holder:1;
        bool invalidate:1;

        struct dm_bio_prison_cell *old_ocell;
        struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations. We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
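 *
 * A single struct prealloc covers the worst case here: one migration plus
 * the two prison cells needed when a demotion is paired with a promotion.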
377 */ 378 struct prealloc { 379 struct dm_cache_migration *mg; 380 struct dm_bio_prison_cell *cell1; 381 struct dm_bio_prison_cell *cell2; 382 }; 383 384 static enum cache_metadata_mode get_cache_mode(struct cache *cache); 385 386 static void wake_worker(struct cache *cache) 387 { 388 queue_work(cache->wq, &cache->worker); 389 } 390 391 /*----------------------------------------------------------------*/ 392 393 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 394 { 395 /* FIXME: change to use a local slab. */ 396 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 397 } 398 399 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 400 { 401 dm_bio_prison_free_cell(cache->prison, cell); 402 } 403 404 static struct dm_cache_migration *alloc_migration(struct cache *cache) 405 { 406 struct dm_cache_migration *mg; 407 408 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 409 if (mg) { 410 mg->cache = cache; 411 atomic_inc(&mg->cache->nr_allocated_migrations); 412 } 413 414 return mg; 415 } 416 417 static void free_migration(struct dm_cache_migration *mg) 418 { 419 struct cache *cache = mg->cache; 420 421 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 422 wake_up(&cache->migration_wait); 423 424 mempool_free(mg, cache->migration_pool); 425 } 426 427 static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 428 { 429 if (!p->mg) { 430 p->mg = alloc_migration(cache); 431 if (!p->mg) 432 return -ENOMEM; 433 } 434 435 if (!p->cell1) { 436 p->cell1 = alloc_prison_cell(cache); 437 if (!p->cell1) 438 return -ENOMEM; 439 } 440 441 if (!p->cell2) { 442 p->cell2 = alloc_prison_cell(cache); 443 if (!p->cell2) 444 return -ENOMEM; 445 } 446 447 return 0; 448 } 449 450 static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 451 { 452 if (p->cell2) 453 free_prison_cell(cache, p->cell2); 454 455 if (p->cell1) 456 free_prison_cell(cache, p->cell1); 457 458 if (p->mg) 459 free_migration(p->mg); 460 } 461 462 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 463 { 464 struct dm_cache_migration *mg = p->mg; 465 466 BUG_ON(!mg); 467 p->mg = NULL; 468 469 return mg; 470 } 471 472 /* 473 * You must have a cell within the prealloc struct to return. If not this 474 * function will BUG() rather than returning NULL. 475 */ 476 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 477 { 478 struct dm_bio_prison_cell *r = NULL; 479 480 if (p->cell1) { 481 r = p->cell1; 482 p->cell1 = NULL; 483 484 } else if (p->cell2) { 485 r = p->cell2; 486 p->cell2 = NULL; 487 } else 488 BUG(); 489 490 return r; 491 } 492 493 /* 494 * You can't have more than two cells in a prealloc struct. BUG() will be 495 * called if you try and overfill. 496 */ 497 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 498 { 499 if (!p->cell2) 500 p->cell2 = cell; 501 502 else if (!p->cell1) 503 p->cell1 = cell; 504 505 else 506 BUG(); 507 } 508 509 /*----------------------------------------------------------------*/ 510 511 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 512 { 513 key->virtual = 0; 514 key->dev = 0; 515 key->block_begin = from_oblock(begin); 516 key->block_end = from_oblock(end); 517 } 518 519 /* 520 * The caller hands in a preallocated cell, and a free function for it. 521 * The cell will be freed if there's an error, or if it wasn't used because 522 * a cell with that key already exists. 
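 *
 * A positive return from bio_detain_range() means a cell with that key
 * already existed and the bio has simply been added to it as a prisoner.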
523 */ 524 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 525 526 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 527 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 528 cell_free_fn free_fn, void *free_context, 529 struct dm_bio_prison_cell **cell_result) 530 { 531 int r; 532 struct dm_cell_key key; 533 534 build_key(oblock_begin, oblock_end, &key); 535 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 536 if (r) 537 free_fn(free_context, cell_prealloc); 538 539 return r; 540 } 541 542 static int bio_detain(struct cache *cache, dm_oblock_t oblock, 543 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 544 cell_free_fn free_fn, void *free_context, 545 struct dm_bio_prison_cell **cell_result) 546 { 547 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 548 return bio_detain_range(cache, oblock, end, bio, 549 cell_prealloc, free_fn, free_context, cell_result); 550 } 551 552 static int get_cell(struct cache *cache, 553 dm_oblock_t oblock, 554 struct prealloc *structs, 555 struct dm_bio_prison_cell **cell_result) 556 { 557 int r; 558 struct dm_cell_key key; 559 struct dm_bio_prison_cell *cell_prealloc; 560 561 cell_prealloc = prealloc_get_cell(structs); 562 563 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 564 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 565 if (r) 566 prealloc_put_cell(structs, cell_prealloc); 567 568 return r; 569 } 570 571 /*----------------------------------------------------------------*/ 572 573 static bool is_dirty(struct cache *cache, dm_cblock_t b) 574 { 575 return test_bit(from_cblock(b), cache->dirty_bitset); 576 } 577 578 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 579 { 580 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 581 atomic_inc(&cache->nr_dirty); 582 policy_set_dirty(cache->policy, oblock); 583 } 584 } 585 586 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 587 { 588 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 589 policy_clear_dirty(cache->policy, oblock); 590 if (atomic_dec_return(&cache->nr_dirty) == 0) 591 dm_table_event(cache->ti->table); 592 } 593 } 594 595 /*----------------------------------------------------------------*/ 596 597 static bool block_size_is_power_of_two(struct cache *cache) 598 { 599 return cache->sectors_per_block_shift >= 0; 600 } 601 602 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 603 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 604 __always_inline 605 #endif 606 static dm_block_t block_div(dm_block_t b, uint32_t n) 607 { 608 do_div(b, n); 609 610 return b; 611 } 612 613 static dm_block_t oblocks_per_dblock(struct cache *cache) 614 { 615 dm_block_t oblocks = cache->discard_block_size; 616 617 if (block_size_is_power_of_two(cache)) 618 oblocks >>= cache->sectors_per_block_shift; 619 else 620 oblocks = block_div(oblocks, cache->sectors_per_block); 621 622 return oblocks; 623 } 624 625 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 626 { 627 return to_dblock(block_div(from_oblock(oblock), 628 oblocks_per_dblock(cache))); 629 } 630 631 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) 632 { 633 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); 634 } 635 636 static void set_discard(struct cache *cache, dm_dblock_t b) 637 { 638 
unsigned long flags; 639 640 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 641 atomic_inc(&cache->stats.discard_count); 642 643 spin_lock_irqsave(&cache->lock, flags); 644 set_bit(from_dblock(b), cache->discard_bitset); 645 spin_unlock_irqrestore(&cache->lock, flags); 646 } 647 648 static void clear_discard(struct cache *cache, dm_dblock_t b) 649 { 650 unsigned long flags; 651 652 spin_lock_irqsave(&cache->lock, flags); 653 clear_bit(from_dblock(b), cache->discard_bitset); 654 spin_unlock_irqrestore(&cache->lock, flags); 655 } 656 657 static bool is_discarded(struct cache *cache, dm_dblock_t b) 658 { 659 int r; 660 unsigned long flags; 661 662 spin_lock_irqsave(&cache->lock, flags); 663 r = test_bit(from_dblock(b), cache->discard_bitset); 664 spin_unlock_irqrestore(&cache->lock, flags); 665 666 return r; 667 } 668 669 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 670 { 671 int r; 672 unsigned long flags; 673 674 spin_lock_irqsave(&cache->lock, flags); 675 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 676 cache->discard_bitset); 677 spin_unlock_irqrestore(&cache->lock, flags); 678 679 return r; 680 } 681 682 /*----------------------------------------------------------------*/ 683 684 static void load_stats(struct cache *cache) 685 { 686 struct dm_cache_statistics stats; 687 688 dm_cache_metadata_get_stats(cache->cmd, &stats); 689 atomic_set(&cache->stats.read_hit, stats.read_hits); 690 atomic_set(&cache->stats.read_miss, stats.read_misses); 691 atomic_set(&cache->stats.write_hit, stats.write_hits); 692 atomic_set(&cache->stats.write_miss, stats.write_misses); 693 } 694 695 static void save_stats(struct cache *cache) 696 { 697 struct dm_cache_statistics stats; 698 699 if (get_cache_mode(cache) >= CM_READ_ONLY) 700 return; 701 702 stats.read_hits = atomic_read(&cache->stats.read_hit); 703 stats.read_misses = atomic_read(&cache->stats.read_miss); 704 stats.write_hits = atomic_read(&cache->stats.write_hit); 705 stats.write_misses = atomic_read(&cache->stats.write_miss); 706 707 dm_cache_metadata_set_stats(cache->cmd, &stats); 708 } 709 710 /*---------------------------------------------------------------- 711 * Per bio data 712 *--------------------------------------------------------------*/ 713 714 /* 715 * If using writeback, leave out struct per_bio_data's writethrough fields. 716 */ 717 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 718 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 719 720 static bool writethrough_mode(struct cache_features *f) 721 { 722 return f->io_mode == CM_IO_WRITETHROUGH; 723 } 724 725 static bool writeback_mode(struct cache_features *f) 726 { 727 return f->io_mode == CM_IO_WRITEBACK; 728 } 729 730 static bool passthrough_mode(struct cache_features *f) 731 { 732 return f->io_mode == CM_IO_PASSTHROUGH; 733 } 734 735 static size_t get_per_bio_data_size(struct cache *cache) 736 { 737 return writethrough_mode(&cache->features) ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 738 } 739 740 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 741 { 742 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 743 BUG_ON(!pb); 744 return pb; 745 } 746 747 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 748 { 749 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 750 751 pb->tick = false; 752 pb->req_nr = dm_bio_get_target_bio_nr(bio); 753 pb->all_io_entry = NULL; 754 pb->len = 0; 755 756 return pb; 757 } 758 759 /*---------------------------------------------------------------- 760 * Remapping 761 *--------------------------------------------------------------*/ 762 static void remap_to_origin(struct cache *cache, struct bio *bio) 763 { 764 bio->bi_bdev = cache->origin_dev->bdev; 765 } 766 767 static void remap_to_cache(struct cache *cache, struct bio *bio, 768 dm_cblock_t cblock) 769 { 770 sector_t bi_sector = bio->bi_iter.bi_sector; 771 sector_t block = from_cblock(cblock); 772 773 bio->bi_bdev = cache->cache_dev->bdev; 774 if (!block_size_is_power_of_two(cache)) 775 bio->bi_iter.bi_sector = 776 (block * cache->sectors_per_block) + 777 sector_div(bi_sector, cache->sectors_per_block); 778 else 779 bio->bi_iter.bi_sector = 780 (block << cache->sectors_per_block_shift) | 781 (bi_sector & (cache->sectors_per_block - 1)); 782 } 783 784 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 785 { 786 unsigned long flags; 787 size_t pb_data_size = get_per_bio_data_size(cache); 788 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 789 790 spin_lock_irqsave(&cache->lock, flags); 791 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && 792 bio_op(bio) != REQ_OP_DISCARD) { 793 pb->tick = true; 794 cache->need_tick_bio = false; 795 } 796 spin_unlock_irqrestore(&cache->lock, flags); 797 } 798 799 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 800 dm_oblock_t oblock) 801 { 802 check_if_tick_bio_needed(cache, bio); 803 remap_to_origin(cache, bio); 804 if (bio_data_dir(bio) == WRITE) 805 clear_discard(cache, oblock_to_dblock(cache, oblock)); 806 } 807 808 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 809 dm_oblock_t oblock, dm_cblock_t cblock) 810 { 811 check_if_tick_bio_needed(cache, bio); 812 remap_to_cache(cache, bio, cblock); 813 if (bio_data_dir(bio) == WRITE) { 814 set_dirty(cache, oblock, cblock); 815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 816 } 817 } 818 819 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 820 { 821 sector_t block_nr = bio->bi_iter.bi_sector; 822 823 if (!block_size_is_power_of_two(cache)) 824 (void) sector_div(block_nr, cache->sectors_per_block); 825 else 826 block_nr >>= cache->sectors_per_block_shift; 827 828 return to_oblock(block_nr); 829 } 830 831 /* 832 * You must increment the deferred set whilst the prison cell is held. To 833 * encourage this, we ask for 'cell' to be passed in. 
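 *
 * The matching dm_deferred_entry_dec() is done in
 * check_for_quiesced_migrations(), which may then queue any migrations
 * that were waiting for in-flight IO to drain.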
834 */ 835 static void inc_ds(struct cache *cache, struct bio *bio, 836 struct dm_bio_prison_cell *cell) 837 { 838 size_t pb_data_size = get_per_bio_data_size(cache); 839 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 840 841 BUG_ON(!cell); 842 BUG_ON(pb->all_io_entry); 843 844 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 845 } 846 847 static bool accountable_bio(struct cache *cache, struct bio *bio) 848 { 849 return ((bio->bi_bdev == cache->origin_dev->bdev) && 850 bio_op(bio) != REQ_OP_DISCARD); 851 } 852 853 static void accounted_begin(struct cache *cache, struct bio *bio) 854 { 855 size_t pb_data_size = get_per_bio_data_size(cache); 856 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 857 858 if (accountable_bio(cache, bio)) { 859 pb->len = bio_sectors(bio); 860 iot_io_begin(&cache->origin_tracker, pb->len); 861 } 862 } 863 864 static void accounted_complete(struct cache *cache, struct bio *bio) 865 { 866 size_t pb_data_size = get_per_bio_data_size(cache); 867 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 868 869 iot_io_end(&cache->origin_tracker, pb->len); 870 } 871 872 static void accounted_request(struct cache *cache, struct bio *bio) 873 { 874 accounted_begin(cache, bio); 875 generic_make_request(bio); 876 } 877 878 static void issue(struct cache *cache, struct bio *bio) 879 { 880 unsigned long flags; 881 882 if (!op_is_flush(bio->bi_opf)) { 883 accounted_request(cache, bio); 884 return; 885 } 886 887 /* 888 * Batch together any bios that trigger commits and then issue a 889 * single commit for them in do_worker(). 890 */ 891 spin_lock_irqsave(&cache->lock, flags); 892 cache->commit_requested = true; 893 bio_list_add(&cache->deferred_flush_bios, bio); 894 spin_unlock_irqrestore(&cache->lock, flags); 895 } 896 897 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) 898 { 899 inc_ds(cache, bio, cell); 900 issue(cache, bio); 901 } 902 903 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 904 { 905 unsigned long flags; 906 907 spin_lock_irqsave(&cache->lock, flags); 908 bio_list_add(&cache->deferred_writethrough_bios, bio); 909 spin_unlock_irqrestore(&cache->lock, flags); 910 911 wake_worker(cache); 912 } 913 914 static void writethrough_endio(struct bio *bio) 915 { 916 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 917 918 dm_unhook_bio(&pb->hook_info, bio); 919 920 if (bio->bi_error) { 921 bio_endio(bio); 922 return; 923 } 924 925 dm_bio_restore(&pb->bio_details, bio); 926 remap_to_cache(pb->cache, bio, pb->cblock); 927 928 /* 929 * We can't issue this bio directly, since we're in interrupt 930 * context. So it gets put on a bio list for processing by the 931 * worker thread. 932 */ 933 defer_writethrough_bio(pb->cache, bio); 934 } 935 936 /* 937 * When running in writethrough mode we need to send writes to clean blocks 938 * to both the cache and origin devices. In future we'd like to clone the 939 * bio and send them in parallel, but for now we're doing them in 940 * series as this is easier. 
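 *
 * The write is remapped to the origin first; writethrough_endio() then
 * restores the bio, remaps it to the cache device and defers it to the
 * worker thread for submission.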
941 */ 942 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 943 dm_oblock_t oblock, dm_cblock_t cblock) 944 { 945 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 946 947 pb->cache = cache; 948 pb->cblock = cblock; 949 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); 950 dm_bio_record(&pb->bio_details, bio); 951 952 remap_to_origin_clear_discard(pb->cache, bio, oblock); 953 } 954 955 /*---------------------------------------------------------------- 956 * Failure modes 957 *--------------------------------------------------------------*/ 958 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 959 { 960 return cache->features.mode; 961 } 962 963 static const char *cache_device_name(struct cache *cache) 964 { 965 return dm_device_name(dm_table_get_md(cache->ti->table)); 966 } 967 968 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 969 { 970 const char *descs[] = { 971 "write", 972 "read-only", 973 "fail" 974 }; 975 976 dm_table_event(cache->ti->table); 977 DMINFO("%s: switching cache to %s mode", 978 cache_device_name(cache), descs[(int)mode]); 979 } 980 981 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 982 { 983 bool needs_check; 984 enum cache_metadata_mode old_mode = get_cache_mode(cache); 985 986 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 987 DMERR("%s: unable to read needs_check flag, setting failure mode.", 988 cache_device_name(cache)); 989 new_mode = CM_FAIL; 990 } 991 992 if (new_mode == CM_WRITE && needs_check) { 993 DMERR("%s: unable to switch cache to write mode until repaired.", 994 cache_device_name(cache)); 995 if (old_mode != new_mode) 996 new_mode = old_mode; 997 else 998 new_mode = CM_READ_ONLY; 999 } 1000 1001 /* Never move out of fail mode */ 1002 if (old_mode == CM_FAIL) 1003 new_mode = CM_FAIL; 1004 1005 switch (new_mode) { 1006 case CM_FAIL: 1007 case CM_READ_ONLY: 1008 dm_cache_metadata_set_read_only(cache->cmd); 1009 break; 1010 1011 case CM_WRITE: 1012 dm_cache_metadata_set_read_write(cache->cmd); 1013 break; 1014 } 1015 1016 cache->features.mode = new_mode; 1017 1018 if (new_mode != old_mode) 1019 notify_mode_switch(cache, new_mode); 1020 } 1021 1022 static void abort_transaction(struct cache *cache) 1023 { 1024 const char *dev_name = cache_device_name(cache); 1025 1026 if (get_cache_mode(cache) >= CM_READ_ONLY) 1027 return; 1028 1029 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1030 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1031 set_cache_mode(cache, CM_FAIL); 1032 } 1033 1034 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1035 if (dm_cache_metadata_abort(cache->cmd)) { 1036 DMERR("%s: failed to abort metadata transaction", dev_name); 1037 set_cache_mode(cache, CM_FAIL); 1038 } 1039 } 1040 1041 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1042 { 1043 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1044 cache_device_name(cache), op, r); 1045 abort_transaction(cache); 1046 set_cache_mode(cache, CM_READ_ONLY); 1047 } 1048 1049 /*---------------------------------------------------------------- 1050 * Migration processing 1051 * 1052 * Migration covers moving data from the origin device to the cache, or 1053 * vice versa. 
1054 *--------------------------------------------------------------*/ 1055 static void inc_io_migrations(struct cache *cache) 1056 { 1057 atomic_inc(&cache->nr_io_migrations); 1058 } 1059 1060 static void dec_io_migrations(struct cache *cache) 1061 { 1062 atomic_dec(&cache->nr_io_migrations); 1063 } 1064 1065 static bool discard_or_flush(struct bio *bio) 1066 { 1067 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); 1068 } 1069 1070 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1071 { 1072 if (discard_or_flush(cell->holder)) { 1073 /* 1074 * We have to handle these bios individually. 1075 */ 1076 dm_cell_release(cache->prison, cell, &cache->deferred_bios); 1077 free_prison_cell(cache, cell); 1078 } else 1079 list_add_tail(&cell->user_list, &cache->deferred_cells); 1080 } 1081 1082 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) 1083 { 1084 unsigned long flags; 1085 1086 if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { 1087 /* 1088 * There was no prisoner to promote to holder, the 1089 * cell has been released. 1090 */ 1091 free_prison_cell(cache, cell); 1092 return; 1093 } 1094 1095 spin_lock_irqsave(&cache->lock, flags); 1096 __cell_defer(cache, cell); 1097 spin_unlock_irqrestore(&cache->lock, flags); 1098 1099 wake_worker(cache); 1100 } 1101 1102 static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1103 { 1104 dm_cell_error(cache->prison, cell, err); 1105 free_prison_cell(cache, cell); 1106 } 1107 1108 static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1109 { 1110 cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1111 } 1112 1113 static void free_io_migration(struct dm_cache_migration *mg) 1114 { 1115 struct cache *cache = mg->cache; 1116 1117 dec_io_migrations(cache); 1118 free_migration(mg); 1119 wake_worker(cache); 1120 } 1121 1122 static void migration_failure(struct dm_cache_migration *mg) 1123 { 1124 struct cache *cache = mg->cache; 1125 const char *dev_name = cache_device_name(cache); 1126 1127 if (mg->writeback) { 1128 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); 1129 set_dirty(cache, mg->old_oblock, mg->cblock); 1130 cell_defer(cache, mg->old_ocell, false); 1131 1132 } else if (mg->demote) { 1133 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); 1134 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 1135 1136 cell_defer(cache, mg->old_ocell, mg->promote ? 
false : true); 1137 if (mg->promote) 1138 cell_defer(cache, mg->new_ocell, true); 1139 } else { 1140 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); 1141 policy_remove_mapping(cache->policy, mg->new_oblock); 1142 cell_defer(cache, mg->new_ocell, true); 1143 } 1144 1145 free_io_migration(mg); 1146 } 1147 1148 static void migration_success_pre_commit(struct dm_cache_migration *mg) 1149 { 1150 int r; 1151 unsigned long flags; 1152 struct cache *cache = mg->cache; 1153 1154 if (mg->writeback) { 1155 clear_dirty(cache, mg->old_oblock, mg->cblock); 1156 cell_defer(cache, mg->old_ocell, false); 1157 free_io_migration(mg); 1158 return; 1159 1160 } else if (mg->demote) { 1161 r = dm_cache_remove_mapping(cache->cmd, mg->cblock); 1162 if (r) { 1163 DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", 1164 cache_device_name(cache)); 1165 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1166 policy_force_mapping(cache->policy, mg->new_oblock, 1167 mg->old_oblock); 1168 if (mg->promote) 1169 cell_defer(cache, mg->new_ocell, true); 1170 free_io_migration(mg); 1171 return; 1172 } 1173 } else { 1174 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); 1175 if (r) { 1176 DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", 1177 cache_device_name(cache)); 1178 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1179 policy_remove_mapping(cache->policy, mg->new_oblock); 1180 free_io_migration(mg); 1181 return; 1182 } 1183 } 1184 1185 spin_lock_irqsave(&cache->lock, flags); 1186 list_add_tail(&mg->list, &cache->need_commit_migrations); 1187 cache->commit_requested = true; 1188 spin_unlock_irqrestore(&cache->lock, flags); 1189 } 1190 1191 static void migration_success_post_commit(struct dm_cache_migration *mg) 1192 { 1193 unsigned long flags; 1194 struct cache *cache = mg->cache; 1195 1196 if (mg->writeback) { 1197 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", 1198 cache_device_name(cache)); 1199 return; 1200 1201 } else if (mg->demote) { 1202 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1203 1204 if (mg->promote) { 1205 mg->demote = false; 1206 1207 spin_lock_irqsave(&cache->lock, flags); 1208 list_add_tail(&mg->list, &cache->quiesced_migrations); 1209 spin_unlock_irqrestore(&cache->lock, flags); 1210 1211 } else { 1212 if (mg->invalidate) 1213 policy_remove_mapping(cache->policy, mg->old_oblock); 1214 free_io_migration(mg); 1215 } 1216 1217 } else { 1218 if (mg->requeue_holder) { 1219 clear_dirty(cache, mg->new_oblock, mg->cblock); 1220 cell_defer(cache, mg->new_ocell, true); 1221 } else { 1222 /* 1223 * The block was promoted via an overwrite, so it's dirty. 
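                         * (overwrite_endio() cleared mg->requeue_holder, so
                         * the holder is completed here rather than deferred.)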
1224 */ 1225 set_dirty(cache, mg->new_oblock, mg->cblock); 1226 bio_endio(mg->new_ocell->holder); 1227 cell_defer(cache, mg->new_ocell, false); 1228 } 1229 free_io_migration(mg); 1230 } 1231 } 1232 1233 static void copy_complete(int read_err, unsigned long write_err, void *context) 1234 { 1235 unsigned long flags; 1236 struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 1237 struct cache *cache = mg->cache; 1238 1239 if (read_err || write_err) 1240 mg->err = true; 1241 1242 spin_lock_irqsave(&cache->lock, flags); 1243 list_add_tail(&mg->list, &cache->completed_migrations); 1244 spin_unlock_irqrestore(&cache->lock, flags); 1245 1246 wake_worker(cache); 1247 } 1248 1249 static void issue_copy(struct dm_cache_migration *mg) 1250 { 1251 int r; 1252 struct dm_io_region o_region, c_region; 1253 struct cache *cache = mg->cache; 1254 sector_t cblock = from_cblock(mg->cblock); 1255 1256 o_region.bdev = cache->origin_dev->bdev; 1257 o_region.count = cache->sectors_per_block; 1258 1259 c_region.bdev = cache->cache_dev->bdev; 1260 c_region.sector = cblock * cache->sectors_per_block; 1261 c_region.count = cache->sectors_per_block; 1262 1263 if (mg->writeback || mg->demote) { 1264 /* demote */ 1265 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1266 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1267 } else { 1268 /* promote */ 1269 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 1270 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 1271 } 1272 1273 if (r < 0) { 1274 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1275 migration_failure(mg); 1276 } 1277 } 1278 1279 static void overwrite_endio(struct bio *bio) 1280 { 1281 struct dm_cache_migration *mg = bio->bi_private; 1282 struct cache *cache = mg->cache; 1283 size_t pb_data_size = get_per_bio_data_size(cache); 1284 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1285 unsigned long flags; 1286 1287 dm_unhook_bio(&pb->hook_info, bio); 1288 1289 if (bio->bi_error) 1290 mg->err = true; 1291 1292 mg->requeue_holder = false; 1293 1294 spin_lock_irqsave(&cache->lock, flags); 1295 list_add_tail(&mg->list, &cache->completed_migrations); 1296 spin_unlock_irqrestore(&cache->lock, flags); 1297 1298 wake_worker(cache); 1299 } 1300 1301 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1302 { 1303 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1304 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1305 1306 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1307 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); 1308 1309 /* 1310 * No need to inc_ds() here, since the cell will be held for the 1311 * duration of the io. 
1312 */ 1313 accounted_request(mg->cache, bio); 1314 } 1315 1316 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1317 { 1318 return (bio_data_dir(bio) == WRITE) && 1319 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1320 } 1321 1322 static void avoid_copy(struct dm_cache_migration *mg) 1323 { 1324 atomic_inc(&mg->cache->stats.copies_avoided); 1325 migration_success_pre_commit(mg); 1326 } 1327 1328 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1329 dm_dblock_t *b, dm_dblock_t *e) 1330 { 1331 sector_t sb = bio->bi_iter.bi_sector; 1332 sector_t se = bio_end_sector(bio); 1333 1334 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1335 1336 if (se - sb < cache->discard_block_size) 1337 *e = *b; 1338 else 1339 *e = to_dblock(block_div(se, cache->discard_block_size)); 1340 } 1341 1342 static void issue_discard(struct dm_cache_migration *mg) 1343 { 1344 dm_dblock_t b, e; 1345 struct bio *bio = mg->new_ocell->holder; 1346 struct cache *cache = mg->cache; 1347 1348 calc_discard_block_range(cache, bio, &b, &e); 1349 while (b != e) { 1350 set_discard(cache, b); 1351 b = to_dblock(from_dblock(b) + 1); 1352 } 1353 1354 bio_endio(bio); 1355 cell_defer(cache, mg->new_ocell, false); 1356 free_migration(mg); 1357 wake_worker(cache); 1358 } 1359 1360 static void issue_copy_or_discard(struct dm_cache_migration *mg) 1361 { 1362 bool avoid; 1363 struct cache *cache = mg->cache; 1364 1365 if (mg->discard) { 1366 issue_discard(mg); 1367 return; 1368 } 1369 1370 if (mg->writeback || mg->demote) 1371 avoid = !is_dirty(cache, mg->cblock) || 1372 is_discarded_oblock(cache, mg->old_oblock); 1373 else { 1374 struct bio *bio = mg->new_ocell->holder; 1375 1376 avoid = is_discarded_oblock(cache, mg->new_oblock); 1377 1378 if (writeback_mode(&cache->features) && 1379 !avoid && bio_writes_complete_block(cache, bio)) { 1380 issue_overwrite(mg, bio); 1381 return; 1382 } 1383 } 1384 1385 avoid ? 
avoid_copy(mg) : issue_copy(mg); 1386 } 1387 1388 static void complete_migration(struct dm_cache_migration *mg) 1389 { 1390 if (mg->err) 1391 migration_failure(mg); 1392 else 1393 migration_success_pre_commit(mg); 1394 } 1395 1396 static void process_migrations(struct cache *cache, struct list_head *head, 1397 void (*fn)(struct dm_cache_migration *)) 1398 { 1399 unsigned long flags; 1400 struct list_head list; 1401 struct dm_cache_migration *mg, *tmp; 1402 1403 INIT_LIST_HEAD(&list); 1404 spin_lock_irqsave(&cache->lock, flags); 1405 list_splice_init(head, &list); 1406 spin_unlock_irqrestore(&cache->lock, flags); 1407 1408 list_for_each_entry_safe(mg, tmp, &list, list) 1409 fn(mg); 1410 } 1411 1412 static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1413 { 1414 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1415 } 1416 1417 static void queue_quiesced_migration(struct dm_cache_migration *mg) 1418 { 1419 unsigned long flags; 1420 struct cache *cache = mg->cache; 1421 1422 spin_lock_irqsave(&cache->lock, flags); 1423 __queue_quiesced_migration(mg); 1424 spin_unlock_irqrestore(&cache->lock, flags); 1425 1426 wake_worker(cache); 1427 } 1428 1429 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1430 { 1431 unsigned long flags; 1432 struct dm_cache_migration *mg, *tmp; 1433 1434 spin_lock_irqsave(&cache->lock, flags); 1435 list_for_each_entry_safe(mg, tmp, work, list) 1436 __queue_quiesced_migration(mg); 1437 spin_unlock_irqrestore(&cache->lock, flags); 1438 1439 wake_worker(cache); 1440 } 1441 1442 static void check_for_quiesced_migrations(struct cache *cache, 1443 struct per_bio_data *pb) 1444 { 1445 struct list_head work; 1446 1447 if (!pb->all_io_entry) 1448 return; 1449 1450 INIT_LIST_HEAD(&work); 1451 dm_deferred_entry_dec(pb->all_io_entry, &work); 1452 1453 if (!list_empty(&work)) 1454 queue_quiesced_migrations(cache, &work); 1455 } 1456 1457 static void quiesce_migration(struct dm_cache_migration *mg) 1458 { 1459 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 1460 queue_quiesced_migration(mg); 1461 } 1462 1463 static void promote(struct cache *cache, struct prealloc *structs, 1464 dm_oblock_t oblock, dm_cblock_t cblock, 1465 struct dm_bio_prison_cell *cell) 1466 { 1467 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1468 1469 mg->err = false; 1470 mg->discard = false; 1471 mg->writeback = false; 1472 mg->demote = false; 1473 mg->promote = true; 1474 mg->requeue_holder = true; 1475 mg->invalidate = false; 1476 mg->cache = cache; 1477 mg->new_oblock = oblock; 1478 mg->cblock = cblock; 1479 mg->old_ocell = NULL; 1480 mg->new_ocell = cell; 1481 mg->start_jiffies = jiffies; 1482 1483 inc_io_migrations(cache); 1484 quiesce_migration(mg); 1485 } 1486 1487 static void writeback(struct cache *cache, struct prealloc *structs, 1488 dm_oblock_t oblock, dm_cblock_t cblock, 1489 struct dm_bio_prison_cell *cell) 1490 { 1491 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1492 1493 mg->err = false; 1494 mg->discard = false; 1495 mg->writeback = true; 1496 mg->demote = false; 1497 mg->promote = false; 1498 mg->requeue_holder = true; 1499 mg->invalidate = false; 1500 mg->cache = cache; 1501 mg->old_oblock = oblock; 1502 mg->cblock = cblock; 1503 mg->old_ocell = cell; 1504 mg->new_ocell = NULL; 1505 mg->start_jiffies = jiffies; 1506 1507 inc_io_migrations(cache); 1508 quiesce_migration(mg); 1509 } 1510 1511 static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1512 
dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1513 dm_cblock_t cblock, 1514 struct dm_bio_prison_cell *old_ocell, 1515 struct dm_bio_prison_cell *new_ocell) 1516 { 1517 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1518 1519 mg->err = false; 1520 mg->discard = false; 1521 mg->writeback = false; 1522 mg->demote = true; 1523 mg->promote = true; 1524 mg->requeue_holder = true; 1525 mg->invalidate = false; 1526 mg->cache = cache; 1527 mg->old_oblock = old_oblock; 1528 mg->new_oblock = new_oblock; 1529 mg->cblock = cblock; 1530 mg->old_ocell = old_ocell; 1531 mg->new_ocell = new_ocell; 1532 mg->start_jiffies = jiffies; 1533 1534 inc_io_migrations(cache); 1535 quiesce_migration(mg); 1536 } 1537 1538 /* 1539 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1540 * block are thrown away. 1541 */ 1542 static void invalidate(struct cache *cache, struct prealloc *structs, 1543 dm_oblock_t oblock, dm_cblock_t cblock, 1544 struct dm_bio_prison_cell *cell) 1545 { 1546 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1547 1548 mg->err = false; 1549 mg->discard = false; 1550 mg->writeback = false; 1551 mg->demote = true; 1552 mg->promote = false; 1553 mg->requeue_holder = true; 1554 mg->invalidate = true; 1555 mg->cache = cache; 1556 mg->old_oblock = oblock; 1557 mg->cblock = cblock; 1558 mg->old_ocell = cell; 1559 mg->new_ocell = NULL; 1560 mg->start_jiffies = jiffies; 1561 1562 inc_io_migrations(cache); 1563 quiesce_migration(mg); 1564 } 1565 1566 static void discard(struct cache *cache, struct prealloc *structs, 1567 struct dm_bio_prison_cell *cell) 1568 { 1569 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1570 1571 mg->err = false; 1572 mg->discard = true; 1573 mg->writeback = false; 1574 mg->demote = false; 1575 mg->promote = false; 1576 mg->requeue_holder = false; 1577 mg->invalidate = false; 1578 mg->cache = cache; 1579 mg->old_ocell = NULL; 1580 mg->new_ocell = cell; 1581 mg->start_jiffies = jiffies; 1582 1583 quiesce_migration(mg); 1584 } 1585 1586 /*---------------------------------------------------------------- 1587 * bio processing 1588 *--------------------------------------------------------------*/ 1589 static void defer_bio(struct cache *cache, struct bio *bio) 1590 { 1591 unsigned long flags; 1592 1593 spin_lock_irqsave(&cache->lock, flags); 1594 bio_list_add(&cache->deferred_bios, bio); 1595 spin_unlock_irqrestore(&cache->lock, flags); 1596 1597 wake_worker(cache); 1598 } 1599 1600 static void process_flush_bio(struct cache *cache, struct bio *bio) 1601 { 1602 size_t pb_data_size = get_per_bio_data_size(cache); 1603 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1604 1605 BUG_ON(bio->bi_iter.bi_size); 1606 if (!pb->req_nr) 1607 remap_to_origin(cache, bio); 1608 else 1609 remap_to_cache(cache, bio, 0); 1610 1611 /* 1612 * REQ_PREFLUSH is not directed at any particular block so we don't 1613 * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1614 * by dm-core. 
1615 */ 1616 issue(cache, bio); 1617 } 1618 1619 static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1620 struct bio *bio) 1621 { 1622 int r; 1623 dm_dblock_t b, e; 1624 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1625 1626 calc_discard_block_range(cache, bio, &b, &e); 1627 if (b == e) { 1628 bio_endio(bio); 1629 return; 1630 } 1631 1632 cell_prealloc = prealloc_get_cell(structs); 1633 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1634 (cell_free_fn) prealloc_put_cell, 1635 structs, &new_ocell); 1636 if (r > 0) 1637 return; 1638 1639 discard(cache, structs, new_ocell); 1640 } 1641 1642 static bool spare_migration_bandwidth(struct cache *cache) 1643 { 1644 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1645 cache->sectors_per_block; 1646 return current_volume < cache->migration_threshold; 1647 } 1648 1649 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1650 { 1651 atomic_inc(bio_data_dir(bio) == READ ? 1652 &cache->stats.read_hit : &cache->stats.write_hit); 1653 } 1654 1655 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1656 { 1657 atomic_inc(bio_data_dir(bio) == READ ? 1658 &cache->stats.read_miss : &cache->stats.write_miss); 1659 } 1660 1661 /*----------------------------------------------------------------*/ 1662 1663 struct inc_detail { 1664 struct cache *cache; 1665 struct bio_list bios_for_issue; 1666 struct bio_list unhandled_bios; 1667 bool any_writes; 1668 }; 1669 1670 static void inc_fn(void *context, struct dm_bio_prison_cell *cell) 1671 { 1672 struct bio *bio; 1673 struct inc_detail *detail = context; 1674 struct cache *cache = detail->cache; 1675 1676 inc_ds(cache, cell->holder, cell); 1677 if (bio_data_dir(cell->holder) == WRITE) 1678 detail->any_writes = true; 1679 1680 while ((bio = bio_list_pop(&cell->bios))) { 1681 if (discard_or_flush(bio)) { 1682 bio_list_add(&detail->unhandled_bios, bio); 1683 continue; 1684 } 1685 1686 if (bio_data_dir(bio) == WRITE) 1687 detail->any_writes = true; 1688 1689 bio_list_add(&detail->bios_for_issue, bio); 1690 inc_ds(cache, bio, cell); 1691 } 1692 } 1693 1694 // FIXME: refactor these two 1695 static void remap_cell_to_origin_clear_discard(struct cache *cache, 1696 struct dm_bio_prison_cell *cell, 1697 dm_oblock_t oblock, bool issue_holder) 1698 { 1699 struct bio *bio; 1700 unsigned long flags; 1701 struct inc_detail detail; 1702 1703 detail.cache = cache; 1704 bio_list_init(&detail.bios_for_issue); 1705 bio_list_init(&detail.unhandled_bios); 1706 detail.any_writes = false; 1707 1708 spin_lock_irqsave(&cache->lock, flags); 1709 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1710 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1711 spin_unlock_irqrestore(&cache->lock, flags); 1712 1713 remap_to_origin(cache, cell->holder); 1714 if (issue_holder) 1715 issue(cache, cell->holder); 1716 else 1717 accounted_begin(cache, cell->holder); 1718 1719 if (detail.any_writes) 1720 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1721 1722 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1723 remap_to_origin(cache, bio); 1724 issue(cache, bio); 1725 } 1726 1727 free_prison_cell(cache, cell); 1728 } 1729 1730 static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1731 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1732 { 1733 struct bio *bio; 1734 unsigned long flags; 1735 struct inc_detail detail; 1736 1737 
detail.cache = cache; 1738 bio_list_init(&detail.bios_for_issue); 1739 bio_list_init(&detail.unhandled_bios); 1740 detail.any_writes = false; 1741 1742 spin_lock_irqsave(&cache->lock, flags); 1743 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1744 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1745 spin_unlock_irqrestore(&cache->lock, flags); 1746 1747 remap_to_cache(cache, cell->holder, cblock); 1748 if (issue_holder) 1749 issue(cache, cell->holder); 1750 else 1751 accounted_begin(cache, cell->holder); 1752 1753 if (detail.any_writes) { 1754 set_dirty(cache, oblock, cblock); 1755 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1756 } 1757 1758 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1759 remap_to_cache(cache, bio, cblock); 1760 issue(cache, bio); 1761 } 1762 1763 free_prison_cell(cache, cell); 1764 } 1765 1766 /*----------------------------------------------------------------*/ 1767 1768 struct old_oblock_lock { 1769 struct policy_locker locker; 1770 struct cache *cache; 1771 struct prealloc *structs; 1772 struct dm_bio_prison_cell *cell; 1773 }; 1774 1775 static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1776 { 1777 /* This should never be called */ 1778 BUG(); 1779 return 0; 1780 } 1781 1782 static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1783 { 1784 struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1785 struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1786 1787 return bio_detain(l->cache, b, NULL, cell_prealloc, 1788 (cell_free_fn) prealloc_put_cell, 1789 l->structs, &l->cell); 1790 } 1791 1792 static void process_cell(struct cache *cache, struct prealloc *structs, 1793 struct dm_bio_prison_cell *new_ocell) 1794 { 1795 int r; 1796 bool release_cell = true; 1797 struct bio *bio = new_ocell->holder; 1798 dm_oblock_t block = get_bio_block(cache, bio); 1799 struct policy_result lookup_result; 1800 bool passthrough = passthrough_mode(&cache->features); 1801 bool fast_promotion, can_migrate; 1802 struct old_oblock_lock ool; 1803 1804 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 1805 can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); 1806 1807 ool.locker.fn = cell_locker; 1808 ool.cache = cache; 1809 ool.structs = structs; 1810 ool.cell = NULL; 1811 r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, 1812 bio, &ool.locker, &lookup_result); 1813 1814 if (r == -EWOULDBLOCK) 1815 /* migration has been denied */ 1816 lookup_result.op = POLICY_MISS; 1817 1818 switch (lookup_result.op) { 1819 case POLICY_HIT: 1820 if (passthrough) { 1821 inc_miss_counter(cache, bio); 1822 1823 /* 1824 * Passthrough always maps to the origin, 1825 * invalidating any cache blocks that are written 1826 * to. 
1827 */ 1828 1829 if (bio_data_dir(bio) == WRITE) { 1830 atomic_inc(&cache->stats.demotion); 1831 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1832 release_cell = false; 1833 1834 } else { 1835 /* FIXME: factor out issue_origin() */ 1836 remap_to_origin_clear_discard(cache, bio, block); 1837 inc_and_issue(cache, bio, new_ocell); 1838 } 1839 } else { 1840 inc_hit_counter(cache, bio); 1841 1842 if (bio_data_dir(bio) == WRITE && 1843 writethrough_mode(&cache->features) && 1844 !is_dirty(cache, lookup_result.cblock)) { 1845 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1846 inc_and_issue(cache, bio, new_ocell); 1847 1848 } else { 1849 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); 1850 release_cell = false; 1851 } 1852 } 1853 1854 break; 1855 1856 case POLICY_MISS: 1857 inc_miss_counter(cache, bio); 1858 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); 1859 release_cell = false; 1860 break; 1861 1862 case POLICY_NEW: 1863 atomic_inc(&cache->stats.promotion); 1864 promote(cache, structs, block, lookup_result.cblock, new_ocell); 1865 release_cell = false; 1866 break; 1867 1868 case POLICY_REPLACE: 1869 atomic_inc(&cache->stats.demotion); 1870 atomic_inc(&cache->stats.promotion); 1871 demote_then_promote(cache, structs, lookup_result.old_oblock, 1872 block, lookup_result.cblock, 1873 ool.cell, new_ocell); 1874 release_cell = false; 1875 break; 1876 1877 default: 1878 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", 1879 cache_device_name(cache), __func__, 1880 (unsigned) lookup_result.op); 1881 bio_io_error(bio); 1882 } 1883 1884 if (release_cell) 1885 cell_defer(cache, new_ocell, false); 1886 } 1887 1888 static void process_bio(struct cache *cache, struct prealloc *structs, 1889 struct bio *bio) 1890 { 1891 int r; 1892 dm_oblock_t block = get_bio_block(cache, bio); 1893 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1894 1895 /* 1896 * Check to see if that block is currently migrating. 1897 */ 1898 cell_prealloc = prealloc_get_cell(structs); 1899 r = bio_detain(cache, block, bio, cell_prealloc, 1900 (cell_free_fn) prealloc_put_cell, 1901 structs, &new_ocell); 1902 if (r > 0) 1903 return; 1904 1905 process_cell(cache, structs, new_ocell); 1906 } 1907 1908 static int need_commit_due_to_time(struct cache *cache) 1909 { 1910 return jiffies < cache->last_commit_jiffies || 1911 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1912 } 1913 1914 /* 1915 * A non-zero return indicates read_only or fail_io mode. 
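 * In that case the commit is skipped entirely and -EINVAL is returned.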
1916 */ 1917 static int commit(struct cache *cache, bool clean_shutdown) 1918 { 1919 int r; 1920 1921 if (get_cache_mode(cache) >= CM_READ_ONLY) 1922 return -EINVAL; 1923 1924 atomic_inc(&cache->stats.commit_count); 1925 r = dm_cache_commit(cache->cmd, clean_shutdown); 1926 if (r) 1927 metadata_operation_failed(cache, "dm_cache_commit", r); 1928 1929 return r; 1930 } 1931 1932 static int commit_if_needed(struct cache *cache) 1933 { 1934 int r = 0; 1935 1936 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1937 dm_cache_changed_this_transaction(cache->cmd)) { 1938 r = commit(cache, false); 1939 cache->commit_requested = false; 1940 cache->last_commit_jiffies = jiffies; 1941 } 1942 1943 return r; 1944 } 1945 1946 static void process_deferred_bios(struct cache *cache) 1947 { 1948 bool prealloc_used = false; 1949 unsigned long flags; 1950 struct bio_list bios; 1951 struct bio *bio; 1952 struct prealloc structs; 1953 1954 memset(&structs, 0, sizeof(structs)); 1955 bio_list_init(&bios); 1956 1957 spin_lock_irqsave(&cache->lock, flags); 1958 bio_list_merge(&bios, &cache->deferred_bios); 1959 bio_list_init(&cache->deferred_bios); 1960 spin_unlock_irqrestore(&cache->lock, flags); 1961 1962 while (!bio_list_empty(&bios)) { 1963 /* 1964 * If we've got no free migration structs, and processing 1965 * this bio might require one, we pause until there are some 1966 * prepared mappings to process. 1967 */ 1968 prealloc_used = true; 1969 if (prealloc_data_structs(cache, &structs)) { 1970 spin_lock_irqsave(&cache->lock, flags); 1971 bio_list_merge(&cache->deferred_bios, &bios); 1972 spin_unlock_irqrestore(&cache->lock, flags); 1973 break; 1974 } 1975 1976 bio = bio_list_pop(&bios); 1977 1978 if (bio->bi_opf & REQ_PREFLUSH) 1979 process_flush_bio(cache, bio); 1980 else if (bio_op(bio) == REQ_OP_DISCARD) 1981 process_discard_bio(cache, &structs, bio); 1982 else 1983 process_bio(cache, &structs, bio); 1984 } 1985 1986 if (prealloc_used) 1987 prealloc_free_structs(cache, &structs); 1988 } 1989 1990 static void process_deferred_cells(struct cache *cache) 1991 { 1992 bool prealloc_used = false; 1993 unsigned long flags; 1994 struct dm_bio_prison_cell *cell, *tmp; 1995 struct list_head cells; 1996 struct prealloc structs; 1997 1998 memset(&structs, 0, sizeof(structs)); 1999 2000 INIT_LIST_HEAD(&cells); 2001 2002 spin_lock_irqsave(&cache->lock, flags); 2003 list_splice_init(&cache->deferred_cells, &cells); 2004 spin_unlock_irqrestore(&cache->lock, flags); 2005 2006 list_for_each_entry_safe(cell, tmp, &cells, user_list) { 2007 /* 2008 * If we've got no free migration structs, and processing 2009 * this bio might require one, we pause until there are some 2010 * prepared mappings to process. 
2011 */ 2012 prealloc_used = true; 2013 if (prealloc_data_structs(cache, &structs)) { 2014 spin_lock_irqsave(&cache->lock, flags); 2015 list_splice(&cells, &cache->deferred_cells); 2016 spin_unlock_irqrestore(&cache->lock, flags); 2017 break; 2018 } 2019 2020 process_cell(cache, &structs, cell); 2021 } 2022 2023 if (prealloc_used) 2024 prealloc_free_structs(cache, &structs); 2025 } 2026 2027 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2028 { 2029 unsigned long flags; 2030 struct bio_list bios; 2031 struct bio *bio; 2032 2033 bio_list_init(&bios); 2034 2035 spin_lock_irqsave(&cache->lock, flags); 2036 bio_list_merge(&bios, &cache->deferred_flush_bios); 2037 bio_list_init(&cache->deferred_flush_bios); 2038 spin_unlock_irqrestore(&cache->lock, flags); 2039 2040 /* 2041 * These bios have already been through inc_ds() 2042 */ 2043 while ((bio = bio_list_pop(&bios))) 2044 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2045 } 2046 2047 static void process_deferred_writethrough_bios(struct cache *cache) 2048 { 2049 unsigned long flags; 2050 struct bio_list bios; 2051 struct bio *bio; 2052 2053 bio_list_init(&bios); 2054 2055 spin_lock_irqsave(&cache->lock, flags); 2056 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 2057 bio_list_init(&cache->deferred_writethrough_bios); 2058 spin_unlock_irqrestore(&cache->lock, flags); 2059 2060 /* 2061 * These bios have already been through inc_ds() 2062 */ 2063 while ((bio = bio_list_pop(&bios))) 2064 accounted_request(cache, bio); 2065 } 2066 2067 static void writeback_some_dirty_blocks(struct cache *cache) 2068 { 2069 bool prealloc_used = false; 2070 dm_oblock_t oblock; 2071 dm_cblock_t cblock; 2072 struct prealloc structs; 2073 struct dm_bio_prison_cell *old_ocell; 2074 bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2075 2076 memset(&structs, 0, sizeof(structs)); 2077 2078 while (spare_migration_bandwidth(cache)) { 2079 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) 2080 break; /* no work to do */ 2081 2082 prealloc_used = true; 2083 if (prealloc_data_structs(cache, &structs) || 2084 get_cell(cache, oblock, &structs, &old_ocell)) { 2085 policy_set_dirty(cache->policy, oblock); 2086 break; 2087 } 2088 2089 writeback(cache, &structs, oblock, cblock, old_ocell); 2090 } 2091 2092 if (prealloc_used) 2093 prealloc_free_structs(cache, &structs); 2094 } 2095 2096 /*---------------------------------------------------------------- 2097 * Invalidations. 2098 * Dropping something from the cache *without* writing back. 
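 * Requests arrive via the "invalidate_cblocks" target message; each one
 * names a range of cblocks (begin inclusive, end exclusive) and is
 * serviced on the worker thread, which wakes the issuer when done.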
2099 *--------------------------------------------------------------*/ 2100 2101 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2102 { 2103 int r = 0; 2104 uint64_t begin = from_cblock(req->cblocks->begin); 2105 uint64_t end = from_cblock(req->cblocks->end); 2106 2107 while (begin != end) { 2108 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2109 if (!r) { 2110 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2111 if (r) { 2112 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2113 break; 2114 } 2115 2116 } else if (r == -ENODATA) { 2117 /* harmless, already unmapped */ 2118 r = 0; 2119 2120 } else { 2121 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2122 break; 2123 } 2124 2125 begin++; 2126 } 2127 2128 cache->commit_requested = true; 2129 2130 req->err = r; 2131 atomic_set(&req->complete, 1); 2132 2133 wake_up(&req->result_wait); 2134 } 2135 2136 static void process_invalidation_requests(struct cache *cache) 2137 { 2138 struct list_head list; 2139 struct invalidation_request *req, *tmp; 2140 2141 INIT_LIST_HEAD(&list); 2142 spin_lock(&cache->invalidation_lock); 2143 list_splice_init(&cache->invalidation_requests, &list); 2144 spin_unlock(&cache->invalidation_lock); 2145 2146 list_for_each_entry_safe (req, tmp, &list, list) 2147 process_invalidation_request(cache, req); 2148 } 2149 2150 /*---------------------------------------------------------------- 2151 * Main worker loop 2152 *--------------------------------------------------------------*/ 2153 static bool is_quiescing(struct cache *cache) 2154 { 2155 return atomic_read(&cache->quiescing); 2156 } 2157 2158 static void ack_quiescing(struct cache *cache) 2159 { 2160 if (is_quiescing(cache)) { 2161 atomic_inc(&cache->quiescing_ack); 2162 wake_up(&cache->quiescing_wait); 2163 } 2164 } 2165 2166 static void wait_for_quiescing_ack(struct cache *cache) 2167 { 2168 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2169 } 2170 2171 static void start_quiescing(struct cache *cache) 2172 { 2173 atomic_inc(&cache->quiescing); 2174 wait_for_quiescing_ack(cache); 2175 } 2176 2177 static void stop_quiescing(struct cache *cache) 2178 { 2179 atomic_set(&cache->quiescing, 0); 2180 atomic_set(&cache->quiescing_ack, 0); 2181 } 2182 2183 static void wait_for_migrations(struct cache *cache) 2184 { 2185 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2186 } 2187 2188 static void stop_worker(struct cache *cache) 2189 { 2190 cancel_delayed_work(&cache->waker); 2191 flush_workqueue(cache->wq); 2192 } 2193 2194 static void requeue_deferred_cells(struct cache *cache) 2195 { 2196 unsigned long flags; 2197 struct list_head cells; 2198 struct dm_bio_prison_cell *cell, *tmp; 2199 2200 INIT_LIST_HEAD(&cells); 2201 spin_lock_irqsave(&cache->lock, flags); 2202 list_splice_init(&cache->deferred_cells, &cells); 2203 spin_unlock_irqrestore(&cache->lock, flags); 2204 2205 list_for_each_entry_safe(cell, tmp, &cells, user_list) 2206 cell_requeue(cache, cell); 2207 } 2208 2209 static void requeue_deferred_bios(struct cache *cache) 2210 { 2211 struct bio *bio; 2212 struct bio_list bios; 2213 2214 bio_list_init(&bios); 2215 bio_list_merge(&bios, &cache->deferred_bios); 2216 bio_list_init(&cache->deferred_bios); 2217 2218 while ((bio = bio_list_pop(&bios))) { 2219 bio->bi_error = DM_ENDIO_REQUEUE; 2220 bio_endio(bio); 2221 } 2222 } 2223 2224 static int more_work(struct cache *cache) 2225 { 2226 if (is_quiescing(cache)) 
2227 return !list_empty(&cache->quiesced_migrations) || 2228 !list_empty(&cache->completed_migrations) || 2229 !list_empty(&cache->need_commit_migrations); 2230 else 2231 return !bio_list_empty(&cache->deferred_bios) || 2232 !list_empty(&cache->deferred_cells) || 2233 !bio_list_empty(&cache->deferred_flush_bios) || 2234 !bio_list_empty(&cache->deferred_writethrough_bios) || 2235 !list_empty(&cache->quiesced_migrations) || 2236 !list_empty(&cache->completed_migrations) || 2237 !list_empty(&cache->need_commit_migrations) || 2238 cache->invalidate; 2239 } 2240 2241 static void do_worker(struct work_struct *ws) 2242 { 2243 struct cache *cache = container_of(ws, struct cache, worker); 2244 2245 do { 2246 if (!is_quiescing(cache)) { 2247 writeback_some_dirty_blocks(cache); 2248 process_deferred_writethrough_bios(cache); 2249 process_deferred_bios(cache); 2250 process_deferred_cells(cache); 2251 process_invalidation_requests(cache); 2252 } 2253 2254 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2255 process_migrations(cache, &cache->completed_migrations, complete_migration); 2256 2257 if (commit_if_needed(cache)) { 2258 process_deferred_flush_bios(cache, false); 2259 process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2260 } else { 2261 process_deferred_flush_bios(cache, true); 2262 process_migrations(cache, &cache->need_commit_migrations, 2263 migration_success_post_commit); 2264 } 2265 2266 ack_quiescing(cache); 2267 2268 } while (more_work(cache)); 2269 } 2270 2271 /* 2272 * We want to commit periodically so that not too much 2273 * unwritten metadata builds up. 2274 */ 2275 static void do_waker(struct work_struct *ws) 2276 { 2277 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2278 policy_tick(cache->policy, true); 2279 wake_worker(cache); 2280 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2281 } 2282 2283 /*----------------------------------------------------------------*/ 2284 2285 static int is_congested(struct dm_dev *dev, int bdi_bits) 2286 { 2287 struct request_queue *q = bdev_get_queue(dev->bdev); 2288 return bdi_congested(q->backing_dev_info, bdi_bits); 2289 } 2290 2291 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2292 { 2293 struct cache *cache = container_of(cb, struct cache, callbacks); 2294 2295 return is_congested(cache->origin_dev, bdi_bits) || 2296 is_congested(cache->cache_dev, bdi_bits); 2297 } 2298 2299 /*---------------------------------------------------------------- 2300 * Target methods 2301 *--------------------------------------------------------------*/ 2302 2303 /* 2304 * This function gets called on the error paths of the constructor, so we 2305 * have to cope with a partially initialised struct. 
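 * Since the struct is allocated with kzalloc() in cache_create(), anything
 * that was never set up is still NULL here and is safely skipped.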
2306 */ 2307 static void destroy(struct cache *cache) 2308 { 2309 unsigned i; 2310 2311 mempool_destroy(cache->migration_pool); 2312 2313 if (cache->all_io_ds) 2314 dm_deferred_set_destroy(cache->all_io_ds); 2315 2316 if (cache->prison) 2317 dm_bio_prison_destroy(cache->prison); 2318 2319 if (cache->wq) 2320 destroy_workqueue(cache->wq); 2321 2322 if (cache->dirty_bitset) 2323 free_bitset(cache->dirty_bitset); 2324 2325 if (cache->discard_bitset) 2326 free_bitset(cache->discard_bitset); 2327 2328 if (cache->copier) 2329 dm_kcopyd_client_destroy(cache->copier); 2330 2331 if (cache->cmd) 2332 dm_cache_metadata_close(cache->cmd); 2333 2334 if (cache->metadata_dev) 2335 dm_put_device(cache->ti, cache->metadata_dev); 2336 2337 if (cache->origin_dev) 2338 dm_put_device(cache->ti, cache->origin_dev); 2339 2340 if (cache->cache_dev) 2341 dm_put_device(cache->ti, cache->cache_dev); 2342 2343 if (cache->policy) 2344 dm_cache_policy_destroy(cache->policy); 2345 2346 for (i = 0; i < cache->nr_ctr_args ; i++) 2347 kfree(cache->ctr_args[i]); 2348 kfree(cache->ctr_args); 2349 2350 kfree(cache); 2351 } 2352 2353 static void cache_dtr(struct dm_target *ti) 2354 { 2355 struct cache *cache = ti->private; 2356 2357 destroy(cache); 2358 } 2359 2360 static sector_t get_dev_size(struct dm_dev *dev) 2361 { 2362 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2363 } 2364 2365 /*----------------------------------------------------------------*/ 2366 2367 /* 2368 * Construct a cache device mapping. 2369 * 2370 * cache <metadata dev> <cache dev> <origin dev> <block size> 2371 * <#feature args> [<feature arg>]* 2372 * <policy> <#policy args> [<policy arg>]* 2373 * 2374 * metadata dev : fast device holding the persistent metadata 2375 * cache dev : fast device holding cached data blocks 2376 * origin dev : slow device holding original data blocks 2377 * block size : cache unit size in sectors 2378 * 2379 * #feature args : number of feature arguments passed 2380 * feature args : writethrough. (The default is writeback.) 2381 * 2382 * policy : the replacement policy to use 2383 * #policy args : an even number of policy arguments corresponding 2384 * to key/value pairs passed to the policy 2385 * policy args : key/value pairs passed to the policy 2386 * E.g. 'sequential_threshold 1024' 2387 * See cache-policies.txt for details. 2388 * 2389 * Optional feature arguments are: 2390 * writethrough : write through caching that prohibits cache block 2391 * content from being different from origin block content. 2392 * Without this argument, the default behaviour is to write 2393 * back cache block contents later for performance reasons, 2394 * so they may differ from the corresponding origin blocks. 
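 *
 * For illustration only, a hypothetical target line (the device paths are
 * placeholders and the smq policy is assumed to be available):
 *
 *   cache /dev/mapper/fast-meta /dev/mapper/fast /dev/mapper/slow 512 1 writeback smq 0
 *
 * i.e. 512 sector (256KB) cache blocks, explicit writeback mode and the
 * smq policy with no policy arguments.  A full dmsetup table line prefixes
 * this with the usual <start sector> <length> fields.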
2395 */ 2396 struct cache_args { 2397 struct dm_target *ti; 2398 2399 struct dm_dev *metadata_dev; 2400 2401 struct dm_dev *cache_dev; 2402 sector_t cache_sectors; 2403 2404 struct dm_dev *origin_dev; 2405 sector_t origin_sectors; 2406 2407 uint32_t block_size; 2408 2409 const char *policy_name; 2410 int policy_argc; 2411 const char **policy_argv; 2412 2413 struct cache_features features; 2414 }; 2415 2416 static void destroy_cache_args(struct cache_args *ca) 2417 { 2418 if (ca->metadata_dev) 2419 dm_put_device(ca->ti, ca->metadata_dev); 2420 2421 if (ca->cache_dev) 2422 dm_put_device(ca->ti, ca->cache_dev); 2423 2424 if (ca->origin_dev) 2425 dm_put_device(ca->ti, ca->origin_dev); 2426 2427 kfree(ca); 2428 } 2429 2430 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2431 { 2432 if (!as->argc) { 2433 *error = "Insufficient args"; 2434 return false; 2435 } 2436 2437 return true; 2438 } 2439 2440 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2441 char **error) 2442 { 2443 int r; 2444 sector_t metadata_dev_size; 2445 char b[BDEVNAME_SIZE]; 2446 2447 if (!at_least_one_arg(as, error)) 2448 return -EINVAL; 2449 2450 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2451 &ca->metadata_dev); 2452 if (r) { 2453 *error = "Error opening metadata device"; 2454 return r; 2455 } 2456 2457 metadata_dev_size = get_dev_size(ca->metadata_dev); 2458 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2459 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2460 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2461 2462 return 0; 2463 } 2464 2465 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2466 char **error) 2467 { 2468 int r; 2469 2470 if (!at_least_one_arg(as, error)) 2471 return -EINVAL; 2472 2473 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2474 &ca->cache_dev); 2475 if (r) { 2476 *error = "Error opening cache device"; 2477 return r; 2478 } 2479 ca->cache_sectors = get_dev_size(ca->cache_dev); 2480 2481 return 0; 2482 } 2483 2484 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2485 char **error) 2486 { 2487 int r; 2488 2489 if (!at_least_one_arg(as, error)) 2490 return -EINVAL; 2491 2492 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2493 &ca->origin_dev); 2494 if (r) { 2495 *error = "Error opening origin device"; 2496 return r; 2497 } 2498 2499 ca->origin_sectors = get_dev_size(ca->origin_dev); 2500 if (ca->ti->len > ca->origin_sectors) { 2501 *error = "Device size larger than cached device"; 2502 return -EINVAL; 2503 } 2504 2505 return 0; 2506 } 2507 2508 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2509 char **error) 2510 { 2511 unsigned long block_size; 2512 2513 if (!at_least_one_arg(as, error)) 2514 return -EINVAL; 2515 2516 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2517 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2518 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2519 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2520 *error = "Invalid data block size"; 2521 return -EINVAL; 2522 } 2523 2524 if (block_size > ca->cache_sectors) { 2525 *error = "Data block size is larger than the cache device"; 2526 return -EINVAL; 2527 } 2528 2529 ca->block_size = block_size; 2530 2531 return 0; 2532 } 2533 2534 static void init_features(struct cache_features *cf) 2535 { 2536 cf->mode = CM_WRITE; 2537 cf->io_mode = 
CM_IO_WRITEBACK; 2538 cf->metadata_version = 1; 2539 } 2540 2541 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2542 char **error) 2543 { 2544 static struct dm_arg _args[] = { 2545 {0, 2, "Invalid number of cache feature arguments"}, 2546 }; 2547 2548 int r; 2549 unsigned argc; 2550 const char *arg; 2551 struct cache_features *cf = &ca->features; 2552 2553 init_features(cf); 2554 2555 r = dm_read_arg_group(_args, as, &argc, error); 2556 if (r) 2557 return -EINVAL; 2558 2559 while (argc--) { 2560 arg = dm_shift_arg(as); 2561 2562 if (!strcasecmp(arg, "writeback")) 2563 cf->io_mode = CM_IO_WRITEBACK; 2564 2565 else if (!strcasecmp(arg, "writethrough")) 2566 cf->io_mode = CM_IO_WRITETHROUGH; 2567 2568 else if (!strcasecmp(arg, "passthrough")) 2569 cf->io_mode = CM_IO_PASSTHROUGH; 2570 2571 else if (!strcasecmp(arg, "metadata2")) 2572 cf->metadata_version = 2; 2573 2574 else { 2575 *error = "Unrecognised cache feature requested"; 2576 return -EINVAL; 2577 } 2578 } 2579 2580 return 0; 2581 } 2582 2583 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2584 char **error) 2585 { 2586 static struct dm_arg _args[] = { 2587 {0, 1024, "Invalid number of policy arguments"}, 2588 }; 2589 2590 int r; 2591 2592 if (!at_least_one_arg(as, error)) 2593 return -EINVAL; 2594 2595 ca->policy_name = dm_shift_arg(as); 2596 2597 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2598 if (r) 2599 return -EINVAL; 2600 2601 ca->policy_argv = (const char **)as->argv; 2602 dm_consume_args(as, ca->policy_argc); 2603 2604 return 0; 2605 } 2606 2607 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2608 char **error) 2609 { 2610 int r; 2611 struct dm_arg_set as; 2612 2613 as.argc = argc; 2614 as.argv = argv; 2615 2616 r = parse_metadata_dev(ca, &as, error); 2617 if (r) 2618 return r; 2619 2620 r = parse_cache_dev(ca, &as, error); 2621 if (r) 2622 return r; 2623 2624 r = parse_origin_dev(ca, &as, error); 2625 if (r) 2626 return r; 2627 2628 r = parse_block_size(ca, &as, error); 2629 if (r) 2630 return r; 2631 2632 r = parse_features(ca, &as, error); 2633 if (r) 2634 return r; 2635 2636 r = parse_policy(ca, &as, error); 2637 if (r) 2638 return r; 2639 2640 return 0; 2641 } 2642 2643 /*----------------------------------------------------------------*/ 2644 2645 static struct kmem_cache *migration_cache; 2646 2647 #define NOT_CORE_OPTION 1 2648 2649 static int process_config_option(struct cache *cache, const char *key, const char *value) 2650 { 2651 unsigned long tmp; 2652 2653 if (!strcasecmp(key, "migration_threshold")) { 2654 if (kstrtoul(value, 10, &tmp)) 2655 return -EINVAL; 2656 2657 cache->migration_threshold = tmp; 2658 return 0; 2659 } 2660 2661 return NOT_CORE_OPTION; 2662 } 2663 2664 static int set_config_value(struct cache *cache, const char *key, const char *value) 2665 { 2666 int r = process_config_option(cache, key, value); 2667 2668 if (r == NOT_CORE_OPTION) 2669 r = policy_set_config_value(cache->policy, key, value); 2670 2671 if (r) 2672 DMWARN("bad config value for %s: %s", key, value); 2673 2674 return r; 2675 } 2676 2677 static int set_config_values(struct cache *cache, int argc, const char **argv) 2678 { 2679 int r = 0; 2680 2681 if (argc & 1) { 2682 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2683 return -EINVAL; 2684 } 2685 2686 while (argc) { 2687 r = set_config_value(cache, argv[0], argv[1]); 2688 if (r) 2689 break; 2690 2691 argc -= 2; 2692 argv += 2; 2693 } 2694 2695 return r; 
2696 } 2697 2698 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2699 char **error) 2700 { 2701 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2702 cache->cache_size, 2703 cache->origin_sectors, 2704 cache->sectors_per_block); 2705 if (IS_ERR(p)) { 2706 *error = "Error creating cache's policy"; 2707 return PTR_ERR(p); 2708 } 2709 cache->policy = p; 2710 2711 return 0; 2712 } 2713 2714 /* 2715 * We want the discard block size to be at least the size of the cache 2716 * block size and have no more than 2^14 discard blocks across the origin. 2717 */ 2718 #define MAX_DISCARD_BLOCKS (1 << 14) 2719 2720 static bool too_many_discard_blocks(sector_t discard_block_size, 2721 sector_t origin_size) 2722 { 2723 (void) sector_div(origin_size, discard_block_size); 2724 2725 return origin_size > MAX_DISCARD_BLOCKS; 2726 } 2727 2728 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2729 sector_t origin_size) 2730 { 2731 sector_t discard_block_size = cache_block_size; 2732 2733 if (origin_size) 2734 while (too_many_discard_blocks(discard_block_size, origin_size)) 2735 discard_block_size *= 2; 2736 2737 return discard_block_size; 2738 } 2739 2740 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2741 { 2742 dm_block_t nr_blocks = from_cblock(size); 2743 2744 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2745 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2746 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2747 "Please consider increasing the cache block size to reduce the overall cache block count.", 2748 (unsigned long long) nr_blocks); 2749 2750 cache->cache_size = size; 2751 } 2752 2753 #define DEFAULT_MIGRATION_THRESHOLD 2048 2754 2755 static int cache_create(struct cache_args *ca, struct cache **result) 2756 { 2757 int r = 0; 2758 char **error = &ca->ti->error; 2759 struct cache *cache; 2760 struct dm_target *ti = ca->ti; 2761 dm_block_t origin_blocks; 2762 struct dm_cache_metadata *cmd; 2763 bool may_format = ca->features.mode == CM_WRITE; 2764 2765 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2766 if (!cache) 2767 return -ENOMEM; 2768 2769 cache->ti = ca->ti; 2770 ti->private = cache; 2771 ti->num_flush_bios = 2; 2772 ti->flush_supported = true; 2773 2774 ti->num_discard_bios = 1; 2775 ti->discards_supported = true; 2776 ti->discard_zeroes_data_unsupported = true; 2777 ti->split_discard_bios = false; 2778 2779 cache->features = ca->features; 2780 ti->per_io_data_size = get_per_bio_data_size(cache); 2781 2782 cache->callbacks.congested_fn = cache_is_congested; 2783 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2784 2785 cache->metadata_dev = ca->metadata_dev; 2786 cache->origin_dev = ca->origin_dev; 2787 cache->cache_dev = ca->cache_dev; 2788 2789 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2790 2791 /* FIXME: factor out this whole section */ 2792 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2793 origin_blocks = block_div(origin_blocks, ca->block_size); 2794 cache->origin_blocks = to_oblock(origin_blocks); 2795 2796 cache->sectors_per_block = ca->block_size; 2797 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2798 r = -EINVAL; 2799 goto bad; 2800 } 2801 2802 if (ca->block_size & (ca->block_size - 1)) { 2803 dm_block_t cache_size = ca->cache_sectors; 2804 2805 cache->sectors_per_block_shift = -1; 2806 cache_size = block_div(cache_size, ca->block_size); 2807 
set_cache_size(cache, to_cblock(cache_size)); 2808 } else { 2809 cache->sectors_per_block_shift = __ffs(ca->block_size); 2810 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2811 } 2812 2813 r = create_cache_policy(cache, ca, error); 2814 if (r) 2815 goto bad; 2816 2817 cache->policy_nr_args = ca->policy_argc; 2818 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2819 2820 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2821 if (r) { 2822 *error = "Error setting cache policy's config values"; 2823 goto bad; 2824 } 2825 2826 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2827 ca->block_size, may_format, 2828 dm_cache_policy_get_hint_size(cache->policy), 2829 ca->features.metadata_version); 2830 if (IS_ERR(cmd)) { 2831 *error = "Error creating metadata object"; 2832 r = PTR_ERR(cmd); 2833 goto bad; 2834 } 2835 cache->cmd = cmd; 2836 set_cache_mode(cache, CM_WRITE); 2837 if (get_cache_mode(cache) != CM_WRITE) { 2838 *error = "Unable to get write access to metadata, please check/repair metadata."; 2839 r = -EINVAL; 2840 goto bad; 2841 } 2842 2843 if (passthrough_mode(&cache->features)) { 2844 bool all_clean; 2845 2846 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2847 if (r) { 2848 *error = "dm_cache_metadata_all_clean() failed"; 2849 goto bad; 2850 } 2851 2852 if (!all_clean) { 2853 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2854 r = -EINVAL; 2855 goto bad; 2856 } 2857 } 2858 2859 spin_lock_init(&cache->lock); 2860 INIT_LIST_HEAD(&cache->deferred_cells); 2861 bio_list_init(&cache->deferred_bios); 2862 bio_list_init(&cache->deferred_flush_bios); 2863 bio_list_init(&cache->deferred_writethrough_bios); 2864 INIT_LIST_HEAD(&cache->quiesced_migrations); 2865 INIT_LIST_HEAD(&cache->completed_migrations); 2866 INIT_LIST_HEAD(&cache->need_commit_migrations); 2867 atomic_set(&cache->nr_allocated_migrations, 0); 2868 atomic_set(&cache->nr_io_migrations, 0); 2869 init_waitqueue_head(&cache->migration_wait); 2870 2871 init_waitqueue_head(&cache->quiescing_wait); 2872 atomic_set(&cache->quiescing, 0); 2873 atomic_set(&cache->quiescing_ack, 0); 2874 2875 r = -ENOMEM; 2876 atomic_set(&cache->nr_dirty, 0); 2877 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2878 if (!cache->dirty_bitset) { 2879 *error = "could not allocate dirty bitset"; 2880 goto bad; 2881 } 2882 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2883 2884 cache->discard_block_size = 2885 calculate_discard_block_size(cache->sectors_per_block, 2886 cache->origin_sectors); 2887 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2888 cache->discard_block_size)); 2889 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2890 if (!cache->discard_bitset) { 2891 *error = "could not allocate discard bitset"; 2892 goto bad; 2893 } 2894 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2895 2896 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2897 if (IS_ERR(cache->copier)) { 2898 *error = "could not create kcopyd client"; 2899 r = PTR_ERR(cache->copier); 2900 goto bad; 2901 } 2902 2903 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2904 if (!cache->wq) { 2905 *error = "could not create workqueue for metadata object"; 2906 goto bad; 2907 } 2908 INIT_WORK(&cache->worker, do_worker); 2909 INIT_DELAYED_WORK(&cache->waker, do_waker); 2910 cache->last_commit_jiffies = jiffies; 2911 2912 
cache->prison = dm_bio_prison_create(); 2913 if (!cache->prison) { 2914 *error = "could not create bio prison"; 2915 goto bad; 2916 } 2917 2918 cache->all_io_ds = dm_deferred_set_create(); 2919 if (!cache->all_io_ds) { 2920 *error = "could not create all_io deferred set"; 2921 goto bad; 2922 } 2923 2924 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2925 migration_cache); 2926 if (!cache->migration_pool) { 2927 *error = "Error creating cache's migration mempool"; 2928 goto bad; 2929 } 2930 2931 cache->need_tick_bio = true; 2932 cache->sized = false; 2933 cache->invalidate = false; 2934 cache->commit_requested = false; 2935 cache->loaded_mappings = false; 2936 cache->loaded_discards = false; 2937 2938 load_stats(cache); 2939 2940 atomic_set(&cache->stats.demotion, 0); 2941 atomic_set(&cache->stats.promotion, 0); 2942 atomic_set(&cache->stats.copies_avoided, 0); 2943 atomic_set(&cache->stats.cache_cell_clash, 0); 2944 atomic_set(&cache->stats.commit_count, 0); 2945 atomic_set(&cache->stats.discard_count, 0); 2946 2947 spin_lock_init(&cache->invalidation_lock); 2948 INIT_LIST_HEAD(&cache->invalidation_requests); 2949 2950 iot_init(&cache->origin_tracker); 2951 2952 *result = cache; 2953 return 0; 2954 2955 bad: 2956 destroy(cache); 2957 return r; 2958 } 2959 2960 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2961 { 2962 unsigned i; 2963 const char **copy; 2964 2965 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2966 if (!copy) 2967 return -ENOMEM; 2968 for (i = 0; i < argc; i++) { 2969 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2970 if (!copy[i]) { 2971 while (i--) 2972 kfree(copy[i]); 2973 kfree(copy); 2974 return -ENOMEM; 2975 } 2976 } 2977 2978 cache->nr_ctr_args = argc; 2979 cache->ctr_args = copy; 2980 2981 return 0; 2982 } 2983 2984 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2985 { 2986 int r = -EINVAL; 2987 struct cache_args *ca; 2988 struct cache *cache = NULL; 2989 2990 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2991 if (!ca) { 2992 ti->error = "Error allocating memory for cache"; 2993 return -ENOMEM; 2994 } 2995 ca->ti = ti; 2996 2997 r = parse_cache_args(ca, argc, argv, &ti->error); 2998 if (r) 2999 goto out; 3000 3001 r = cache_create(ca, &cache); 3002 if (r) 3003 goto out; 3004 3005 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 3006 if (r) { 3007 destroy(cache); 3008 goto out; 3009 } 3010 3011 ti->private = cache; 3012 3013 out: 3014 destroy_cache_args(ca); 3015 return r; 3016 } 3017 3018 /*----------------------------------------------------------------*/ 3019 3020 static int cache_map(struct dm_target *ti, struct bio *bio) 3021 { 3022 struct cache *cache = ti->private; 3023 3024 int r; 3025 struct dm_bio_prison_cell *cell = NULL; 3026 dm_oblock_t block = get_bio_block(cache, bio); 3027 size_t pb_data_size = get_per_bio_data_size(cache); 3028 bool can_migrate = false; 3029 bool fast_promotion; 3030 struct policy_result lookup_result; 3031 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 3032 struct old_oblock_lock ool; 3033 3034 ool.locker.fn = null_locker; 3035 3036 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 3037 /* 3038 * This can only occur if the io goes to a partial block at 3039 * the end of the origin device. We don't cache these. 3040 * Just remap to the origin and carry on. 
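 * (cache->origin_blocks is rounded down to whole blocks in cache_create(),
 * so a short tail at the end of the origin device is never covered by a
 * cache block.)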
3041 */ 3042 remap_to_origin(cache, bio); 3043 accounted_begin(cache, bio); 3044 return DM_MAPIO_REMAPPED; 3045 } 3046 3047 if (discard_or_flush(bio)) { 3048 defer_bio(cache, bio); 3049 return DM_MAPIO_SUBMITTED; 3050 } 3051 3052 /* 3053 * Check to see if that block is currently migrating. 3054 */ 3055 cell = alloc_prison_cell(cache); 3056 if (!cell) { 3057 defer_bio(cache, bio); 3058 return DM_MAPIO_SUBMITTED; 3059 } 3060 3061 r = bio_detain(cache, block, bio, cell, 3062 (cell_free_fn) free_prison_cell, 3063 cache, &cell); 3064 if (r) { 3065 if (r < 0) 3066 defer_bio(cache, bio); 3067 3068 return DM_MAPIO_SUBMITTED; 3069 } 3070 3071 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 3072 3073 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 3074 bio, &ool.locker, &lookup_result); 3075 if (r == -EWOULDBLOCK) { 3076 cell_defer(cache, cell, true); 3077 return DM_MAPIO_SUBMITTED; 3078 3079 } else if (r) { 3080 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 3081 cache_device_name(cache), r); 3082 cell_defer(cache, cell, false); 3083 bio_io_error(bio); 3084 return DM_MAPIO_SUBMITTED; 3085 } 3086 3087 r = DM_MAPIO_REMAPPED; 3088 switch (lookup_result.op) { 3089 case POLICY_HIT: 3090 if (passthrough_mode(&cache->features)) { 3091 if (bio_data_dir(bio) == WRITE) { 3092 /* 3093 * We need to invalidate this block, so 3094 * defer for the worker thread. 3095 */ 3096 cell_defer(cache, cell, true); 3097 r = DM_MAPIO_SUBMITTED; 3098 3099 } else { 3100 inc_miss_counter(cache, bio); 3101 remap_to_origin_clear_discard(cache, bio, block); 3102 accounted_begin(cache, bio); 3103 inc_ds(cache, bio, cell); 3104 // FIXME: we want to remap hits or misses straight 3105 // away rather than passing over to the worker. 3106 cell_defer(cache, cell, false); 3107 } 3108 3109 } else { 3110 inc_hit_counter(cache, bio); 3111 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 3112 !is_dirty(cache, lookup_result.cblock)) { 3113 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 3114 accounted_begin(cache, bio); 3115 inc_ds(cache, bio, cell); 3116 cell_defer(cache, cell, false); 3117 3118 } else 3119 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 3120 } 3121 break; 3122 3123 case POLICY_MISS: 3124 inc_miss_counter(cache, bio); 3125 if (pb->req_nr != 0) { 3126 /* 3127 * This is a duplicate writethrough io that is no 3128 * longer needed because the block has been demoted. 
3129 */ 3130 bio_endio(bio); 3131 // FIXME: remap everything as a miss 3132 cell_defer(cache, cell, false); 3133 r = DM_MAPIO_SUBMITTED; 3134 3135 } else 3136 remap_cell_to_origin_clear_discard(cache, cell, block, false); 3137 break; 3138 3139 default: 3140 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 3141 cache_device_name(cache), __func__, 3142 (unsigned) lookup_result.op); 3143 cell_defer(cache, cell, false); 3144 bio_io_error(bio); 3145 r = DM_MAPIO_SUBMITTED; 3146 } 3147 3148 return r; 3149 } 3150 3151 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 3152 { 3153 struct cache *cache = ti->private; 3154 unsigned long flags; 3155 size_t pb_data_size = get_per_bio_data_size(cache); 3156 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 3157 3158 if (pb->tick) { 3159 policy_tick(cache->policy, false); 3160 3161 spin_lock_irqsave(&cache->lock, flags); 3162 cache->need_tick_bio = true; 3163 spin_unlock_irqrestore(&cache->lock, flags); 3164 } 3165 3166 check_for_quiesced_migrations(cache, pb); 3167 accounted_complete(cache, bio); 3168 3169 return 0; 3170 } 3171 3172 static int write_dirty_bitset(struct cache *cache) 3173 { 3174 int r; 3175 3176 if (get_cache_mode(cache) >= CM_READ_ONLY) 3177 return -EINVAL; 3178 3179 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); 3180 if (r) 3181 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); 3182 3183 return r; 3184 } 3185 3186 static int write_discard_bitset(struct cache *cache) 3187 { 3188 unsigned i, r; 3189 3190 if (get_cache_mode(cache) >= CM_READ_ONLY) 3191 return -EINVAL; 3192 3193 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 3194 cache->discard_nr_blocks); 3195 if (r) { 3196 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 3197 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 3198 return r; 3199 } 3200 3201 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 3202 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 3203 is_discarded(cache, to_dblock(i))); 3204 if (r) { 3205 metadata_operation_failed(cache, "dm_cache_set_discard", r); 3206 return r; 3207 } 3208 } 3209 3210 return 0; 3211 } 3212 3213 static int write_hints(struct cache *cache) 3214 { 3215 int r; 3216 3217 if (get_cache_mode(cache) >= CM_READ_ONLY) 3218 return -EINVAL; 3219 3220 r = dm_cache_write_hints(cache->cmd, cache->policy); 3221 if (r) { 3222 metadata_operation_failed(cache, "dm_cache_write_hints", r); 3223 return r; 3224 } 3225 3226 return 0; 3227 } 3228 3229 /* 3230 * returns true on success 3231 */ 3232 static bool sync_metadata(struct cache *cache) 3233 { 3234 int r1, r2, r3, r4; 3235 3236 r1 = write_dirty_bitset(cache); 3237 if (r1) 3238 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 3239 3240 r2 = write_discard_bitset(cache); 3241 if (r2) 3242 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 3243 3244 save_stats(cache); 3245 3246 r3 = write_hints(cache); 3247 if (r3) 3248 DMERR("%s: could not write hints", cache_device_name(cache)); 3249 3250 /* 3251 * If writing the above metadata failed, we still commit, but don't 3252 * set the clean shutdown flag. This will effectively force every 3253 * dirty bit to be set on reload. 
3254 */ 3255 r4 = commit(cache, !r1 && !r2 && !r3); 3256 if (r4) 3257 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 3258 3259 return !r1 && !r2 && !r3 && !r4; 3260 } 3261 3262 static void cache_postsuspend(struct dm_target *ti) 3263 { 3264 struct cache *cache = ti->private; 3265 3266 start_quiescing(cache); 3267 wait_for_migrations(cache); 3268 stop_worker(cache); 3269 requeue_deferred_bios(cache); 3270 requeue_deferred_cells(cache); 3271 stop_quiescing(cache); 3272 3273 if (get_cache_mode(cache) == CM_WRITE) 3274 (void) sync_metadata(cache); 3275 } 3276 3277 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 3278 bool dirty, uint32_t hint, bool hint_valid) 3279 { 3280 int r; 3281 struct cache *cache = context; 3282 3283 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3284 if (r) 3285 return r; 3286 3287 if (dirty) 3288 set_dirty(cache, oblock, cblock); 3289 else 3290 clear_dirty(cache, oblock, cblock); 3291 3292 return 0; 3293 } 3294 3295 /* 3296 * The discard block size in the on disk metadata is not 3297 * neccessarily the same as we're currently using. So we have to 3298 * be careful to only set the discarded attribute if we know it 3299 * covers a complete block of the new size. 3300 */ 3301 struct discard_load_info { 3302 struct cache *cache; 3303 3304 /* 3305 * These blocks are sized using the on disk dblock size, rather 3306 * than the current one. 3307 */ 3308 dm_block_t block_size; 3309 dm_block_t discard_begin, discard_end; 3310 }; 3311 3312 static void discard_load_info_init(struct cache *cache, 3313 struct discard_load_info *li) 3314 { 3315 li->cache = cache; 3316 li->discard_begin = li->discard_end = 0; 3317 } 3318 3319 static void set_discard_range(struct discard_load_info *li) 3320 { 3321 sector_t b, e; 3322 3323 if (li->discard_begin == li->discard_end) 3324 return; 3325 3326 /* 3327 * Convert to sectors. 3328 */ 3329 b = li->discard_begin * li->block_size; 3330 e = li->discard_end * li->block_size; 3331 3332 /* 3333 * Then convert back to the current dblock size. 3334 */ 3335 b = dm_sector_div_up(b, li->cache->discard_block_size); 3336 sector_div(e, li->cache->discard_block_size); 3337 3338 /* 3339 * The origin may have shrunk, so we need to check we're still in 3340 * bounds. 3341 */ 3342 if (e > from_dblock(li->cache->discard_nr_blocks)) 3343 e = from_dblock(li->cache->discard_nr_blocks); 3344 3345 for (; b < e; b++) 3346 set_discard(li->cache, to_dblock(b)); 3347 } 3348 3349 static int load_discard(void *context, sector_t discard_block_size, 3350 dm_dblock_t dblock, bool discard) 3351 { 3352 struct discard_load_info *li = context; 3353 3354 li->block_size = discard_block_size; 3355 3356 if (discard) { 3357 if (from_dblock(dblock) == li->discard_end) 3358 /* 3359 * We're already in a discard range, just extend it. 3360 */ 3361 li->discard_end = li->discard_end + 1ULL; 3362 3363 else { 3364 /* 3365 * Emit the old range and start a new one. 
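 * (set_discard_range() quietly ignores the empty case where
 * discard_begin == discard_end.)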
3366 */ 3367 set_discard_range(li); 3368 li->discard_begin = from_dblock(dblock); 3369 li->discard_end = li->discard_begin + 1ULL; 3370 } 3371 } else { 3372 set_discard_range(li); 3373 li->discard_begin = li->discard_end = 0; 3374 } 3375 3376 return 0; 3377 } 3378 3379 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3380 { 3381 sector_t size = get_dev_size(cache->cache_dev); 3382 (void) sector_div(size, cache->sectors_per_block); 3383 return to_cblock(size); 3384 } 3385 3386 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3387 { 3388 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3389 return true; 3390 3391 /* 3392 * We can't drop a dirty block when shrinking the cache. 3393 */ 3394 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3395 new_size = to_cblock(from_cblock(new_size) + 1); 3396 if (is_dirty(cache, new_size)) { 3397 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3398 cache_device_name(cache), 3399 (unsigned long long) from_cblock(new_size)); 3400 return false; 3401 } 3402 } 3403 3404 return true; 3405 } 3406 3407 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3408 { 3409 int r; 3410 3411 r = dm_cache_resize(cache->cmd, new_size); 3412 if (r) { 3413 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3414 metadata_operation_failed(cache, "dm_cache_resize", r); 3415 return r; 3416 } 3417 3418 set_cache_size(cache, new_size); 3419 3420 return 0; 3421 } 3422 3423 static int cache_preresume(struct dm_target *ti) 3424 { 3425 int r = 0; 3426 struct cache *cache = ti->private; 3427 dm_cblock_t csize = get_cache_dev_size(cache); 3428 3429 /* 3430 * Check to see if the cache has resized. 3431 */ 3432 if (!cache->sized) { 3433 r = resize_cache_dev(cache, csize); 3434 if (r) 3435 return r; 3436 3437 cache->sized = true; 3438 3439 } else if (csize != cache->cache_size) { 3440 if (!can_resize(cache, csize)) 3441 return -EINVAL; 3442 3443 r = resize_cache_dev(cache, csize); 3444 if (r) 3445 return r; 3446 } 3447 3448 if (!cache->loaded_mappings) { 3449 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3450 load_mapping, cache); 3451 if (r) { 3452 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3453 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3454 return r; 3455 } 3456 3457 cache->loaded_mappings = true; 3458 } 3459 3460 if (!cache->loaded_discards) { 3461 struct discard_load_info li; 3462 3463 /* 3464 * The discard bitset could have been resized, or the 3465 * discard block size changed. To be safe we start by 3466 * setting every dblock to not discarded. 
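 * The ranges passed to load_discard() are then re-expressed in the
 * current discard block size by set_discard_range().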
3467 */ 3468 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3469 3470 discard_load_info_init(cache, &li); 3471 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3472 if (r) { 3473 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3474 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3475 return r; 3476 } 3477 set_discard_range(&li); 3478 3479 cache->loaded_discards = true; 3480 } 3481 3482 return r; 3483 } 3484 3485 static void cache_resume(struct dm_target *ti) 3486 { 3487 struct cache *cache = ti->private; 3488 3489 cache->need_tick_bio = true; 3490 do_waker(&cache->waker.work); 3491 } 3492 3493 /* 3494 * Status format: 3495 * 3496 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3497 * <cache block size> <#used cache blocks>/<#total cache blocks> 3498 * <#read hits> <#read misses> <#write hits> <#write misses> 3499 * <#demotions> <#promotions> <#dirty> 3500 * <#features> <features>* 3501 * <#core args> <core args> 3502 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3503 */ 3504 static void cache_status(struct dm_target *ti, status_type_t type, 3505 unsigned status_flags, char *result, unsigned maxlen) 3506 { 3507 int r = 0; 3508 unsigned i; 3509 ssize_t sz = 0; 3510 dm_block_t nr_free_blocks_metadata = 0; 3511 dm_block_t nr_blocks_metadata = 0; 3512 char buf[BDEVNAME_SIZE]; 3513 struct cache *cache = ti->private; 3514 dm_cblock_t residency; 3515 bool needs_check; 3516 3517 switch (type) { 3518 case STATUSTYPE_INFO: 3519 if (get_cache_mode(cache) == CM_FAIL) { 3520 DMEMIT("Fail"); 3521 break; 3522 } 3523 3524 /* Commit to ensure statistics aren't out-of-date */ 3525 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3526 (void) commit(cache, false); 3527 3528 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3529 if (r) { 3530 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3531 cache_device_name(cache), r); 3532 goto err; 3533 } 3534 3535 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3536 if (r) { 3537 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3538 cache_device_name(cache), r); 3539 goto err; 3540 } 3541 3542 residency = policy_residency(cache->policy); 3543 3544 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", 3545 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3546 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3547 (unsigned long long)nr_blocks_metadata, 3548 (unsigned long long)cache->sectors_per_block, 3549 (unsigned long long) from_cblock(residency), 3550 (unsigned long long) from_cblock(cache->cache_size), 3551 (unsigned) atomic_read(&cache->stats.read_hit), 3552 (unsigned) atomic_read(&cache->stats.read_miss), 3553 (unsigned) atomic_read(&cache->stats.write_hit), 3554 (unsigned) atomic_read(&cache->stats.write_miss), 3555 (unsigned) atomic_read(&cache->stats.demotion), 3556 (unsigned) atomic_read(&cache->stats.promotion), 3557 (unsigned long) atomic_read(&cache->nr_dirty)); 3558 3559 if (cache->features.metadata_version == 2) 3560 DMEMIT("2 metadata2 "); 3561 else 3562 DMEMIT("1 "); 3563 3564 if (writethrough_mode(&cache->features)) 3565 DMEMIT("writethrough "); 3566 3567 else if (passthrough_mode(&cache->features)) 3568 DMEMIT("passthrough "); 3569 3570 else if (writeback_mode(&cache->features)) 3571 DMEMIT("writeback "); 3572 3573 else { 3574 DMERR("%s: internal error: unknown io mode: %d", 3575 cache_device_name(cache), 
					       (int) cache->features.io_mode);
			goto err;
		}

		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);

		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
		if (sz < maxlen) {
			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
			if (r)
				DMERR("%s: policy_emit_config_values returned %d",
				      cache_device_name(cache), r);
		}

		if (get_cache_mode(cache) == CM_READ_ONLY)
			DMEMIT("ro ");
		else
			DMEMIT("rw ");

		r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);

		if (r || needs_check)
			DMEMIT("needs_check ");
		else
			DMEMIT("- ");

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < cache->nr_ctr_args - 1; i++)
			DMEMIT(" %s", cache->ctr_args[i]);
		if (cache->nr_ctr_args)
			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
	}

	return;

err:
	DMEMIT("Error");
}

/*
 * A cache block range can take two forms:
 *
 * i) A single cblock, e.g. '3456'
 * ii) A begin and end cblock with a dash between, e.g. '123-234'
 */
static int parse_cblock_range(struct cache *cache, const char *str,
			      struct cblock_range *result)
{
	char dummy;
	uint64_t b, e;
	int r;

	/*
	 * Try and parse form (ii) first.
	 */
	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
	if (r < 0)
		return r;

	if (r == 2) {
		result->begin = to_cblock(b);
		result->end = to_cblock(e);
		return 0;
	}

	/*
	 * That didn't work, try form (i).
3651 */ 3652 r = sscanf(str, "%llu%c", &b, &dummy); 3653 if (r < 0) 3654 return r; 3655 3656 if (r == 1) { 3657 result->begin = to_cblock(b); 3658 result->end = to_cblock(from_cblock(result->begin) + 1u); 3659 return 0; 3660 } 3661 3662 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3663 return -EINVAL; 3664 } 3665 3666 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3667 { 3668 uint64_t b = from_cblock(range->begin); 3669 uint64_t e = from_cblock(range->end); 3670 uint64_t n = from_cblock(cache->cache_size); 3671 3672 if (b >= n) { 3673 DMERR("%s: begin cblock out of range: %llu >= %llu", 3674 cache_device_name(cache), b, n); 3675 return -EINVAL; 3676 } 3677 3678 if (e > n) { 3679 DMERR("%s: end cblock out of range: %llu > %llu", 3680 cache_device_name(cache), e, n); 3681 return -EINVAL; 3682 } 3683 3684 if (b >= e) { 3685 DMERR("%s: invalid cblock range: %llu >= %llu", 3686 cache_device_name(cache), b, e); 3687 return -EINVAL; 3688 } 3689 3690 return 0; 3691 } 3692 3693 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3694 { 3695 struct invalidation_request req; 3696 3697 INIT_LIST_HEAD(&req.list); 3698 req.cblocks = range; 3699 atomic_set(&req.complete, 0); 3700 req.err = 0; 3701 init_waitqueue_head(&req.result_wait); 3702 3703 spin_lock(&cache->invalidation_lock); 3704 list_add(&req.list, &cache->invalidation_requests); 3705 spin_unlock(&cache->invalidation_lock); 3706 wake_worker(cache); 3707 3708 wait_event(req.result_wait, atomic_read(&req.complete)); 3709 return req.err; 3710 } 3711 3712 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3713 const char **cblock_ranges) 3714 { 3715 int r = 0; 3716 unsigned i; 3717 struct cblock_range range; 3718 3719 if (!passthrough_mode(&cache->features)) { 3720 DMERR("%s: cache has to be in passthrough mode for invalidation", 3721 cache_device_name(cache)); 3722 return -EPERM; 3723 } 3724 3725 for (i = 0; i < count; i++) { 3726 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3727 if (r) 3728 break; 3729 3730 r = validate_cblock_range(cache, &range); 3731 if (r) 3732 break; 3733 3734 /* 3735 * Pass begin and end origin blocks to the worker and wake it. 3736 */ 3737 r = request_invalidation(cache, &range); 3738 if (r) 3739 break; 3740 } 3741 3742 return r; 3743 } 3744 3745 /* 3746 * Supports 3747 * "<key> <value>" 3748 * and 3749 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3750 * 3751 * The key migration_threshold is supported by the cache target core. 
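 *
 * For example (hypothetical device name, illustrative values):
 *
 *   dmsetup message my-cache 0 migration_threshold 2048
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 123-234
 *
 * invalidate_cblocks is only honoured while the cache is in passthrough
 * mode; otherwise -EPERM is returned.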
3752 */ 3753 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3754 { 3755 struct cache *cache = ti->private; 3756 3757 if (!argc) 3758 return -EINVAL; 3759 3760 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3761 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3762 cache_device_name(cache)); 3763 return -EOPNOTSUPP; 3764 } 3765 3766 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3767 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3768 3769 if (argc != 2) 3770 return -EINVAL; 3771 3772 return set_config_value(cache, argv[0], argv[1]); 3773 } 3774 3775 static int cache_iterate_devices(struct dm_target *ti, 3776 iterate_devices_callout_fn fn, void *data) 3777 { 3778 int r = 0; 3779 struct cache *cache = ti->private; 3780 3781 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3782 if (!r) 3783 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3784 3785 return r; 3786 } 3787 3788 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3789 { 3790 /* 3791 * FIXME: these limits may be incompatible with the cache device 3792 */ 3793 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3794 cache->origin_sectors); 3795 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3796 } 3797 3798 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3799 { 3800 struct cache *cache = ti->private; 3801 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3802 3803 /* 3804 * If the system-determined stacked limits are compatible with the 3805 * cache's blocksize (io_opt is a factor) do not override them. 3806 */ 3807 if (io_opt_sectors < cache->sectors_per_block || 3808 do_div(io_opt_sectors, cache->sectors_per_block)) { 3809 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3810 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3811 } 3812 set_discard_limits(cache, limits); 3813 } 3814 3815 /*----------------------------------------------------------------*/ 3816 3817 static struct target_type cache_target = { 3818 .name = "cache", 3819 .version = {1, 10, 0}, 3820 .module = THIS_MODULE, 3821 .ctr = cache_ctr, 3822 .dtr = cache_dtr, 3823 .map = cache_map, 3824 .end_io = cache_end_io, 3825 .postsuspend = cache_postsuspend, 3826 .preresume = cache_preresume, 3827 .resume = cache_resume, 3828 .status = cache_status, 3829 .message = cache_message, 3830 .iterate_devices = cache_iterate_devices, 3831 .io_hints = cache_io_hints, 3832 }; 3833 3834 static int __init dm_cache_init(void) 3835 { 3836 int r; 3837 3838 r = dm_register_target(&cache_target); 3839 if (r) { 3840 DMERR("cache target registration failed: %d", r); 3841 return r; 3842 } 3843 3844 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3845 if (!migration_cache) { 3846 dm_unregister_target(&cache_target); 3847 return -ENOMEM; 3848 } 3849 3850 return 0; 3851 } 3852 3853 static void __exit dm_cache_exit(void) 3854 { 3855 dm_unregister_target(&cache_target); 3856 kmem_cache_destroy(migration_cache); 3857 } 3858 3859 module_init(dm_cache_init); 3860 module_exit(dm_cache_exit); 3861 3862 MODULE_DESCRIPTION(DM_NAME " cache target"); 3863 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3864 MODULE_LICENSE("GPL"); 3865