1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/slab.h> 19 #include <linux/vmalloc.h> 20 21 #define DM_MSG_PREFIX "cache" 22 23 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 24 "A percentage of time allocated for copying to and/or from cache"); 25 26 /*----------------------------------------------------------------*/ 27 28 #define IOT_RESOLUTION 4 29 30 struct io_tracker { 31 spinlock_t lock; 32 33 /* 34 * Sectors of in-flight IO. 35 */ 36 sector_t in_flight; 37 38 /* 39 * The time, in jiffies, when this device became idle (if it is 40 * indeed idle). 41 */ 42 unsigned long idle_time; 43 unsigned long last_update_time; 44 }; 45 46 static void iot_init(struct io_tracker *iot) 47 { 48 spin_lock_init(&iot->lock); 49 iot->in_flight = 0ul; 50 iot->idle_time = 0ul; 51 iot->last_update_time = jiffies; 52 } 53 54 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 55 { 56 if (iot->in_flight) 57 return false; 58 59 return time_after(jiffies, iot->idle_time + jifs); 60 } 61 62 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 63 { 64 bool r; 65 unsigned long flags; 66 67 spin_lock_irqsave(&iot->lock, flags); 68 r = __iot_idle_for(iot, jifs); 69 spin_unlock_irqrestore(&iot->lock, flags); 70 71 return r; 72 } 73 74 static void iot_io_begin(struct io_tracker *iot, sector_t len) 75 { 76 unsigned long flags; 77 78 spin_lock_irqsave(&iot->lock, flags); 79 iot->in_flight += len; 80 spin_unlock_irqrestore(&iot->lock, flags); 81 } 82 83 static void __iot_io_end(struct io_tracker *iot, sector_t len) 84 { 85 iot->in_flight -= len; 86 if (!iot->in_flight) 87 iot->idle_time = jiffies; 88 } 89 90 static void iot_io_end(struct io_tracker *iot, sector_t len) 91 { 92 unsigned long flags; 93 94 spin_lock_irqsave(&iot->lock, flags); 95 __iot_io_end(iot, len); 96 spin_unlock_irqrestore(&iot->lock, flags); 97 } 98 99 /*----------------------------------------------------------------*/ 100 101 /* 102 * Glossary: 103 * 104 * oblock: index of an origin block 105 * cblock: index of a cache block 106 * promotion: movement of a block from origin to cache 107 * demotion: movement of a block from cache to origin 108 * migration: movement of a block between the origin and cache device, 109 * either direction 110 */ 111 112 /*----------------------------------------------------------------*/ 113 114 /* 115 * There are a couple of places where we let a bio run, but want to do some 116 * work before calling its endio function. We do this by temporarily 117 * changing the endio fn. 118 */ 119 struct dm_hook_info { 120 bio_end_io_t *bi_end_io; 121 }; 122 123 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 124 bio_end_io_t *bi_end_io, void *bi_private) 125 { 126 h->bi_end_io = bio->bi_end_io; 127 128 bio->bi_end_io = bi_end_io; 129 bio->bi_private = bi_private; 130 } 131 132 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 133 { 134 bio->bi_end_io = h->bi_end_io; 135 } 136 137 /*----------------------------------------------------------------*/ 138 139 #define MIGRATION_POOL_SIZE 128 140 #define COMMIT_PERIOD HZ 141 #define MIGRATION_COUNT_WINDOW 10 142 143 /* 144 * The block size of the device holding cache data must be 145 * between 32KB and 1GB. 146 */ 147 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 148 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 149 150 enum cache_metadata_mode { 151 CM_WRITE, /* metadata may be changed */ 152 CM_READ_ONLY, /* metadata may not be changed */ 153 CM_FAIL 154 }; 155 156 enum cache_io_mode { 157 /* 158 * Data is written to cached blocks only. These blocks are marked 159 * dirty. If you lose the cache device you will lose data. 160 * Potential performance increase for both reads and writes. 161 */ 162 CM_IO_WRITEBACK, 163 164 /* 165 * Data is written to both cache and origin. Blocks are never 166 * dirty. Potential performance benfit for reads only. 167 */ 168 CM_IO_WRITETHROUGH, 169 170 /* 171 * A degraded mode useful for various cache coherency situations 172 * (eg, rolling back snapshots). Reads and writes always go to the 173 * origin. If a write goes to a cached oblock, then the cache 174 * block is invalidated. 175 */ 176 CM_IO_PASSTHROUGH 177 }; 178 179 struct cache_features { 180 enum cache_metadata_mode mode; 181 enum cache_io_mode io_mode; 182 }; 183 184 struct cache_stats { 185 atomic_t read_hit; 186 atomic_t read_miss; 187 atomic_t write_hit; 188 atomic_t write_miss; 189 atomic_t demotion; 190 atomic_t promotion; 191 atomic_t copies_avoided; 192 atomic_t cache_cell_clash; 193 atomic_t commit_count; 194 atomic_t discard_count; 195 }; 196 197 /* 198 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 199 * the one-past-the-end value. 200 */ 201 struct cblock_range { 202 dm_cblock_t begin; 203 dm_cblock_t end; 204 }; 205 206 struct invalidation_request { 207 struct list_head list; 208 struct cblock_range *cblocks; 209 210 atomic_t complete; 211 int err; 212 213 wait_queue_head_t result_wait; 214 }; 215 216 struct cache { 217 struct dm_target *ti; 218 struct dm_target_callbacks callbacks; 219 220 struct dm_cache_metadata *cmd; 221 222 /* 223 * Metadata is written to this device. 224 */ 225 struct dm_dev *metadata_dev; 226 227 /* 228 * The slower of the two data devices. Typically a spindle. 229 */ 230 struct dm_dev *origin_dev; 231 232 /* 233 * The faster of the two data devices. Typically an SSD. 234 */ 235 struct dm_dev *cache_dev; 236 237 /* 238 * Size of the origin device in _complete_ blocks and native sectors. 239 */ 240 dm_oblock_t origin_blocks; 241 sector_t origin_sectors; 242 243 /* 244 * Size of the cache device in blocks. 245 */ 246 dm_cblock_t cache_size; 247 248 /* 249 * Fields for converting from sectors to blocks. 250 */ 251 uint32_t sectors_per_block; 252 int sectors_per_block_shift; 253 254 spinlock_t lock; 255 struct list_head deferred_cells; 256 struct bio_list deferred_bios; 257 struct bio_list deferred_flush_bios; 258 struct bio_list deferred_writethrough_bios; 259 struct list_head quiesced_migrations; 260 struct list_head completed_migrations; 261 struct list_head need_commit_migrations; 262 sector_t migration_threshold; 263 wait_queue_head_t migration_wait; 264 atomic_t nr_allocated_migrations; 265 266 /* 267 * The number of in flight migrations that are performing 268 * background io. eg, promotion, writeback. 269 */ 270 atomic_t nr_io_migrations; 271 272 wait_queue_head_t quiescing_wait; 273 atomic_t quiescing; 274 atomic_t quiescing_ack; 275 276 /* 277 * cache_size entries, dirty if set 278 */ 279 atomic_t nr_dirty; 280 unsigned long *dirty_bitset; 281 282 /* 283 * origin_blocks entries, discarded if set. 284 */ 285 dm_dblock_t discard_nr_blocks; 286 unsigned long *discard_bitset; 287 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 288 289 /* 290 * Rather than reconstructing the table line for the status we just 291 * save it and regurgitate. 292 */ 293 unsigned nr_ctr_args; 294 const char **ctr_args; 295 296 struct dm_kcopyd_client *copier; 297 struct workqueue_struct *wq; 298 struct work_struct worker; 299 300 struct delayed_work waker; 301 unsigned long last_commit_jiffies; 302 303 struct dm_bio_prison *prison; 304 struct dm_deferred_set *all_io_ds; 305 306 mempool_t *migration_pool; 307 308 struct dm_cache_policy *policy; 309 unsigned policy_nr_args; 310 311 bool need_tick_bio:1; 312 bool sized:1; 313 bool invalidate:1; 314 bool commit_requested:1; 315 bool loaded_mappings:1; 316 bool loaded_discards:1; 317 318 /* 319 * Cache features such as write-through. 320 */ 321 struct cache_features features; 322 323 struct cache_stats stats; 324 325 /* 326 * Invalidation fields. 327 */ 328 spinlock_t invalidation_lock; 329 struct list_head invalidation_requests; 330 331 struct io_tracker origin_tracker; 332 }; 333 334 struct per_bio_data { 335 bool tick:1; 336 unsigned req_nr:2; 337 struct dm_deferred_entry *all_io_entry; 338 struct dm_hook_info hook_info; 339 sector_t len; 340 341 /* 342 * writethrough fields. These MUST remain at the end of this 343 * structure and the 'cache' member must be the first as it 344 * is used to determine the offset of the writethrough fields. 345 */ 346 struct cache *cache; 347 dm_cblock_t cblock; 348 struct dm_bio_details bio_details; 349 }; 350 351 struct dm_cache_migration { 352 struct list_head list; 353 struct cache *cache; 354 355 unsigned long start_jiffies; 356 dm_oblock_t old_oblock; 357 dm_oblock_t new_oblock; 358 dm_cblock_t cblock; 359 360 bool err:1; 361 bool discard:1; 362 bool writeback:1; 363 bool demote:1; 364 bool promote:1; 365 bool requeue_holder:1; 366 bool invalidate:1; 367 368 struct dm_bio_prison_cell *old_ocell; 369 struct dm_bio_prison_cell *new_ocell; 370 }; 371 372 /* 373 * Processing a bio in the worker thread may require these memory 374 * allocations. We prealloc to avoid deadlocks (the same worker thread 375 * frees them back to the mempool). 376 */ 377 struct prealloc { 378 struct dm_cache_migration *mg; 379 struct dm_bio_prison_cell *cell1; 380 struct dm_bio_prison_cell *cell2; 381 }; 382 383 static enum cache_metadata_mode get_cache_mode(struct cache *cache); 384 385 static void wake_worker(struct cache *cache) 386 { 387 queue_work(cache->wq, &cache->worker); 388 } 389 390 /*----------------------------------------------------------------*/ 391 392 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 393 { 394 /* FIXME: change to use a local slab. */ 395 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 396 } 397 398 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 399 { 400 dm_bio_prison_free_cell(cache->prison, cell); 401 } 402 403 static struct dm_cache_migration *alloc_migration(struct cache *cache) 404 { 405 struct dm_cache_migration *mg; 406 407 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 408 if (mg) { 409 mg->cache = cache; 410 atomic_inc(&mg->cache->nr_allocated_migrations); 411 } 412 413 return mg; 414 } 415 416 static void free_migration(struct dm_cache_migration *mg) 417 { 418 struct cache *cache = mg->cache; 419 420 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 421 wake_up(&cache->migration_wait); 422 423 mempool_free(mg, cache->migration_pool); 424 } 425 426 static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 427 { 428 if (!p->mg) { 429 p->mg = alloc_migration(cache); 430 if (!p->mg) 431 return -ENOMEM; 432 } 433 434 if (!p->cell1) { 435 p->cell1 = alloc_prison_cell(cache); 436 if (!p->cell1) 437 return -ENOMEM; 438 } 439 440 if (!p->cell2) { 441 p->cell2 = alloc_prison_cell(cache); 442 if (!p->cell2) 443 return -ENOMEM; 444 } 445 446 return 0; 447 } 448 449 static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 450 { 451 if (p->cell2) 452 free_prison_cell(cache, p->cell2); 453 454 if (p->cell1) 455 free_prison_cell(cache, p->cell1); 456 457 if (p->mg) 458 free_migration(p->mg); 459 } 460 461 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 462 { 463 struct dm_cache_migration *mg = p->mg; 464 465 BUG_ON(!mg); 466 p->mg = NULL; 467 468 return mg; 469 } 470 471 /* 472 * You must have a cell within the prealloc struct to return. If not this 473 * function will BUG() rather than returning NULL. 474 */ 475 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 476 { 477 struct dm_bio_prison_cell *r = NULL; 478 479 if (p->cell1) { 480 r = p->cell1; 481 p->cell1 = NULL; 482 483 } else if (p->cell2) { 484 r = p->cell2; 485 p->cell2 = NULL; 486 } else 487 BUG(); 488 489 return r; 490 } 491 492 /* 493 * You can't have more than two cells in a prealloc struct. BUG() will be 494 * called if you try and overfill. 495 */ 496 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 497 { 498 if (!p->cell2) 499 p->cell2 = cell; 500 501 else if (!p->cell1) 502 p->cell1 = cell; 503 504 else 505 BUG(); 506 } 507 508 /*----------------------------------------------------------------*/ 509 510 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 511 { 512 key->virtual = 0; 513 key->dev = 0; 514 key->block_begin = from_oblock(begin); 515 key->block_end = from_oblock(end); 516 } 517 518 /* 519 * The caller hands in a preallocated cell, and a free function for it. 520 * The cell will be freed if there's an error, or if it wasn't used because 521 * a cell with that key already exists. 522 */ 523 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 524 525 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 526 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 527 cell_free_fn free_fn, void *free_context, 528 struct dm_bio_prison_cell **cell_result) 529 { 530 int r; 531 struct dm_cell_key key; 532 533 build_key(oblock_begin, oblock_end, &key); 534 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 535 if (r) 536 free_fn(free_context, cell_prealloc); 537 538 return r; 539 } 540 541 static int bio_detain(struct cache *cache, dm_oblock_t oblock, 542 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 543 cell_free_fn free_fn, void *free_context, 544 struct dm_bio_prison_cell **cell_result) 545 { 546 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 547 return bio_detain_range(cache, oblock, end, bio, 548 cell_prealloc, free_fn, free_context, cell_result); 549 } 550 551 static int get_cell(struct cache *cache, 552 dm_oblock_t oblock, 553 struct prealloc *structs, 554 struct dm_bio_prison_cell **cell_result) 555 { 556 int r; 557 struct dm_cell_key key; 558 struct dm_bio_prison_cell *cell_prealloc; 559 560 cell_prealloc = prealloc_get_cell(structs); 561 562 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 563 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 564 if (r) 565 prealloc_put_cell(structs, cell_prealloc); 566 567 return r; 568 } 569 570 /*----------------------------------------------------------------*/ 571 572 static bool is_dirty(struct cache *cache, dm_cblock_t b) 573 { 574 return test_bit(from_cblock(b), cache->dirty_bitset); 575 } 576 577 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 578 { 579 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 580 atomic_inc(&cache->nr_dirty); 581 policy_set_dirty(cache->policy, oblock); 582 } 583 } 584 585 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 586 { 587 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 588 policy_clear_dirty(cache->policy, oblock); 589 if (atomic_dec_return(&cache->nr_dirty) == 0) 590 dm_table_event(cache->ti->table); 591 } 592 } 593 594 /*----------------------------------------------------------------*/ 595 596 static bool block_size_is_power_of_two(struct cache *cache) 597 { 598 return cache->sectors_per_block_shift >= 0; 599 } 600 601 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 602 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 603 __always_inline 604 #endif 605 static dm_block_t block_div(dm_block_t b, uint32_t n) 606 { 607 do_div(b, n); 608 609 return b; 610 } 611 612 static dm_block_t oblocks_per_dblock(struct cache *cache) 613 { 614 dm_block_t oblocks = cache->discard_block_size; 615 616 if (block_size_is_power_of_two(cache)) 617 oblocks >>= cache->sectors_per_block_shift; 618 else 619 oblocks = block_div(oblocks, cache->sectors_per_block); 620 621 return oblocks; 622 } 623 624 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 625 { 626 return to_dblock(block_div(from_oblock(oblock), 627 oblocks_per_dblock(cache))); 628 } 629 630 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) 631 { 632 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); 633 } 634 635 static void set_discard(struct cache *cache, dm_dblock_t b) 636 { 637 unsigned long flags; 638 639 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 640 atomic_inc(&cache->stats.discard_count); 641 642 spin_lock_irqsave(&cache->lock, flags); 643 set_bit(from_dblock(b), cache->discard_bitset); 644 spin_unlock_irqrestore(&cache->lock, flags); 645 } 646 647 static void clear_discard(struct cache *cache, dm_dblock_t b) 648 { 649 unsigned long flags; 650 651 spin_lock_irqsave(&cache->lock, flags); 652 clear_bit(from_dblock(b), cache->discard_bitset); 653 spin_unlock_irqrestore(&cache->lock, flags); 654 } 655 656 static bool is_discarded(struct cache *cache, dm_dblock_t b) 657 { 658 int r; 659 unsigned long flags; 660 661 spin_lock_irqsave(&cache->lock, flags); 662 r = test_bit(from_dblock(b), cache->discard_bitset); 663 spin_unlock_irqrestore(&cache->lock, flags); 664 665 return r; 666 } 667 668 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 669 { 670 int r; 671 unsigned long flags; 672 673 spin_lock_irqsave(&cache->lock, flags); 674 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 675 cache->discard_bitset); 676 spin_unlock_irqrestore(&cache->lock, flags); 677 678 return r; 679 } 680 681 /*----------------------------------------------------------------*/ 682 683 static void load_stats(struct cache *cache) 684 { 685 struct dm_cache_statistics stats; 686 687 dm_cache_metadata_get_stats(cache->cmd, &stats); 688 atomic_set(&cache->stats.read_hit, stats.read_hits); 689 atomic_set(&cache->stats.read_miss, stats.read_misses); 690 atomic_set(&cache->stats.write_hit, stats.write_hits); 691 atomic_set(&cache->stats.write_miss, stats.write_misses); 692 } 693 694 static void save_stats(struct cache *cache) 695 { 696 struct dm_cache_statistics stats; 697 698 if (get_cache_mode(cache) >= CM_READ_ONLY) 699 return; 700 701 stats.read_hits = atomic_read(&cache->stats.read_hit); 702 stats.read_misses = atomic_read(&cache->stats.read_miss); 703 stats.write_hits = atomic_read(&cache->stats.write_hit); 704 stats.write_misses = atomic_read(&cache->stats.write_miss); 705 706 dm_cache_metadata_set_stats(cache->cmd, &stats); 707 } 708 709 /*---------------------------------------------------------------- 710 * Per bio data 711 *--------------------------------------------------------------*/ 712 713 /* 714 * If using writeback, leave out struct per_bio_data's writethrough fields. 715 */ 716 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 717 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 718 719 static bool writethrough_mode(struct cache_features *f) 720 { 721 return f->io_mode == CM_IO_WRITETHROUGH; 722 } 723 724 static bool writeback_mode(struct cache_features *f) 725 { 726 return f->io_mode == CM_IO_WRITEBACK; 727 } 728 729 static bool passthrough_mode(struct cache_features *f) 730 { 731 return f->io_mode == CM_IO_PASSTHROUGH; 732 } 733 734 static size_t get_per_bio_data_size(struct cache *cache) 735 { 736 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 737 } 738 739 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 740 { 741 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 742 BUG_ON(!pb); 743 return pb; 744 } 745 746 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 747 { 748 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 749 750 pb->tick = false; 751 pb->req_nr = dm_bio_get_target_bio_nr(bio); 752 pb->all_io_entry = NULL; 753 pb->len = 0; 754 755 return pb; 756 } 757 758 /*---------------------------------------------------------------- 759 * Remapping 760 *--------------------------------------------------------------*/ 761 static void remap_to_origin(struct cache *cache, struct bio *bio) 762 { 763 bio->bi_bdev = cache->origin_dev->bdev; 764 } 765 766 static void remap_to_cache(struct cache *cache, struct bio *bio, 767 dm_cblock_t cblock) 768 { 769 sector_t bi_sector = bio->bi_iter.bi_sector; 770 sector_t block = from_cblock(cblock); 771 772 bio->bi_bdev = cache->cache_dev->bdev; 773 if (!block_size_is_power_of_two(cache)) 774 bio->bi_iter.bi_sector = 775 (block * cache->sectors_per_block) + 776 sector_div(bi_sector, cache->sectors_per_block); 777 else 778 bio->bi_iter.bi_sector = 779 (block << cache->sectors_per_block_shift) | 780 (bi_sector & (cache->sectors_per_block - 1)); 781 } 782 783 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 784 { 785 unsigned long flags; 786 size_t pb_data_size = get_per_bio_data_size(cache); 787 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 788 789 spin_lock_irqsave(&cache->lock, flags); 790 if (cache->need_tick_bio && 791 !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) && 792 bio_op(bio) != REQ_OP_DISCARD) { 793 pb->tick = true; 794 cache->need_tick_bio = false; 795 } 796 spin_unlock_irqrestore(&cache->lock, flags); 797 } 798 799 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 800 dm_oblock_t oblock) 801 { 802 check_if_tick_bio_needed(cache, bio); 803 remap_to_origin(cache, bio); 804 if (bio_data_dir(bio) == WRITE) 805 clear_discard(cache, oblock_to_dblock(cache, oblock)); 806 } 807 808 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 809 dm_oblock_t oblock, dm_cblock_t cblock) 810 { 811 check_if_tick_bio_needed(cache, bio); 812 remap_to_cache(cache, bio, cblock); 813 if (bio_data_dir(bio) == WRITE) { 814 set_dirty(cache, oblock, cblock); 815 clear_discard(cache, oblock_to_dblock(cache, oblock)); 816 } 817 } 818 819 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 820 { 821 sector_t block_nr = bio->bi_iter.bi_sector; 822 823 if (!block_size_is_power_of_two(cache)) 824 (void) sector_div(block_nr, cache->sectors_per_block); 825 else 826 block_nr >>= cache->sectors_per_block_shift; 827 828 return to_oblock(block_nr); 829 } 830 831 static int bio_triggers_commit(struct cache *cache, struct bio *bio) 832 { 833 return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); 834 } 835 836 /* 837 * You must increment the deferred set whilst the prison cell is held. To 838 * encourage this, we ask for 'cell' to be passed in. 839 */ 840 static void inc_ds(struct cache *cache, struct bio *bio, 841 struct dm_bio_prison_cell *cell) 842 { 843 size_t pb_data_size = get_per_bio_data_size(cache); 844 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 845 846 BUG_ON(!cell); 847 BUG_ON(pb->all_io_entry); 848 849 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 850 } 851 852 static bool accountable_bio(struct cache *cache, struct bio *bio) 853 { 854 return ((bio->bi_bdev == cache->origin_dev->bdev) && 855 bio_op(bio) != REQ_OP_DISCARD); 856 } 857 858 static void accounted_begin(struct cache *cache, struct bio *bio) 859 { 860 size_t pb_data_size = get_per_bio_data_size(cache); 861 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 862 863 if (accountable_bio(cache, bio)) { 864 pb->len = bio_sectors(bio); 865 iot_io_begin(&cache->origin_tracker, pb->len); 866 } 867 } 868 869 static void accounted_complete(struct cache *cache, struct bio *bio) 870 { 871 size_t pb_data_size = get_per_bio_data_size(cache); 872 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 873 874 iot_io_end(&cache->origin_tracker, pb->len); 875 } 876 877 static void accounted_request(struct cache *cache, struct bio *bio) 878 { 879 accounted_begin(cache, bio); 880 generic_make_request(bio); 881 } 882 883 static void issue(struct cache *cache, struct bio *bio) 884 { 885 unsigned long flags; 886 887 if (!bio_triggers_commit(cache, bio)) { 888 accounted_request(cache, bio); 889 return; 890 } 891 892 /* 893 * Batch together any bios that trigger commits and then issue a 894 * single commit for them in do_worker(). 895 */ 896 spin_lock_irqsave(&cache->lock, flags); 897 cache->commit_requested = true; 898 bio_list_add(&cache->deferred_flush_bios, bio); 899 spin_unlock_irqrestore(&cache->lock, flags); 900 } 901 902 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) 903 { 904 inc_ds(cache, bio, cell); 905 issue(cache, bio); 906 } 907 908 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 909 { 910 unsigned long flags; 911 912 spin_lock_irqsave(&cache->lock, flags); 913 bio_list_add(&cache->deferred_writethrough_bios, bio); 914 spin_unlock_irqrestore(&cache->lock, flags); 915 916 wake_worker(cache); 917 } 918 919 static void writethrough_endio(struct bio *bio) 920 { 921 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 922 923 dm_unhook_bio(&pb->hook_info, bio); 924 925 if (bio->bi_error) { 926 bio_endio(bio); 927 return; 928 } 929 930 dm_bio_restore(&pb->bio_details, bio); 931 remap_to_cache(pb->cache, bio, pb->cblock); 932 933 /* 934 * We can't issue this bio directly, since we're in interrupt 935 * context. So it gets put on a bio list for processing by the 936 * worker thread. 937 */ 938 defer_writethrough_bio(pb->cache, bio); 939 } 940 941 /* 942 * When running in writethrough mode we need to send writes to clean blocks 943 * to both the cache and origin devices. In future we'd like to clone the 944 * bio and send them in parallel, but for now we're doing them in 945 * series as this is easier. 946 */ 947 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 948 dm_oblock_t oblock, dm_cblock_t cblock) 949 { 950 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 951 952 pb->cache = cache; 953 pb->cblock = cblock; 954 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); 955 dm_bio_record(&pb->bio_details, bio); 956 957 remap_to_origin_clear_discard(pb->cache, bio, oblock); 958 } 959 960 /*---------------------------------------------------------------- 961 * Failure modes 962 *--------------------------------------------------------------*/ 963 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 964 { 965 return cache->features.mode; 966 } 967 968 static const char *cache_device_name(struct cache *cache) 969 { 970 return dm_device_name(dm_table_get_md(cache->ti->table)); 971 } 972 973 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 974 { 975 const char *descs[] = { 976 "write", 977 "read-only", 978 "fail" 979 }; 980 981 dm_table_event(cache->ti->table); 982 DMINFO("%s: switching cache to %s mode", 983 cache_device_name(cache), descs[(int)mode]); 984 } 985 986 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 987 { 988 bool needs_check; 989 enum cache_metadata_mode old_mode = get_cache_mode(cache); 990 991 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { 992 DMERR("%s: unable to read needs_check flag, setting failure mode.", 993 cache_device_name(cache)); 994 new_mode = CM_FAIL; 995 } 996 997 if (new_mode == CM_WRITE && needs_check) { 998 DMERR("%s: unable to switch cache to write mode until repaired.", 999 cache_device_name(cache)); 1000 if (old_mode != new_mode) 1001 new_mode = old_mode; 1002 else 1003 new_mode = CM_READ_ONLY; 1004 } 1005 1006 /* Never move out of fail mode */ 1007 if (old_mode == CM_FAIL) 1008 new_mode = CM_FAIL; 1009 1010 switch (new_mode) { 1011 case CM_FAIL: 1012 case CM_READ_ONLY: 1013 dm_cache_metadata_set_read_only(cache->cmd); 1014 break; 1015 1016 case CM_WRITE: 1017 dm_cache_metadata_set_read_write(cache->cmd); 1018 break; 1019 } 1020 1021 cache->features.mode = new_mode; 1022 1023 if (new_mode != old_mode) 1024 notify_mode_switch(cache, new_mode); 1025 } 1026 1027 static void abort_transaction(struct cache *cache) 1028 { 1029 const char *dev_name = cache_device_name(cache); 1030 1031 if (get_cache_mode(cache) >= CM_READ_ONLY) 1032 return; 1033 1034 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1035 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1036 set_cache_mode(cache, CM_FAIL); 1037 } 1038 1039 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1040 if (dm_cache_metadata_abort(cache->cmd)) { 1041 DMERR("%s: failed to abort metadata transaction", dev_name); 1042 set_cache_mode(cache, CM_FAIL); 1043 } 1044 } 1045 1046 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1047 { 1048 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1049 cache_device_name(cache), op, r); 1050 abort_transaction(cache); 1051 set_cache_mode(cache, CM_READ_ONLY); 1052 } 1053 1054 /*---------------------------------------------------------------- 1055 * Migration processing 1056 * 1057 * Migration covers moving data from the origin device to the cache, or 1058 * vice versa. 1059 *--------------------------------------------------------------*/ 1060 static void inc_io_migrations(struct cache *cache) 1061 { 1062 atomic_inc(&cache->nr_io_migrations); 1063 } 1064 1065 static void dec_io_migrations(struct cache *cache) 1066 { 1067 atomic_dec(&cache->nr_io_migrations); 1068 } 1069 1070 static bool discard_or_flush(struct bio *bio) 1071 { 1072 return bio_op(bio) == REQ_OP_DISCARD || 1073 bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); 1074 } 1075 1076 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1077 { 1078 if (discard_or_flush(cell->holder)) { 1079 /* 1080 * We have to handle these bios individually. 1081 */ 1082 dm_cell_release(cache->prison, cell, &cache->deferred_bios); 1083 free_prison_cell(cache, cell); 1084 } else 1085 list_add_tail(&cell->user_list, &cache->deferred_cells); 1086 } 1087 1088 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) 1089 { 1090 unsigned long flags; 1091 1092 if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { 1093 /* 1094 * There was no prisoner to promote to holder, the 1095 * cell has been released. 1096 */ 1097 free_prison_cell(cache, cell); 1098 return; 1099 } 1100 1101 spin_lock_irqsave(&cache->lock, flags); 1102 __cell_defer(cache, cell); 1103 spin_unlock_irqrestore(&cache->lock, flags); 1104 1105 wake_worker(cache); 1106 } 1107 1108 static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1109 { 1110 dm_cell_error(cache->prison, cell, err); 1111 free_prison_cell(cache, cell); 1112 } 1113 1114 static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1115 { 1116 cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1117 } 1118 1119 static void free_io_migration(struct dm_cache_migration *mg) 1120 { 1121 struct cache *cache = mg->cache; 1122 1123 dec_io_migrations(cache); 1124 free_migration(mg); 1125 wake_worker(cache); 1126 } 1127 1128 static void migration_failure(struct dm_cache_migration *mg) 1129 { 1130 struct cache *cache = mg->cache; 1131 const char *dev_name = cache_device_name(cache); 1132 1133 if (mg->writeback) { 1134 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); 1135 set_dirty(cache, mg->old_oblock, mg->cblock); 1136 cell_defer(cache, mg->old_ocell, false); 1137 1138 } else if (mg->demote) { 1139 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); 1140 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 1141 1142 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1143 if (mg->promote) 1144 cell_defer(cache, mg->new_ocell, true); 1145 } else { 1146 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); 1147 policy_remove_mapping(cache->policy, mg->new_oblock); 1148 cell_defer(cache, mg->new_ocell, true); 1149 } 1150 1151 free_io_migration(mg); 1152 } 1153 1154 static void migration_success_pre_commit(struct dm_cache_migration *mg) 1155 { 1156 int r; 1157 unsigned long flags; 1158 struct cache *cache = mg->cache; 1159 1160 if (mg->writeback) { 1161 clear_dirty(cache, mg->old_oblock, mg->cblock); 1162 cell_defer(cache, mg->old_ocell, false); 1163 free_io_migration(mg); 1164 return; 1165 1166 } else if (mg->demote) { 1167 r = dm_cache_remove_mapping(cache->cmd, mg->cblock); 1168 if (r) { 1169 DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", 1170 cache_device_name(cache)); 1171 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1172 policy_force_mapping(cache->policy, mg->new_oblock, 1173 mg->old_oblock); 1174 if (mg->promote) 1175 cell_defer(cache, mg->new_ocell, true); 1176 free_io_migration(mg); 1177 return; 1178 } 1179 } else { 1180 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); 1181 if (r) { 1182 DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", 1183 cache_device_name(cache)); 1184 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1185 policy_remove_mapping(cache->policy, mg->new_oblock); 1186 free_io_migration(mg); 1187 return; 1188 } 1189 } 1190 1191 spin_lock_irqsave(&cache->lock, flags); 1192 list_add_tail(&mg->list, &cache->need_commit_migrations); 1193 cache->commit_requested = true; 1194 spin_unlock_irqrestore(&cache->lock, flags); 1195 } 1196 1197 static void migration_success_post_commit(struct dm_cache_migration *mg) 1198 { 1199 unsigned long flags; 1200 struct cache *cache = mg->cache; 1201 1202 if (mg->writeback) { 1203 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", 1204 cache_device_name(cache)); 1205 return; 1206 1207 } else if (mg->demote) { 1208 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1209 1210 if (mg->promote) { 1211 mg->demote = false; 1212 1213 spin_lock_irqsave(&cache->lock, flags); 1214 list_add_tail(&mg->list, &cache->quiesced_migrations); 1215 spin_unlock_irqrestore(&cache->lock, flags); 1216 1217 } else { 1218 if (mg->invalidate) 1219 policy_remove_mapping(cache->policy, mg->old_oblock); 1220 free_io_migration(mg); 1221 } 1222 1223 } else { 1224 if (mg->requeue_holder) { 1225 clear_dirty(cache, mg->new_oblock, mg->cblock); 1226 cell_defer(cache, mg->new_ocell, true); 1227 } else { 1228 /* 1229 * The block was promoted via an overwrite, so it's dirty. 1230 */ 1231 set_dirty(cache, mg->new_oblock, mg->cblock); 1232 bio_endio(mg->new_ocell->holder); 1233 cell_defer(cache, mg->new_ocell, false); 1234 } 1235 free_io_migration(mg); 1236 } 1237 } 1238 1239 static void copy_complete(int read_err, unsigned long write_err, void *context) 1240 { 1241 unsigned long flags; 1242 struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 1243 struct cache *cache = mg->cache; 1244 1245 if (read_err || write_err) 1246 mg->err = true; 1247 1248 spin_lock_irqsave(&cache->lock, flags); 1249 list_add_tail(&mg->list, &cache->completed_migrations); 1250 spin_unlock_irqrestore(&cache->lock, flags); 1251 1252 wake_worker(cache); 1253 } 1254 1255 static void issue_copy(struct dm_cache_migration *mg) 1256 { 1257 int r; 1258 struct dm_io_region o_region, c_region; 1259 struct cache *cache = mg->cache; 1260 sector_t cblock = from_cblock(mg->cblock); 1261 1262 o_region.bdev = cache->origin_dev->bdev; 1263 o_region.count = cache->sectors_per_block; 1264 1265 c_region.bdev = cache->cache_dev->bdev; 1266 c_region.sector = cblock * cache->sectors_per_block; 1267 c_region.count = cache->sectors_per_block; 1268 1269 if (mg->writeback || mg->demote) { 1270 /* demote */ 1271 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1272 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1273 } else { 1274 /* promote */ 1275 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 1276 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 1277 } 1278 1279 if (r < 0) { 1280 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1281 migration_failure(mg); 1282 } 1283 } 1284 1285 static void overwrite_endio(struct bio *bio) 1286 { 1287 struct dm_cache_migration *mg = bio->bi_private; 1288 struct cache *cache = mg->cache; 1289 size_t pb_data_size = get_per_bio_data_size(cache); 1290 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1291 unsigned long flags; 1292 1293 dm_unhook_bio(&pb->hook_info, bio); 1294 1295 if (bio->bi_error) 1296 mg->err = true; 1297 1298 mg->requeue_holder = false; 1299 1300 spin_lock_irqsave(&cache->lock, flags); 1301 list_add_tail(&mg->list, &cache->completed_migrations); 1302 spin_unlock_irqrestore(&cache->lock, flags); 1303 1304 wake_worker(cache); 1305 } 1306 1307 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1308 { 1309 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1310 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1311 1312 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1313 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); 1314 1315 /* 1316 * No need to inc_ds() here, since the cell will be held for the 1317 * duration of the io. 1318 */ 1319 accounted_request(mg->cache, bio); 1320 } 1321 1322 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1323 { 1324 return (bio_data_dir(bio) == WRITE) && 1325 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1326 } 1327 1328 static void avoid_copy(struct dm_cache_migration *mg) 1329 { 1330 atomic_inc(&mg->cache->stats.copies_avoided); 1331 migration_success_pre_commit(mg); 1332 } 1333 1334 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1335 dm_dblock_t *b, dm_dblock_t *e) 1336 { 1337 sector_t sb = bio->bi_iter.bi_sector; 1338 sector_t se = bio_end_sector(bio); 1339 1340 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1341 1342 if (se - sb < cache->discard_block_size) 1343 *e = *b; 1344 else 1345 *e = to_dblock(block_div(se, cache->discard_block_size)); 1346 } 1347 1348 static void issue_discard(struct dm_cache_migration *mg) 1349 { 1350 dm_dblock_t b, e; 1351 struct bio *bio = mg->new_ocell->holder; 1352 struct cache *cache = mg->cache; 1353 1354 calc_discard_block_range(cache, bio, &b, &e); 1355 while (b != e) { 1356 set_discard(cache, b); 1357 b = to_dblock(from_dblock(b) + 1); 1358 } 1359 1360 bio_endio(bio); 1361 cell_defer(cache, mg->new_ocell, false); 1362 free_migration(mg); 1363 wake_worker(cache); 1364 } 1365 1366 static void issue_copy_or_discard(struct dm_cache_migration *mg) 1367 { 1368 bool avoid; 1369 struct cache *cache = mg->cache; 1370 1371 if (mg->discard) { 1372 issue_discard(mg); 1373 return; 1374 } 1375 1376 if (mg->writeback || mg->demote) 1377 avoid = !is_dirty(cache, mg->cblock) || 1378 is_discarded_oblock(cache, mg->old_oblock); 1379 else { 1380 struct bio *bio = mg->new_ocell->holder; 1381 1382 avoid = is_discarded_oblock(cache, mg->new_oblock); 1383 1384 if (writeback_mode(&cache->features) && 1385 !avoid && bio_writes_complete_block(cache, bio)) { 1386 issue_overwrite(mg, bio); 1387 return; 1388 } 1389 } 1390 1391 avoid ? avoid_copy(mg) : issue_copy(mg); 1392 } 1393 1394 static void complete_migration(struct dm_cache_migration *mg) 1395 { 1396 if (mg->err) 1397 migration_failure(mg); 1398 else 1399 migration_success_pre_commit(mg); 1400 } 1401 1402 static void process_migrations(struct cache *cache, struct list_head *head, 1403 void (*fn)(struct dm_cache_migration *)) 1404 { 1405 unsigned long flags; 1406 struct list_head list; 1407 struct dm_cache_migration *mg, *tmp; 1408 1409 INIT_LIST_HEAD(&list); 1410 spin_lock_irqsave(&cache->lock, flags); 1411 list_splice_init(head, &list); 1412 spin_unlock_irqrestore(&cache->lock, flags); 1413 1414 list_for_each_entry_safe(mg, tmp, &list, list) 1415 fn(mg); 1416 } 1417 1418 static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1419 { 1420 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1421 } 1422 1423 static void queue_quiesced_migration(struct dm_cache_migration *mg) 1424 { 1425 unsigned long flags; 1426 struct cache *cache = mg->cache; 1427 1428 spin_lock_irqsave(&cache->lock, flags); 1429 __queue_quiesced_migration(mg); 1430 spin_unlock_irqrestore(&cache->lock, flags); 1431 1432 wake_worker(cache); 1433 } 1434 1435 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1436 { 1437 unsigned long flags; 1438 struct dm_cache_migration *mg, *tmp; 1439 1440 spin_lock_irqsave(&cache->lock, flags); 1441 list_for_each_entry_safe(mg, tmp, work, list) 1442 __queue_quiesced_migration(mg); 1443 spin_unlock_irqrestore(&cache->lock, flags); 1444 1445 wake_worker(cache); 1446 } 1447 1448 static void check_for_quiesced_migrations(struct cache *cache, 1449 struct per_bio_data *pb) 1450 { 1451 struct list_head work; 1452 1453 if (!pb->all_io_entry) 1454 return; 1455 1456 INIT_LIST_HEAD(&work); 1457 dm_deferred_entry_dec(pb->all_io_entry, &work); 1458 1459 if (!list_empty(&work)) 1460 queue_quiesced_migrations(cache, &work); 1461 } 1462 1463 static void quiesce_migration(struct dm_cache_migration *mg) 1464 { 1465 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 1466 queue_quiesced_migration(mg); 1467 } 1468 1469 static void promote(struct cache *cache, struct prealloc *structs, 1470 dm_oblock_t oblock, dm_cblock_t cblock, 1471 struct dm_bio_prison_cell *cell) 1472 { 1473 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1474 1475 mg->err = false; 1476 mg->discard = false; 1477 mg->writeback = false; 1478 mg->demote = false; 1479 mg->promote = true; 1480 mg->requeue_holder = true; 1481 mg->invalidate = false; 1482 mg->cache = cache; 1483 mg->new_oblock = oblock; 1484 mg->cblock = cblock; 1485 mg->old_ocell = NULL; 1486 mg->new_ocell = cell; 1487 mg->start_jiffies = jiffies; 1488 1489 inc_io_migrations(cache); 1490 quiesce_migration(mg); 1491 } 1492 1493 static void writeback(struct cache *cache, struct prealloc *structs, 1494 dm_oblock_t oblock, dm_cblock_t cblock, 1495 struct dm_bio_prison_cell *cell) 1496 { 1497 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1498 1499 mg->err = false; 1500 mg->discard = false; 1501 mg->writeback = true; 1502 mg->demote = false; 1503 mg->promote = false; 1504 mg->requeue_holder = true; 1505 mg->invalidate = false; 1506 mg->cache = cache; 1507 mg->old_oblock = oblock; 1508 mg->cblock = cblock; 1509 mg->old_ocell = cell; 1510 mg->new_ocell = NULL; 1511 mg->start_jiffies = jiffies; 1512 1513 inc_io_migrations(cache); 1514 quiesce_migration(mg); 1515 } 1516 1517 static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1518 dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1519 dm_cblock_t cblock, 1520 struct dm_bio_prison_cell *old_ocell, 1521 struct dm_bio_prison_cell *new_ocell) 1522 { 1523 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1524 1525 mg->err = false; 1526 mg->discard = false; 1527 mg->writeback = false; 1528 mg->demote = true; 1529 mg->promote = true; 1530 mg->requeue_holder = true; 1531 mg->invalidate = false; 1532 mg->cache = cache; 1533 mg->old_oblock = old_oblock; 1534 mg->new_oblock = new_oblock; 1535 mg->cblock = cblock; 1536 mg->old_ocell = old_ocell; 1537 mg->new_ocell = new_ocell; 1538 mg->start_jiffies = jiffies; 1539 1540 inc_io_migrations(cache); 1541 quiesce_migration(mg); 1542 } 1543 1544 /* 1545 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1546 * block are thrown away. 1547 */ 1548 static void invalidate(struct cache *cache, struct prealloc *structs, 1549 dm_oblock_t oblock, dm_cblock_t cblock, 1550 struct dm_bio_prison_cell *cell) 1551 { 1552 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1553 1554 mg->err = false; 1555 mg->discard = false; 1556 mg->writeback = false; 1557 mg->demote = true; 1558 mg->promote = false; 1559 mg->requeue_holder = true; 1560 mg->invalidate = true; 1561 mg->cache = cache; 1562 mg->old_oblock = oblock; 1563 mg->cblock = cblock; 1564 mg->old_ocell = cell; 1565 mg->new_ocell = NULL; 1566 mg->start_jiffies = jiffies; 1567 1568 inc_io_migrations(cache); 1569 quiesce_migration(mg); 1570 } 1571 1572 static void discard(struct cache *cache, struct prealloc *structs, 1573 struct dm_bio_prison_cell *cell) 1574 { 1575 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1576 1577 mg->err = false; 1578 mg->discard = true; 1579 mg->writeback = false; 1580 mg->demote = false; 1581 mg->promote = false; 1582 mg->requeue_holder = false; 1583 mg->invalidate = false; 1584 mg->cache = cache; 1585 mg->old_ocell = NULL; 1586 mg->new_ocell = cell; 1587 mg->start_jiffies = jiffies; 1588 1589 quiesce_migration(mg); 1590 } 1591 1592 /*---------------------------------------------------------------- 1593 * bio processing 1594 *--------------------------------------------------------------*/ 1595 static void defer_bio(struct cache *cache, struct bio *bio) 1596 { 1597 unsigned long flags; 1598 1599 spin_lock_irqsave(&cache->lock, flags); 1600 bio_list_add(&cache->deferred_bios, bio); 1601 spin_unlock_irqrestore(&cache->lock, flags); 1602 1603 wake_worker(cache); 1604 } 1605 1606 static void process_flush_bio(struct cache *cache, struct bio *bio) 1607 { 1608 size_t pb_data_size = get_per_bio_data_size(cache); 1609 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1610 1611 BUG_ON(bio->bi_iter.bi_size); 1612 if (!pb->req_nr) 1613 remap_to_origin(cache, bio); 1614 else 1615 remap_to_cache(cache, bio, 0); 1616 1617 /* 1618 * REQ_PREFLUSH is not directed at any particular block so we don't 1619 * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH 1620 * by dm-core. 1621 */ 1622 issue(cache, bio); 1623 } 1624 1625 static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1626 struct bio *bio) 1627 { 1628 int r; 1629 dm_dblock_t b, e; 1630 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1631 1632 calc_discard_block_range(cache, bio, &b, &e); 1633 if (b == e) { 1634 bio_endio(bio); 1635 return; 1636 } 1637 1638 cell_prealloc = prealloc_get_cell(structs); 1639 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1640 (cell_free_fn) prealloc_put_cell, 1641 structs, &new_ocell); 1642 if (r > 0) 1643 return; 1644 1645 discard(cache, structs, new_ocell); 1646 } 1647 1648 static bool spare_migration_bandwidth(struct cache *cache) 1649 { 1650 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1651 cache->sectors_per_block; 1652 return current_volume < cache->migration_threshold; 1653 } 1654 1655 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1656 { 1657 atomic_inc(bio_data_dir(bio) == READ ? 1658 &cache->stats.read_hit : &cache->stats.write_hit); 1659 } 1660 1661 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1662 { 1663 atomic_inc(bio_data_dir(bio) == READ ? 1664 &cache->stats.read_miss : &cache->stats.write_miss); 1665 } 1666 1667 /*----------------------------------------------------------------*/ 1668 1669 struct inc_detail { 1670 struct cache *cache; 1671 struct bio_list bios_for_issue; 1672 struct bio_list unhandled_bios; 1673 bool any_writes; 1674 }; 1675 1676 static void inc_fn(void *context, struct dm_bio_prison_cell *cell) 1677 { 1678 struct bio *bio; 1679 struct inc_detail *detail = context; 1680 struct cache *cache = detail->cache; 1681 1682 inc_ds(cache, cell->holder, cell); 1683 if (bio_data_dir(cell->holder) == WRITE) 1684 detail->any_writes = true; 1685 1686 while ((bio = bio_list_pop(&cell->bios))) { 1687 if (discard_or_flush(bio)) { 1688 bio_list_add(&detail->unhandled_bios, bio); 1689 continue; 1690 } 1691 1692 if (bio_data_dir(bio) == WRITE) 1693 detail->any_writes = true; 1694 1695 bio_list_add(&detail->bios_for_issue, bio); 1696 inc_ds(cache, bio, cell); 1697 } 1698 } 1699 1700 // FIXME: refactor these two 1701 static void remap_cell_to_origin_clear_discard(struct cache *cache, 1702 struct dm_bio_prison_cell *cell, 1703 dm_oblock_t oblock, bool issue_holder) 1704 { 1705 struct bio *bio; 1706 unsigned long flags; 1707 struct inc_detail detail; 1708 1709 detail.cache = cache; 1710 bio_list_init(&detail.bios_for_issue); 1711 bio_list_init(&detail.unhandled_bios); 1712 detail.any_writes = false; 1713 1714 spin_lock_irqsave(&cache->lock, flags); 1715 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1716 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1717 spin_unlock_irqrestore(&cache->lock, flags); 1718 1719 remap_to_origin(cache, cell->holder); 1720 if (issue_holder) 1721 issue(cache, cell->holder); 1722 else 1723 accounted_begin(cache, cell->holder); 1724 1725 if (detail.any_writes) 1726 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1727 1728 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1729 remap_to_origin(cache, bio); 1730 issue(cache, bio); 1731 } 1732 1733 free_prison_cell(cache, cell); 1734 } 1735 1736 static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1737 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1738 { 1739 struct bio *bio; 1740 unsigned long flags; 1741 struct inc_detail detail; 1742 1743 detail.cache = cache; 1744 bio_list_init(&detail.bios_for_issue); 1745 bio_list_init(&detail.unhandled_bios); 1746 detail.any_writes = false; 1747 1748 spin_lock_irqsave(&cache->lock, flags); 1749 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1750 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1751 spin_unlock_irqrestore(&cache->lock, flags); 1752 1753 remap_to_cache(cache, cell->holder, cblock); 1754 if (issue_holder) 1755 issue(cache, cell->holder); 1756 else 1757 accounted_begin(cache, cell->holder); 1758 1759 if (detail.any_writes) { 1760 set_dirty(cache, oblock, cblock); 1761 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1762 } 1763 1764 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1765 remap_to_cache(cache, bio, cblock); 1766 issue(cache, bio); 1767 } 1768 1769 free_prison_cell(cache, cell); 1770 } 1771 1772 /*----------------------------------------------------------------*/ 1773 1774 struct old_oblock_lock { 1775 struct policy_locker locker; 1776 struct cache *cache; 1777 struct prealloc *structs; 1778 struct dm_bio_prison_cell *cell; 1779 }; 1780 1781 static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1782 { 1783 /* This should never be called */ 1784 BUG(); 1785 return 0; 1786 } 1787 1788 static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1789 { 1790 struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1791 struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1792 1793 return bio_detain(l->cache, b, NULL, cell_prealloc, 1794 (cell_free_fn) prealloc_put_cell, 1795 l->structs, &l->cell); 1796 } 1797 1798 static void process_cell(struct cache *cache, struct prealloc *structs, 1799 struct dm_bio_prison_cell *new_ocell) 1800 { 1801 int r; 1802 bool release_cell = true; 1803 struct bio *bio = new_ocell->holder; 1804 dm_oblock_t block = get_bio_block(cache, bio); 1805 struct policy_result lookup_result; 1806 bool passthrough = passthrough_mode(&cache->features); 1807 bool fast_promotion, can_migrate; 1808 struct old_oblock_lock ool; 1809 1810 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 1811 can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); 1812 1813 ool.locker.fn = cell_locker; 1814 ool.cache = cache; 1815 ool.structs = structs; 1816 ool.cell = NULL; 1817 r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, 1818 bio, &ool.locker, &lookup_result); 1819 1820 if (r == -EWOULDBLOCK) 1821 /* migration has been denied */ 1822 lookup_result.op = POLICY_MISS; 1823 1824 switch (lookup_result.op) { 1825 case POLICY_HIT: 1826 if (passthrough) { 1827 inc_miss_counter(cache, bio); 1828 1829 /* 1830 * Passthrough always maps to the origin, 1831 * invalidating any cache blocks that are written 1832 * to. 1833 */ 1834 1835 if (bio_data_dir(bio) == WRITE) { 1836 atomic_inc(&cache->stats.demotion); 1837 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1838 release_cell = false; 1839 1840 } else { 1841 /* FIXME: factor out issue_origin() */ 1842 remap_to_origin_clear_discard(cache, bio, block); 1843 inc_and_issue(cache, bio, new_ocell); 1844 } 1845 } else { 1846 inc_hit_counter(cache, bio); 1847 1848 if (bio_data_dir(bio) == WRITE && 1849 writethrough_mode(&cache->features) && 1850 !is_dirty(cache, lookup_result.cblock)) { 1851 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1852 inc_and_issue(cache, bio, new_ocell); 1853 1854 } else { 1855 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); 1856 release_cell = false; 1857 } 1858 } 1859 1860 break; 1861 1862 case POLICY_MISS: 1863 inc_miss_counter(cache, bio); 1864 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); 1865 release_cell = false; 1866 break; 1867 1868 case POLICY_NEW: 1869 atomic_inc(&cache->stats.promotion); 1870 promote(cache, structs, block, lookup_result.cblock, new_ocell); 1871 release_cell = false; 1872 break; 1873 1874 case POLICY_REPLACE: 1875 atomic_inc(&cache->stats.demotion); 1876 atomic_inc(&cache->stats.promotion); 1877 demote_then_promote(cache, structs, lookup_result.old_oblock, 1878 block, lookup_result.cblock, 1879 ool.cell, new_ocell); 1880 release_cell = false; 1881 break; 1882 1883 default: 1884 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", 1885 cache_device_name(cache), __func__, 1886 (unsigned) lookup_result.op); 1887 bio_io_error(bio); 1888 } 1889 1890 if (release_cell) 1891 cell_defer(cache, new_ocell, false); 1892 } 1893 1894 static void process_bio(struct cache *cache, struct prealloc *structs, 1895 struct bio *bio) 1896 { 1897 int r; 1898 dm_oblock_t block = get_bio_block(cache, bio); 1899 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1900 1901 /* 1902 * Check to see if that block is currently migrating. 1903 */ 1904 cell_prealloc = prealloc_get_cell(structs); 1905 r = bio_detain(cache, block, bio, cell_prealloc, 1906 (cell_free_fn) prealloc_put_cell, 1907 structs, &new_ocell); 1908 if (r > 0) 1909 return; 1910 1911 process_cell(cache, structs, new_ocell); 1912 } 1913 1914 static int need_commit_due_to_time(struct cache *cache) 1915 { 1916 return jiffies < cache->last_commit_jiffies || 1917 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1918 } 1919 1920 /* 1921 * A non-zero return indicates read_only or fail_io mode. 1922 */ 1923 static int commit(struct cache *cache, bool clean_shutdown) 1924 { 1925 int r; 1926 1927 if (get_cache_mode(cache) >= CM_READ_ONLY) 1928 return -EINVAL; 1929 1930 atomic_inc(&cache->stats.commit_count); 1931 r = dm_cache_commit(cache->cmd, clean_shutdown); 1932 if (r) 1933 metadata_operation_failed(cache, "dm_cache_commit", r); 1934 1935 return r; 1936 } 1937 1938 static int commit_if_needed(struct cache *cache) 1939 { 1940 int r = 0; 1941 1942 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1943 dm_cache_changed_this_transaction(cache->cmd)) { 1944 r = commit(cache, false); 1945 cache->commit_requested = false; 1946 cache->last_commit_jiffies = jiffies; 1947 } 1948 1949 return r; 1950 } 1951 1952 static void process_deferred_bios(struct cache *cache) 1953 { 1954 bool prealloc_used = false; 1955 unsigned long flags; 1956 struct bio_list bios; 1957 struct bio *bio; 1958 struct prealloc structs; 1959 1960 memset(&structs, 0, sizeof(structs)); 1961 bio_list_init(&bios); 1962 1963 spin_lock_irqsave(&cache->lock, flags); 1964 bio_list_merge(&bios, &cache->deferred_bios); 1965 bio_list_init(&cache->deferred_bios); 1966 spin_unlock_irqrestore(&cache->lock, flags); 1967 1968 while (!bio_list_empty(&bios)) { 1969 /* 1970 * If we've got no free migration structs, and processing 1971 * this bio might require one, we pause until there are some 1972 * prepared mappings to process. 1973 */ 1974 prealloc_used = true; 1975 if (prealloc_data_structs(cache, &structs)) { 1976 spin_lock_irqsave(&cache->lock, flags); 1977 bio_list_merge(&cache->deferred_bios, &bios); 1978 spin_unlock_irqrestore(&cache->lock, flags); 1979 break; 1980 } 1981 1982 bio = bio_list_pop(&bios); 1983 1984 if (bio->bi_opf & REQ_PREFLUSH) 1985 process_flush_bio(cache, bio); 1986 else if (bio_op(bio) == REQ_OP_DISCARD) 1987 process_discard_bio(cache, &structs, bio); 1988 else 1989 process_bio(cache, &structs, bio); 1990 } 1991 1992 if (prealloc_used) 1993 prealloc_free_structs(cache, &structs); 1994 } 1995 1996 static void process_deferred_cells(struct cache *cache) 1997 { 1998 bool prealloc_used = false; 1999 unsigned long flags; 2000 struct dm_bio_prison_cell *cell, *tmp; 2001 struct list_head cells; 2002 struct prealloc structs; 2003 2004 memset(&structs, 0, sizeof(structs)); 2005 2006 INIT_LIST_HEAD(&cells); 2007 2008 spin_lock_irqsave(&cache->lock, flags); 2009 list_splice_init(&cache->deferred_cells, &cells); 2010 spin_unlock_irqrestore(&cache->lock, flags); 2011 2012 list_for_each_entry_safe(cell, tmp, &cells, user_list) { 2013 /* 2014 * If we've got no free migration structs, and processing 2015 * this bio might require one, we pause until there are some 2016 * prepared mappings to process. 2017 */ 2018 prealloc_used = true; 2019 if (prealloc_data_structs(cache, &structs)) { 2020 spin_lock_irqsave(&cache->lock, flags); 2021 list_splice(&cells, &cache->deferred_cells); 2022 spin_unlock_irqrestore(&cache->lock, flags); 2023 break; 2024 } 2025 2026 process_cell(cache, &structs, cell); 2027 } 2028 2029 if (prealloc_used) 2030 prealloc_free_structs(cache, &structs); 2031 } 2032 2033 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2034 { 2035 unsigned long flags; 2036 struct bio_list bios; 2037 struct bio *bio; 2038 2039 bio_list_init(&bios); 2040 2041 spin_lock_irqsave(&cache->lock, flags); 2042 bio_list_merge(&bios, &cache->deferred_flush_bios); 2043 bio_list_init(&cache->deferred_flush_bios); 2044 spin_unlock_irqrestore(&cache->lock, flags); 2045 2046 /* 2047 * These bios have already been through inc_ds() 2048 */ 2049 while ((bio = bio_list_pop(&bios))) 2050 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2051 } 2052 2053 static void process_deferred_writethrough_bios(struct cache *cache) 2054 { 2055 unsigned long flags; 2056 struct bio_list bios; 2057 struct bio *bio; 2058 2059 bio_list_init(&bios); 2060 2061 spin_lock_irqsave(&cache->lock, flags); 2062 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 2063 bio_list_init(&cache->deferred_writethrough_bios); 2064 spin_unlock_irqrestore(&cache->lock, flags); 2065 2066 /* 2067 * These bios have already been through inc_ds() 2068 */ 2069 while ((bio = bio_list_pop(&bios))) 2070 accounted_request(cache, bio); 2071 } 2072 2073 static void writeback_some_dirty_blocks(struct cache *cache) 2074 { 2075 bool prealloc_used = false; 2076 dm_oblock_t oblock; 2077 dm_cblock_t cblock; 2078 struct prealloc structs; 2079 struct dm_bio_prison_cell *old_ocell; 2080 bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2081 2082 memset(&structs, 0, sizeof(structs)); 2083 2084 while (spare_migration_bandwidth(cache)) { 2085 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) 2086 break; /* no work to do */ 2087 2088 prealloc_used = true; 2089 if (prealloc_data_structs(cache, &structs) || 2090 get_cell(cache, oblock, &structs, &old_ocell)) { 2091 policy_set_dirty(cache->policy, oblock); 2092 break; 2093 } 2094 2095 writeback(cache, &structs, oblock, cblock, old_ocell); 2096 } 2097 2098 if (prealloc_used) 2099 prealloc_free_structs(cache, &structs); 2100 } 2101 2102 /*---------------------------------------------------------------- 2103 * Invalidations. 2104 * Dropping something from the cache *without* writing back. 2105 *--------------------------------------------------------------*/ 2106 2107 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2108 { 2109 int r = 0; 2110 uint64_t begin = from_cblock(req->cblocks->begin); 2111 uint64_t end = from_cblock(req->cblocks->end); 2112 2113 while (begin != end) { 2114 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2115 if (!r) { 2116 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2117 if (r) { 2118 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2119 break; 2120 } 2121 2122 } else if (r == -ENODATA) { 2123 /* harmless, already unmapped */ 2124 r = 0; 2125 2126 } else { 2127 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2128 break; 2129 } 2130 2131 begin++; 2132 } 2133 2134 cache->commit_requested = true; 2135 2136 req->err = r; 2137 atomic_set(&req->complete, 1); 2138 2139 wake_up(&req->result_wait); 2140 } 2141 2142 static void process_invalidation_requests(struct cache *cache) 2143 { 2144 struct list_head list; 2145 struct invalidation_request *req, *tmp; 2146 2147 INIT_LIST_HEAD(&list); 2148 spin_lock(&cache->invalidation_lock); 2149 list_splice_init(&cache->invalidation_requests, &list); 2150 spin_unlock(&cache->invalidation_lock); 2151 2152 list_for_each_entry_safe (req, tmp, &list, list) 2153 process_invalidation_request(cache, req); 2154 } 2155 2156 /*---------------------------------------------------------------- 2157 * Main worker loop 2158 *--------------------------------------------------------------*/ 2159 static bool is_quiescing(struct cache *cache) 2160 { 2161 return atomic_read(&cache->quiescing); 2162 } 2163 2164 static void ack_quiescing(struct cache *cache) 2165 { 2166 if (is_quiescing(cache)) { 2167 atomic_inc(&cache->quiescing_ack); 2168 wake_up(&cache->quiescing_wait); 2169 } 2170 } 2171 2172 static void wait_for_quiescing_ack(struct cache *cache) 2173 { 2174 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2175 } 2176 2177 static void start_quiescing(struct cache *cache) 2178 { 2179 atomic_inc(&cache->quiescing); 2180 wait_for_quiescing_ack(cache); 2181 } 2182 2183 static void stop_quiescing(struct cache *cache) 2184 { 2185 atomic_set(&cache->quiescing, 0); 2186 atomic_set(&cache->quiescing_ack, 0); 2187 } 2188 2189 static void wait_for_migrations(struct cache *cache) 2190 { 2191 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2192 } 2193 2194 static void stop_worker(struct cache *cache) 2195 { 2196 cancel_delayed_work(&cache->waker); 2197 flush_workqueue(cache->wq); 2198 } 2199 2200 static void requeue_deferred_cells(struct cache *cache) 2201 { 2202 unsigned long flags; 2203 struct list_head cells; 2204 struct dm_bio_prison_cell *cell, *tmp; 2205 2206 INIT_LIST_HEAD(&cells); 2207 spin_lock_irqsave(&cache->lock, flags); 2208 list_splice_init(&cache->deferred_cells, &cells); 2209 spin_unlock_irqrestore(&cache->lock, flags); 2210 2211 list_for_each_entry_safe(cell, tmp, &cells, user_list) 2212 cell_requeue(cache, cell); 2213 } 2214 2215 static void requeue_deferred_bios(struct cache *cache) 2216 { 2217 struct bio *bio; 2218 struct bio_list bios; 2219 2220 bio_list_init(&bios); 2221 bio_list_merge(&bios, &cache->deferred_bios); 2222 bio_list_init(&cache->deferred_bios); 2223 2224 while ((bio = bio_list_pop(&bios))) { 2225 bio->bi_error = DM_ENDIO_REQUEUE; 2226 bio_endio(bio); 2227 } 2228 } 2229 2230 static int more_work(struct cache *cache) 2231 { 2232 if (is_quiescing(cache)) 2233 return !list_empty(&cache->quiesced_migrations) || 2234 !list_empty(&cache->completed_migrations) || 2235 !list_empty(&cache->need_commit_migrations); 2236 else 2237 return !bio_list_empty(&cache->deferred_bios) || 2238 !list_empty(&cache->deferred_cells) || 2239 !bio_list_empty(&cache->deferred_flush_bios) || 2240 !bio_list_empty(&cache->deferred_writethrough_bios) || 2241 !list_empty(&cache->quiesced_migrations) || 2242 !list_empty(&cache->completed_migrations) || 2243 !list_empty(&cache->need_commit_migrations) || 2244 cache->invalidate; 2245 } 2246 2247 static void do_worker(struct work_struct *ws) 2248 { 2249 struct cache *cache = container_of(ws, struct cache, worker); 2250 2251 do { 2252 if (!is_quiescing(cache)) { 2253 writeback_some_dirty_blocks(cache); 2254 process_deferred_writethrough_bios(cache); 2255 process_deferred_bios(cache); 2256 process_deferred_cells(cache); 2257 process_invalidation_requests(cache); 2258 } 2259 2260 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2261 process_migrations(cache, &cache->completed_migrations, complete_migration); 2262 2263 if (commit_if_needed(cache)) { 2264 process_deferred_flush_bios(cache, false); 2265 process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2266 } else { 2267 process_deferred_flush_bios(cache, true); 2268 process_migrations(cache, &cache->need_commit_migrations, 2269 migration_success_post_commit); 2270 } 2271 2272 ack_quiescing(cache); 2273 2274 } while (more_work(cache)); 2275 } 2276 2277 /* 2278 * We want to commit periodically so that not too much 2279 * unwritten metadata builds up. 2280 */ 2281 static void do_waker(struct work_struct *ws) 2282 { 2283 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2284 policy_tick(cache->policy, true); 2285 wake_worker(cache); 2286 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2287 } 2288 2289 /*----------------------------------------------------------------*/ 2290 2291 static int is_congested(struct dm_dev *dev, int bdi_bits) 2292 { 2293 struct request_queue *q = bdev_get_queue(dev->bdev); 2294 return bdi_congested(&q->backing_dev_info, bdi_bits); 2295 } 2296 2297 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2298 { 2299 struct cache *cache = container_of(cb, struct cache, callbacks); 2300 2301 return is_congested(cache->origin_dev, bdi_bits) || 2302 is_congested(cache->cache_dev, bdi_bits); 2303 } 2304 2305 /*---------------------------------------------------------------- 2306 * Target methods 2307 *--------------------------------------------------------------*/ 2308 2309 /* 2310 * This function gets called on the error paths of the constructor, so we 2311 * have to cope with a partially initialised struct. 2312 */ 2313 static void destroy(struct cache *cache) 2314 { 2315 unsigned i; 2316 2317 mempool_destroy(cache->migration_pool); 2318 2319 if (cache->all_io_ds) 2320 dm_deferred_set_destroy(cache->all_io_ds); 2321 2322 if (cache->prison) 2323 dm_bio_prison_destroy(cache->prison); 2324 2325 if (cache->wq) 2326 destroy_workqueue(cache->wq); 2327 2328 if (cache->dirty_bitset) 2329 free_bitset(cache->dirty_bitset); 2330 2331 if (cache->discard_bitset) 2332 free_bitset(cache->discard_bitset); 2333 2334 if (cache->copier) 2335 dm_kcopyd_client_destroy(cache->copier); 2336 2337 if (cache->cmd) 2338 dm_cache_metadata_close(cache->cmd); 2339 2340 if (cache->metadata_dev) 2341 dm_put_device(cache->ti, cache->metadata_dev); 2342 2343 if (cache->origin_dev) 2344 dm_put_device(cache->ti, cache->origin_dev); 2345 2346 if (cache->cache_dev) 2347 dm_put_device(cache->ti, cache->cache_dev); 2348 2349 if (cache->policy) 2350 dm_cache_policy_destroy(cache->policy); 2351 2352 for (i = 0; i < cache->nr_ctr_args ; i++) 2353 kfree(cache->ctr_args[i]); 2354 kfree(cache->ctr_args); 2355 2356 kfree(cache); 2357 } 2358 2359 static void cache_dtr(struct dm_target *ti) 2360 { 2361 struct cache *cache = ti->private; 2362 2363 destroy(cache); 2364 } 2365 2366 static sector_t get_dev_size(struct dm_dev *dev) 2367 { 2368 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2369 } 2370 2371 /*----------------------------------------------------------------*/ 2372 2373 /* 2374 * Construct a cache device mapping. 2375 * 2376 * cache <metadata dev> <cache dev> <origin dev> <block size> 2377 * <#feature args> [<feature arg>]* 2378 * <policy> <#policy args> [<policy arg>]* 2379 * 2380 * metadata dev : fast device holding the persistent metadata 2381 * cache dev : fast device holding cached data blocks 2382 * origin dev : slow device holding original data blocks 2383 * block size : cache unit size in sectors 2384 * 2385 * #feature args : number of feature arguments passed 2386 * feature args : writethrough. (The default is writeback.) 2387 * 2388 * policy : the replacement policy to use 2389 * #policy args : an even number of policy arguments corresponding 2390 * to key/value pairs passed to the policy 2391 * policy args : key/value pairs passed to the policy 2392 * E.g. 'sequential_threshold 1024' 2393 * See cache-policies.txt for details. 2394 * 2395 * Optional feature arguments are: 2396 * writethrough : write through caching that prohibits cache block 2397 * content from being different from origin block content. 2398 * Without this argument, the default behaviour is to write 2399 * back cache block contents later for performance reasons, 2400 * so they may differ from the corresponding origin blocks. 2401 */ 2402 struct cache_args { 2403 struct dm_target *ti; 2404 2405 struct dm_dev *metadata_dev; 2406 2407 struct dm_dev *cache_dev; 2408 sector_t cache_sectors; 2409 2410 struct dm_dev *origin_dev; 2411 sector_t origin_sectors; 2412 2413 uint32_t block_size; 2414 2415 const char *policy_name; 2416 int policy_argc; 2417 const char **policy_argv; 2418 2419 struct cache_features features; 2420 }; 2421 2422 static void destroy_cache_args(struct cache_args *ca) 2423 { 2424 if (ca->metadata_dev) 2425 dm_put_device(ca->ti, ca->metadata_dev); 2426 2427 if (ca->cache_dev) 2428 dm_put_device(ca->ti, ca->cache_dev); 2429 2430 if (ca->origin_dev) 2431 dm_put_device(ca->ti, ca->origin_dev); 2432 2433 kfree(ca); 2434 } 2435 2436 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2437 { 2438 if (!as->argc) { 2439 *error = "Insufficient args"; 2440 return false; 2441 } 2442 2443 return true; 2444 } 2445 2446 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2447 char **error) 2448 { 2449 int r; 2450 sector_t metadata_dev_size; 2451 char b[BDEVNAME_SIZE]; 2452 2453 if (!at_least_one_arg(as, error)) 2454 return -EINVAL; 2455 2456 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2457 &ca->metadata_dev); 2458 if (r) { 2459 *error = "Error opening metadata device"; 2460 return r; 2461 } 2462 2463 metadata_dev_size = get_dev_size(ca->metadata_dev); 2464 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2465 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2466 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2467 2468 return 0; 2469 } 2470 2471 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2472 char **error) 2473 { 2474 int r; 2475 2476 if (!at_least_one_arg(as, error)) 2477 return -EINVAL; 2478 2479 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2480 &ca->cache_dev); 2481 if (r) { 2482 *error = "Error opening cache device"; 2483 return r; 2484 } 2485 ca->cache_sectors = get_dev_size(ca->cache_dev); 2486 2487 return 0; 2488 } 2489 2490 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2491 char **error) 2492 { 2493 int r; 2494 2495 if (!at_least_one_arg(as, error)) 2496 return -EINVAL; 2497 2498 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2499 &ca->origin_dev); 2500 if (r) { 2501 *error = "Error opening origin device"; 2502 return r; 2503 } 2504 2505 ca->origin_sectors = get_dev_size(ca->origin_dev); 2506 if (ca->ti->len > ca->origin_sectors) { 2507 *error = "Device size larger than cached device"; 2508 return -EINVAL; 2509 } 2510 2511 return 0; 2512 } 2513 2514 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2515 char **error) 2516 { 2517 unsigned long block_size; 2518 2519 if (!at_least_one_arg(as, error)) 2520 return -EINVAL; 2521 2522 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2523 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2524 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2525 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2526 *error = "Invalid data block size"; 2527 return -EINVAL; 2528 } 2529 2530 if (block_size > ca->cache_sectors) { 2531 *error = "Data block size is larger than the cache device"; 2532 return -EINVAL; 2533 } 2534 2535 ca->block_size = block_size; 2536 2537 return 0; 2538 } 2539 2540 static void init_features(struct cache_features *cf) 2541 { 2542 cf->mode = CM_WRITE; 2543 cf->io_mode = CM_IO_WRITEBACK; 2544 } 2545 2546 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2547 char **error) 2548 { 2549 static struct dm_arg _args[] = { 2550 {0, 1, "Invalid number of cache feature arguments"}, 2551 }; 2552 2553 int r; 2554 unsigned argc; 2555 const char *arg; 2556 struct cache_features *cf = &ca->features; 2557 2558 init_features(cf); 2559 2560 r = dm_read_arg_group(_args, as, &argc, error); 2561 if (r) 2562 return -EINVAL; 2563 2564 while (argc--) { 2565 arg = dm_shift_arg(as); 2566 2567 if (!strcasecmp(arg, "writeback")) 2568 cf->io_mode = CM_IO_WRITEBACK; 2569 2570 else if (!strcasecmp(arg, "writethrough")) 2571 cf->io_mode = CM_IO_WRITETHROUGH; 2572 2573 else if (!strcasecmp(arg, "passthrough")) 2574 cf->io_mode = CM_IO_PASSTHROUGH; 2575 2576 else { 2577 *error = "Unrecognised cache feature requested"; 2578 return -EINVAL; 2579 } 2580 } 2581 2582 return 0; 2583 } 2584 2585 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2586 char **error) 2587 { 2588 static struct dm_arg _args[] = { 2589 {0, 1024, "Invalid number of policy arguments"}, 2590 }; 2591 2592 int r; 2593 2594 if (!at_least_one_arg(as, error)) 2595 return -EINVAL; 2596 2597 ca->policy_name = dm_shift_arg(as); 2598 2599 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2600 if (r) 2601 return -EINVAL; 2602 2603 ca->policy_argv = (const char **)as->argv; 2604 dm_consume_args(as, ca->policy_argc); 2605 2606 return 0; 2607 } 2608 2609 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2610 char **error) 2611 { 2612 int r; 2613 struct dm_arg_set as; 2614 2615 as.argc = argc; 2616 as.argv = argv; 2617 2618 r = parse_metadata_dev(ca, &as, error); 2619 if (r) 2620 return r; 2621 2622 r = parse_cache_dev(ca, &as, error); 2623 if (r) 2624 return r; 2625 2626 r = parse_origin_dev(ca, &as, error); 2627 if (r) 2628 return r; 2629 2630 r = parse_block_size(ca, &as, error); 2631 if (r) 2632 return r; 2633 2634 r = parse_features(ca, &as, error); 2635 if (r) 2636 return r; 2637 2638 r = parse_policy(ca, &as, error); 2639 if (r) 2640 return r; 2641 2642 return 0; 2643 } 2644 2645 /*----------------------------------------------------------------*/ 2646 2647 static struct kmem_cache *migration_cache; 2648 2649 #define NOT_CORE_OPTION 1 2650 2651 static int process_config_option(struct cache *cache, const char *key, const char *value) 2652 { 2653 unsigned long tmp; 2654 2655 if (!strcasecmp(key, "migration_threshold")) { 2656 if (kstrtoul(value, 10, &tmp)) 2657 return -EINVAL; 2658 2659 cache->migration_threshold = tmp; 2660 return 0; 2661 } 2662 2663 return NOT_CORE_OPTION; 2664 } 2665 2666 static int set_config_value(struct cache *cache, const char *key, const char *value) 2667 { 2668 int r = process_config_option(cache, key, value); 2669 2670 if (r == NOT_CORE_OPTION) 2671 r = policy_set_config_value(cache->policy, key, value); 2672 2673 if (r) 2674 DMWARN("bad config value for %s: %s", key, value); 2675 2676 return r; 2677 } 2678 2679 static int set_config_values(struct cache *cache, int argc, const char **argv) 2680 { 2681 int r = 0; 2682 2683 if (argc & 1) { 2684 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2685 return -EINVAL; 2686 } 2687 2688 while (argc) { 2689 r = set_config_value(cache, argv[0], argv[1]); 2690 if (r) 2691 break; 2692 2693 argc -= 2; 2694 argv += 2; 2695 } 2696 2697 return r; 2698 } 2699 2700 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2701 char **error) 2702 { 2703 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2704 cache->cache_size, 2705 cache->origin_sectors, 2706 cache->sectors_per_block); 2707 if (IS_ERR(p)) { 2708 *error = "Error creating cache's policy"; 2709 return PTR_ERR(p); 2710 } 2711 cache->policy = p; 2712 2713 return 0; 2714 } 2715 2716 /* 2717 * We want the discard block size to be at least the size of the cache 2718 * block size and have no more than 2^14 discard blocks across the origin. 2719 */ 2720 #define MAX_DISCARD_BLOCKS (1 << 14) 2721 2722 static bool too_many_discard_blocks(sector_t discard_block_size, 2723 sector_t origin_size) 2724 { 2725 (void) sector_div(origin_size, discard_block_size); 2726 2727 return origin_size > MAX_DISCARD_BLOCKS; 2728 } 2729 2730 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2731 sector_t origin_size) 2732 { 2733 sector_t discard_block_size = cache_block_size; 2734 2735 if (origin_size) 2736 while (too_many_discard_blocks(discard_block_size, origin_size)) 2737 discard_block_size *= 2; 2738 2739 return discard_block_size; 2740 } 2741 2742 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2743 { 2744 dm_block_t nr_blocks = from_cblock(size); 2745 2746 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2747 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2748 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2749 "Please consider increasing the cache block size to reduce the overall cache block count.", 2750 (unsigned long long) nr_blocks); 2751 2752 cache->cache_size = size; 2753 } 2754 2755 #define DEFAULT_MIGRATION_THRESHOLD 2048 2756 2757 static int cache_create(struct cache_args *ca, struct cache **result) 2758 { 2759 int r = 0; 2760 char **error = &ca->ti->error; 2761 struct cache *cache; 2762 struct dm_target *ti = ca->ti; 2763 dm_block_t origin_blocks; 2764 struct dm_cache_metadata *cmd; 2765 bool may_format = ca->features.mode == CM_WRITE; 2766 2767 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2768 if (!cache) 2769 return -ENOMEM; 2770 2771 cache->ti = ca->ti; 2772 ti->private = cache; 2773 ti->num_flush_bios = 2; 2774 ti->flush_supported = true; 2775 2776 ti->num_discard_bios = 1; 2777 ti->discards_supported = true; 2778 ti->discard_zeroes_data_unsupported = true; 2779 ti->split_discard_bios = false; 2780 2781 cache->features = ca->features; 2782 ti->per_io_data_size = get_per_bio_data_size(cache); 2783 2784 cache->callbacks.congested_fn = cache_is_congested; 2785 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2786 2787 cache->metadata_dev = ca->metadata_dev; 2788 cache->origin_dev = ca->origin_dev; 2789 cache->cache_dev = ca->cache_dev; 2790 2791 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2792 2793 /* FIXME: factor out this whole section */ 2794 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2795 origin_blocks = block_div(origin_blocks, ca->block_size); 2796 cache->origin_blocks = to_oblock(origin_blocks); 2797 2798 cache->sectors_per_block = ca->block_size; 2799 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2800 r = -EINVAL; 2801 goto bad; 2802 } 2803 2804 if (ca->block_size & (ca->block_size - 1)) { 2805 dm_block_t cache_size = ca->cache_sectors; 2806 2807 cache->sectors_per_block_shift = -1; 2808 cache_size = block_div(cache_size, ca->block_size); 2809 set_cache_size(cache, to_cblock(cache_size)); 2810 } else { 2811 cache->sectors_per_block_shift = __ffs(ca->block_size); 2812 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2813 } 2814 2815 r = create_cache_policy(cache, ca, error); 2816 if (r) 2817 goto bad; 2818 2819 cache->policy_nr_args = ca->policy_argc; 2820 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2821 2822 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2823 if (r) { 2824 *error = "Error setting cache policy's config values"; 2825 goto bad; 2826 } 2827 2828 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2829 ca->block_size, may_format, 2830 dm_cache_policy_get_hint_size(cache->policy)); 2831 if (IS_ERR(cmd)) { 2832 *error = "Error creating metadata object"; 2833 r = PTR_ERR(cmd); 2834 goto bad; 2835 } 2836 cache->cmd = cmd; 2837 set_cache_mode(cache, CM_WRITE); 2838 if (get_cache_mode(cache) != CM_WRITE) { 2839 *error = "Unable to get write access to metadata, please check/repair metadata."; 2840 r = -EINVAL; 2841 goto bad; 2842 } 2843 2844 if (passthrough_mode(&cache->features)) { 2845 bool all_clean; 2846 2847 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2848 if (r) { 2849 *error = "dm_cache_metadata_all_clean() failed"; 2850 goto bad; 2851 } 2852 2853 if (!all_clean) { 2854 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2855 r = -EINVAL; 2856 goto bad; 2857 } 2858 } 2859 2860 spin_lock_init(&cache->lock); 2861 INIT_LIST_HEAD(&cache->deferred_cells); 2862 bio_list_init(&cache->deferred_bios); 2863 bio_list_init(&cache->deferred_flush_bios); 2864 bio_list_init(&cache->deferred_writethrough_bios); 2865 INIT_LIST_HEAD(&cache->quiesced_migrations); 2866 INIT_LIST_HEAD(&cache->completed_migrations); 2867 INIT_LIST_HEAD(&cache->need_commit_migrations); 2868 atomic_set(&cache->nr_allocated_migrations, 0); 2869 atomic_set(&cache->nr_io_migrations, 0); 2870 init_waitqueue_head(&cache->migration_wait); 2871 2872 init_waitqueue_head(&cache->quiescing_wait); 2873 atomic_set(&cache->quiescing, 0); 2874 atomic_set(&cache->quiescing_ack, 0); 2875 2876 r = -ENOMEM; 2877 atomic_set(&cache->nr_dirty, 0); 2878 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2879 if (!cache->dirty_bitset) { 2880 *error = "could not allocate dirty bitset"; 2881 goto bad; 2882 } 2883 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2884 2885 cache->discard_block_size = 2886 calculate_discard_block_size(cache->sectors_per_block, 2887 cache->origin_sectors); 2888 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2889 cache->discard_block_size)); 2890 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2891 if (!cache->discard_bitset) { 2892 *error = "could not allocate discard bitset"; 2893 goto bad; 2894 } 2895 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2896 2897 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2898 if (IS_ERR(cache->copier)) { 2899 *error = "could not create kcopyd client"; 2900 r = PTR_ERR(cache->copier); 2901 goto bad; 2902 } 2903 2904 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2905 if (!cache->wq) { 2906 *error = "could not create workqueue for metadata object"; 2907 goto bad; 2908 } 2909 INIT_WORK(&cache->worker, do_worker); 2910 INIT_DELAYED_WORK(&cache->waker, do_waker); 2911 cache->last_commit_jiffies = jiffies; 2912 2913 cache->prison = dm_bio_prison_create(); 2914 if (!cache->prison) { 2915 *error = "could not create bio prison"; 2916 goto bad; 2917 } 2918 2919 cache->all_io_ds = dm_deferred_set_create(); 2920 if (!cache->all_io_ds) { 2921 *error = "could not create all_io deferred set"; 2922 goto bad; 2923 } 2924 2925 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2926 migration_cache); 2927 if (!cache->migration_pool) { 2928 *error = "Error creating cache's migration mempool"; 2929 goto bad; 2930 } 2931 2932 cache->need_tick_bio = true; 2933 cache->sized = false; 2934 cache->invalidate = false; 2935 cache->commit_requested = false; 2936 cache->loaded_mappings = false; 2937 cache->loaded_discards = false; 2938 2939 load_stats(cache); 2940 2941 atomic_set(&cache->stats.demotion, 0); 2942 atomic_set(&cache->stats.promotion, 0); 2943 atomic_set(&cache->stats.copies_avoided, 0); 2944 atomic_set(&cache->stats.cache_cell_clash, 0); 2945 atomic_set(&cache->stats.commit_count, 0); 2946 atomic_set(&cache->stats.discard_count, 0); 2947 2948 spin_lock_init(&cache->invalidation_lock); 2949 INIT_LIST_HEAD(&cache->invalidation_requests); 2950 2951 iot_init(&cache->origin_tracker); 2952 2953 *result = cache; 2954 return 0; 2955 2956 bad: 2957 destroy(cache); 2958 return r; 2959 } 2960 2961 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2962 { 2963 unsigned i; 2964 const char **copy; 2965 2966 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2967 if (!copy) 2968 return -ENOMEM; 2969 for (i = 0; i < argc; i++) { 2970 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2971 if (!copy[i]) { 2972 while (i--) 2973 kfree(copy[i]); 2974 kfree(copy); 2975 return -ENOMEM; 2976 } 2977 } 2978 2979 cache->nr_ctr_args = argc; 2980 cache->ctr_args = copy; 2981 2982 return 0; 2983 } 2984 2985 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2986 { 2987 int r = -EINVAL; 2988 struct cache_args *ca; 2989 struct cache *cache = NULL; 2990 2991 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2992 if (!ca) { 2993 ti->error = "Error allocating memory for cache"; 2994 return -ENOMEM; 2995 } 2996 ca->ti = ti; 2997 2998 r = parse_cache_args(ca, argc, argv, &ti->error); 2999 if (r) 3000 goto out; 3001 3002 r = cache_create(ca, &cache); 3003 if (r) 3004 goto out; 3005 3006 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 3007 if (r) { 3008 destroy(cache); 3009 goto out; 3010 } 3011 3012 ti->private = cache; 3013 3014 out: 3015 destroy_cache_args(ca); 3016 return r; 3017 } 3018 3019 /*----------------------------------------------------------------*/ 3020 3021 static int cache_map(struct dm_target *ti, struct bio *bio) 3022 { 3023 struct cache *cache = ti->private; 3024 3025 int r; 3026 struct dm_bio_prison_cell *cell = NULL; 3027 dm_oblock_t block = get_bio_block(cache, bio); 3028 size_t pb_data_size = get_per_bio_data_size(cache); 3029 bool can_migrate = false; 3030 bool fast_promotion; 3031 struct policy_result lookup_result; 3032 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 3033 struct old_oblock_lock ool; 3034 3035 ool.locker.fn = null_locker; 3036 3037 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 3038 /* 3039 * This can only occur if the io goes to a partial block at 3040 * the end of the origin device. We don't cache these. 3041 * Just remap to the origin and carry on. 3042 */ 3043 remap_to_origin(cache, bio); 3044 accounted_begin(cache, bio); 3045 return DM_MAPIO_REMAPPED; 3046 } 3047 3048 if (discard_or_flush(bio)) { 3049 defer_bio(cache, bio); 3050 return DM_MAPIO_SUBMITTED; 3051 } 3052 3053 /* 3054 * Check to see if that block is currently migrating. 3055 */ 3056 cell = alloc_prison_cell(cache); 3057 if (!cell) { 3058 defer_bio(cache, bio); 3059 return DM_MAPIO_SUBMITTED; 3060 } 3061 3062 r = bio_detain(cache, block, bio, cell, 3063 (cell_free_fn) free_prison_cell, 3064 cache, &cell); 3065 if (r) { 3066 if (r < 0) 3067 defer_bio(cache, bio); 3068 3069 return DM_MAPIO_SUBMITTED; 3070 } 3071 3072 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 3073 3074 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 3075 bio, &ool.locker, &lookup_result); 3076 if (r == -EWOULDBLOCK) { 3077 cell_defer(cache, cell, true); 3078 return DM_MAPIO_SUBMITTED; 3079 3080 } else if (r) { 3081 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 3082 cache_device_name(cache), r); 3083 cell_defer(cache, cell, false); 3084 bio_io_error(bio); 3085 return DM_MAPIO_SUBMITTED; 3086 } 3087 3088 r = DM_MAPIO_REMAPPED; 3089 switch (lookup_result.op) { 3090 case POLICY_HIT: 3091 if (passthrough_mode(&cache->features)) { 3092 if (bio_data_dir(bio) == WRITE) { 3093 /* 3094 * We need to invalidate this block, so 3095 * defer for the worker thread. 3096 */ 3097 cell_defer(cache, cell, true); 3098 r = DM_MAPIO_SUBMITTED; 3099 3100 } else { 3101 inc_miss_counter(cache, bio); 3102 remap_to_origin_clear_discard(cache, bio, block); 3103 accounted_begin(cache, bio); 3104 inc_ds(cache, bio, cell); 3105 // FIXME: we want to remap hits or misses straight 3106 // away rather than passing over to the worker. 3107 cell_defer(cache, cell, false); 3108 } 3109 3110 } else { 3111 inc_hit_counter(cache, bio); 3112 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 3113 !is_dirty(cache, lookup_result.cblock)) { 3114 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 3115 accounted_begin(cache, bio); 3116 inc_ds(cache, bio, cell); 3117 cell_defer(cache, cell, false); 3118 3119 } else 3120 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 3121 } 3122 break; 3123 3124 case POLICY_MISS: 3125 inc_miss_counter(cache, bio); 3126 if (pb->req_nr != 0) { 3127 /* 3128 * This is a duplicate writethrough io that is no 3129 * longer needed because the block has been demoted. 3130 */ 3131 bio_endio(bio); 3132 // FIXME: remap everything as a miss 3133 cell_defer(cache, cell, false); 3134 r = DM_MAPIO_SUBMITTED; 3135 3136 } else 3137 remap_cell_to_origin_clear_discard(cache, cell, block, false); 3138 break; 3139 3140 default: 3141 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 3142 cache_device_name(cache), __func__, 3143 (unsigned) lookup_result.op); 3144 cell_defer(cache, cell, false); 3145 bio_io_error(bio); 3146 r = DM_MAPIO_SUBMITTED; 3147 } 3148 3149 return r; 3150 } 3151 3152 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 3153 { 3154 struct cache *cache = ti->private; 3155 unsigned long flags; 3156 size_t pb_data_size = get_per_bio_data_size(cache); 3157 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 3158 3159 if (pb->tick) { 3160 policy_tick(cache->policy, false); 3161 3162 spin_lock_irqsave(&cache->lock, flags); 3163 cache->need_tick_bio = true; 3164 spin_unlock_irqrestore(&cache->lock, flags); 3165 } 3166 3167 check_for_quiesced_migrations(cache, pb); 3168 accounted_complete(cache, bio); 3169 3170 return 0; 3171 } 3172 3173 static int write_dirty_bitset(struct cache *cache) 3174 { 3175 unsigned i, r; 3176 3177 if (get_cache_mode(cache) >= CM_READ_ONLY) 3178 return -EINVAL; 3179 3180 for (i = 0; i < from_cblock(cache->cache_size); i++) { 3181 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 3182 is_dirty(cache, to_cblock(i))); 3183 if (r) { 3184 metadata_operation_failed(cache, "dm_cache_set_dirty", r); 3185 return r; 3186 } 3187 } 3188 3189 return 0; 3190 } 3191 3192 static int write_discard_bitset(struct cache *cache) 3193 { 3194 unsigned i, r; 3195 3196 if (get_cache_mode(cache) >= CM_READ_ONLY) 3197 return -EINVAL; 3198 3199 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 3200 cache->discard_nr_blocks); 3201 if (r) { 3202 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 3203 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 3204 return r; 3205 } 3206 3207 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 3208 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 3209 is_discarded(cache, to_dblock(i))); 3210 if (r) { 3211 metadata_operation_failed(cache, "dm_cache_set_discard", r); 3212 return r; 3213 } 3214 } 3215 3216 return 0; 3217 } 3218 3219 static int write_hints(struct cache *cache) 3220 { 3221 int r; 3222 3223 if (get_cache_mode(cache) >= CM_READ_ONLY) 3224 return -EINVAL; 3225 3226 r = dm_cache_write_hints(cache->cmd, cache->policy); 3227 if (r) { 3228 metadata_operation_failed(cache, "dm_cache_write_hints", r); 3229 return r; 3230 } 3231 3232 return 0; 3233 } 3234 3235 /* 3236 * returns true on success 3237 */ 3238 static bool sync_metadata(struct cache *cache) 3239 { 3240 int r1, r2, r3, r4; 3241 3242 r1 = write_dirty_bitset(cache); 3243 if (r1) 3244 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 3245 3246 r2 = write_discard_bitset(cache); 3247 if (r2) 3248 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 3249 3250 save_stats(cache); 3251 3252 r3 = write_hints(cache); 3253 if (r3) 3254 DMERR("%s: could not write hints", cache_device_name(cache)); 3255 3256 /* 3257 * If writing the above metadata failed, we still commit, but don't 3258 * set the clean shutdown flag. This will effectively force every 3259 * dirty bit to be set on reload. 3260 */ 3261 r4 = commit(cache, !r1 && !r2 && !r3); 3262 if (r4) 3263 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 3264 3265 return !r1 && !r2 && !r3 && !r4; 3266 } 3267 3268 static void cache_postsuspend(struct dm_target *ti) 3269 { 3270 struct cache *cache = ti->private; 3271 3272 start_quiescing(cache); 3273 wait_for_migrations(cache); 3274 stop_worker(cache); 3275 requeue_deferred_bios(cache); 3276 requeue_deferred_cells(cache); 3277 stop_quiescing(cache); 3278 3279 if (get_cache_mode(cache) == CM_WRITE) 3280 (void) sync_metadata(cache); 3281 } 3282 3283 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 3284 bool dirty, uint32_t hint, bool hint_valid) 3285 { 3286 int r; 3287 struct cache *cache = context; 3288 3289 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3290 if (r) 3291 return r; 3292 3293 if (dirty) 3294 set_dirty(cache, oblock, cblock); 3295 else 3296 clear_dirty(cache, oblock, cblock); 3297 3298 return 0; 3299 } 3300 3301 /* 3302 * The discard block size in the on disk metadata is not 3303 * neccessarily the same as we're currently using. So we have to 3304 * be careful to only set the discarded attribute if we know it 3305 * covers a complete block of the new size. 3306 */ 3307 struct discard_load_info { 3308 struct cache *cache; 3309 3310 /* 3311 * These blocks are sized using the on disk dblock size, rather 3312 * than the current one. 3313 */ 3314 dm_block_t block_size; 3315 dm_block_t discard_begin, discard_end; 3316 }; 3317 3318 static void discard_load_info_init(struct cache *cache, 3319 struct discard_load_info *li) 3320 { 3321 li->cache = cache; 3322 li->discard_begin = li->discard_end = 0; 3323 } 3324 3325 static void set_discard_range(struct discard_load_info *li) 3326 { 3327 sector_t b, e; 3328 3329 if (li->discard_begin == li->discard_end) 3330 return; 3331 3332 /* 3333 * Convert to sectors. 3334 */ 3335 b = li->discard_begin * li->block_size; 3336 e = li->discard_end * li->block_size; 3337 3338 /* 3339 * Then convert back to the current dblock size. 3340 */ 3341 b = dm_sector_div_up(b, li->cache->discard_block_size); 3342 sector_div(e, li->cache->discard_block_size); 3343 3344 /* 3345 * The origin may have shrunk, so we need to check we're still in 3346 * bounds. 3347 */ 3348 if (e > from_dblock(li->cache->discard_nr_blocks)) 3349 e = from_dblock(li->cache->discard_nr_blocks); 3350 3351 for (; b < e; b++) 3352 set_discard(li->cache, to_dblock(b)); 3353 } 3354 3355 static int load_discard(void *context, sector_t discard_block_size, 3356 dm_dblock_t dblock, bool discard) 3357 { 3358 struct discard_load_info *li = context; 3359 3360 li->block_size = discard_block_size; 3361 3362 if (discard) { 3363 if (from_dblock(dblock) == li->discard_end) 3364 /* 3365 * We're already in a discard range, just extend it. 3366 */ 3367 li->discard_end = li->discard_end + 1ULL; 3368 3369 else { 3370 /* 3371 * Emit the old range and start a new one. 3372 */ 3373 set_discard_range(li); 3374 li->discard_begin = from_dblock(dblock); 3375 li->discard_end = li->discard_begin + 1ULL; 3376 } 3377 } else { 3378 set_discard_range(li); 3379 li->discard_begin = li->discard_end = 0; 3380 } 3381 3382 return 0; 3383 } 3384 3385 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3386 { 3387 sector_t size = get_dev_size(cache->cache_dev); 3388 (void) sector_div(size, cache->sectors_per_block); 3389 return to_cblock(size); 3390 } 3391 3392 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3393 { 3394 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3395 return true; 3396 3397 /* 3398 * We can't drop a dirty block when shrinking the cache. 3399 */ 3400 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3401 new_size = to_cblock(from_cblock(new_size) + 1); 3402 if (is_dirty(cache, new_size)) { 3403 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3404 cache_device_name(cache), 3405 (unsigned long long) from_cblock(new_size)); 3406 return false; 3407 } 3408 } 3409 3410 return true; 3411 } 3412 3413 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3414 { 3415 int r; 3416 3417 r = dm_cache_resize(cache->cmd, new_size); 3418 if (r) { 3419 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3420 metadata_operation_failed(cache, "dm_cache_resize", r); 3421 return r; 3422 } 3423 3424 set_cache_size(cache, new_size); 3425 3426 return 0; 3427 } 3428 3429 static int cache_preresume(struct dm_target *ti) 3430 { 3431 int r = 0; 3432 struct cache *cache = ti->private; 3433 dm_cblock_t csize = get_cache_dev_size(cache); 3434 3435 /* 3436 * Check to see if the cache has resized. 3437 */ 3438 if (!cache->sized) { 3439 r = resize_cache_dev(cache, csize); 3440 if (r) 3441 return r; 3442 3443 cache->sized = true; 3444 3445 } else if (csize != cache->cache_size) { 3446 if (!can_resize(cache, csize)) 3447 return -EINVAL; 3448 3449 r = resize_cache_dev(cache, csize); 3450 if (r) 3451 return r; 3452 } 3453 3454 if (!cache->loaded_mappings) { 3455 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3456 load_mapping, cache); 3457 if (r) { 3458 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3459 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3460 return r; 3461 } 3462 3463 cache->loaded_mappings = true; 3464 } 3465 3466 if (!cache->loaded_discards) { 3467 struct discard_load_info li; 3468 3469 /* 3470 * The discard bitset could have been resized, or the 3471 * discard block size changed. To be safe we start by 3472 * setting every dblock to not discarded. 3473 */ 3474 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3475 3476 discard_load_info_init(cache, &li); 3477 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3478 if (r) { 3479 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3480 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3481 return r; 3482 } 3483 set_discard_range(&li); 3484 3485 cache->loaded_discards = true; 3486 } 3487 3488 return r; 3489 } 3490 3491 static void cache_resume(struct dm_target *ti) 3492 { 3493 struct cache *cache = ti->private; 3494 3495 cache->need_tick_bio = true; 3496 do_waker(&cache->waker.work); 3497 } 3498 3499 /* 3500 * Status format: 3501 * 3502 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3503 * <cache block size> <#used cache blocks>/<#total cache blocks> 3504 * <#read hits> <#read misses> <#write hits> <#write misses> 3505 * <#demotions> <#promotions> <#dirty> 3506 * <#features> <features>* 3507 * <#core args> <core args> 3508 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3509 */ 3510 static void cache_status(struct dm_target *ti, status_type_t type, 3511 unsigned status_flags, char *result, unsigned maxlen) 3512 { 3513 int r = 0; 3514 unsigned i; 3515 ssize_t sz = 0; 3516 dm_block_t nr_free_blocks_metadata = 0; 3517 dm_block_t nr_blocks_metadata = 0; 3518 char buf[BDEVNAME_SIZE]; 3519 struct cache *cache = ti->private; 3520 dm_cblock_t residency; 3521 bool needs_check; 3522 3523 switch (type) { 3524 case STATUSTYPE_INFO: 3525 if (get_cache_mode(cache) == CM_FAIL) { 3526 DMEMIT("Fail"); 3527 break; 3528 } 3529 3530 /* Commit to ensure statistics aren't out-of-date */ 3531 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3532 (void) commit(cache, false); 3533 3534 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3535 if (r) { 3536 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3537 cache_device_name(cache), r); 3538 goto err; 3539 } 3540 3541 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3542 if (r) { 3543 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3544 cache_device_name(cache), r); 3545 goto err; 3546 } 3547 3548 residency = policy_residency(cache->policy); 3549 3550 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3551 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3552 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3553 (unsigned long long)nr_blocks_metadata, 3554 cache->sectors_per_block, 3555 (unsigned long long) from_cblock(residency), 3556 (unsigned long long) from_cblock(cache->cache_size), 3557 (unsigned) atomic_read(&cache->stats.read_hit), 3558 (unsigned) atomic_read(&cache->stats.read_miss), 3559 (unsigned) atomic_read(&cache->stats.write_hit), 3560 (unsigned) atomic_read(&cache->stats.write_miss), 3561 (unsigned) atomic_read(&cache->stats.demotion), 3562 (unsigned) atomic_read(&cache->stats.promotion), 3563 (unsigned long) atomic_read(&cache->nr_dirty)); 3564 3565 if (writethrough_mode(&cache->features)) 3566 DMEMIT("1 writethrough "); 3567 3568 else if (passthrough_mode(&cache->features)) 3569 DMEMIT("1 passthrough "); 3570 3571 else if (writeback_mode(&cache->features)) 3572 DMEMIT("1 writeback "); 3573 3574 else { 3575 DMERR("%s: internal error: unknown io mode: %d", 3576 cache_device_name(cache), (int) cache->features.io_mode); 3577 goto err; 3578 } 3579 3580 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3581 3582 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3583 if (sz < maxlen) { 3584 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3585 if (r) 3586 DMERR("%s: policy_emit_config_values returned %d", 3587 cache_device_name(cache), r); 3588 } 3589 3590 if (get_cache_mode(cache) == CM_READ_ONLY) 3591 DMEMIT("ro "); 3592 else 3593 DMEMIT("rw "); 3594 3595 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); 3596 3597 if (r || needs_check) 3598 DMEMIT("needs_check "); 3599 else 3600 DMEMIT("- "); 3601 3602 break; 3603 3604 case STATUSTYPE_TABLE: 3605 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3606 DMEMIT("%s ", buf); 3607 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3608 DMEMIT("%s ", buf); 3609 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3610 DMEMIT("%s", buf); 3611 3612 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3613 DMEMIT(" %s", cache->ctr_args[i]); 3614 if (cache->nr_ctr_args) 3615 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3616 } 3617 3618 return; 3619 3620 err: 3621 DMEMIT("Error"); 3622 } 3623 3624 /* 3625 * A cache block range can take two forms: 3626 * 3627 * i) A single cblock, eg. '3456' 3628 * ii) A begin and end cblock with dots between, eg. 123-234 3629 */ 3630 static int parse_cblock_range(struct cache *cache, const char *str, 3631 struct cblock_range *result) 3632 { 3633 char dummy; 3634 uint64_t b, e; 3635 int r; 3636 3637 /* 3638 * Try and parse form (ii) first. 3639 */ 3640 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3641 if (r < 0) 3642 return r; 3643 3644 if (r == 2) { 3645 result->begin = to_cblock(b); 3646 result->end = to_cblock(e); 3647 return 0; 3648 } 3649 3650 /* 3651 * That didn't work, try form (i). 3652 */ 3653 r = sscanf(str, "%llu%c", &b, &dummy); 3654 if (r < 0) 3655 return r; 3656 3657 if (r == 1) { 3658 result->begin = to_cblock(b); 3659 result->end = to_cblock(from_cblock(result->begin) + 1u); 3660 return 0; 3661 } 3662 3663 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3664 return -EINVAL; 3665 } 3666 3667 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3668 { 3669 uint64_t b = from_cblock(range->begin); 3670 uint64_t e = from_cblock(range->end); 3671 uint64_t n = from_cblock(cache->cache_size); 3672 3673 if (b >= n) { 3674 DMERR("%s: begin cblock out of range: %llu >= %llu", 3675 cache_device_name(cache), b, n); 3676 return -EINVAL; 3677 } 3678 3679 if (e > n) { 3680 DMERR("%s: end cblock out of range: %llu > %llu", 3681 cache_device_name(cache), e, n); 3682 return -EINVAL; 3683 } 3684 3685 if (b >= e) { 3686 DMERR("%s: invalid cblock range: %llu >= %llu", 3687 cache_device_name(cache), b, e); 3688 return -EINVAL; 3689 } 3690 3691 return 0; 3692 } 3693 3694 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3695 { 3696 struct invalidation_request req; 3697 3698 INIT_LIST_HEAD(&req.list); 3699 req.cblocks = range; 3700 atomic_set(&req.complete, 0); 3701 req.err = 0; 3702 init_waitqueue_head(&req.result_wait); 3703 3704 spin_lock(&cache->invalidation_lock); 3705 list_add(&req.list, &cache->invalidation_requests); 3706 spin_unlock(&cache->invalidation_lock); 3707 wake_worker(cache); 3708 3709 wait_event(req.result_wait, atomic_read(&req.complete)); 3710 return req.err; 3711 } 3712 3713 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3714 const char **cblock_ranges) 3715 { 3716 int r = 0; 3717 unsigned i; 3718 struct cblock_range range; 3719 3720 if (!passthrough_mode(&cache->features)) { 3721 DMERR("%s: cache has to be in passthrough mode for invalidation", 3722 cache_device_name(cache)); 3723 return -EPERM; 3724 } 3725 3726 for (i = 0; i < count; i++) { 3727 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3728 if (r) 3729 break; 3730 3731 r = validate_cblock_range(cache, &range); 3732 if (r) 3733 break; 3734 3735 /* 3736 * Pass begin and end origin blocks to the worker and wake it. 3737 */ 3738 r = request_invalidation(cache, &range); 3739 if (r) 3740 break; 3741 } 3742 3743 return r; 3744 } 3745 3746 /* 3747 * Supports 3748 * "<key> <value>" 3749 * and 3750 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3751 * 3752 * The key migration_threshold is supported by the cache target core. 3753 */ 3754 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3755 { 3756 struct cache *cache = ti->private; 3757 3758 if (!argc) 3759 return -EINVAL; 3760 3761 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3762 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3763 cache_device_name(cache)); 3764 return -EOPNOTSUPP; 3765 } 3766 3767 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3768 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3769 3770 if (argc != 2) 3771 return -EINVAL; 3772 3773 return set_config_value(cache, argv[0], argv[1]); 3774 } 3775 3776 static int cache_iterate_devices(struct dm_target *ti, 3777 iterate_devices_callout_fn fn, void *data) 3778 { 3779 int r = 0; 3780 struct cache *cache = ti->private; 3781 3782 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3783 if (!r) 3784 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3785 3786 return r; 3787 } 3788 3789 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3790 { 3791 /* 3792 * FIXME: these limits may be incompatible with the cache device 3793 */ 3794 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3795 cache->origin_sectors); 3796 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3797 } 3798 3799 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3800 { 3801 struct cache *cache = ti->private; 3802 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3803 3804 /* 3805 * If the system-determined stacked limits are compatible with the 3806 * cache's blocksize (io_opt is a factor) do not override them. 3807 */ 3808 if (io_opt_sectors < cache->sectors_per_block || 3809 do_div(io_opt_sectors, cache->sectors_per_block)) { 3810 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3811 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3812 } 3813 set_discard_limits(cache, limits); 3814 } 3815 3816 /*----------------------------------------------------------------*/ 3817 3818 static struct target_type cache_target = { 3819 .name = "cache", 3820 .version = {1, 9, 0}, 3821 .module = THIS_MODULE, 3822 .ctr = cache_ctr, 3823 .dtr = cache_dtr, 3824 .map = cache_map, 3825 .end_io = cache_end_io, 3826 .postsuspend = cache_postsuspend, 3827 .preresume = cache_preresume, 3828 .resume = cache_resume, 3829 .status = cache_status, 3830 .message = cache_message, 3831 .iterate_devices = cache_iterate_devices, 3832 .io_hints = cache_io_hints, 3833 }; 3834 3835 static int __init dm_cache_init(void) 3836 { 3837 int r; 3838 3839 r = dm_register_target(&cache_target); 3840 if (r) { 3841 DMERR("cache target registration failed: %d", r); 3842 return r; 3843 } 3844 3845 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3846 if (!migration_cache) { 3847 dm_unregister_target(&cache_target); 3848 return -ENOMEM; 3849 } 3850 3851 return 0; 3852 } 3853 3854 static void __exit dm_cache_exit(void) 3855 { 3856 dm_unregister_target(&cache_target); 3857 kmem_cache_destroy(migration_cache); 3858 } 3859 3860 module_init(dm_cache_init); 3861 module_exit(dm_cache_exit); 3862 3863 MODULE_DESCRIPTION(DM_NAME " cache target"); 3864 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3865 MODULE_LICENSE("GPL"); 3866