1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/jiffies.h> 15 #include <linux/init.h> 16 #include <linux/mempool.h> 17 #include <linux/module.h> 18 #include <linux/slab.h> 19 #include <linux/vmalloc.h> 20 21 #define DM_MSG_PREFIX "cache" 22 23 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 24 "A percentage of time allocated for copying to and/or from cache"); 25 26 /*----------------------------------------------------------------*/ 27 28 #define IOT_RESOLUTION 4 29 30 struct io_tracker { 31 spinlock_t lock; 32 33 /* 34 * Sectors of in-flight IO. 35 */ 36 sector_t in_flight; 37 38 /* 39 * The time, in jiffies, when this device became idle (if it is 40 * indeed idle). 41 */ 42 unsigned long idle_time; 43 unsigned long last_update_time; 44 }; 45 46 static void iot_init(struct io_tracker *iot) 47 { 48 spin_lock_init(&iot->lock); 49 iot->in_flight = 0ul; 50 iot->idle_time = 0ul; 51 iot->last_update_time = jiffies; 52 } 53 54 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) 55 { 56 if (iot->in_flight) 57 return false; 58 59 return time_after(jiffies, iot->idle_time + jifs); 60 } 61 62 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) 63 { 64 bool r; 65 unsigned long flags; 66 67 spin_lock_irqsave(&iot->lock, flags); 68 r = __iot_idle_for(iot, jifs); 69 spin_unlock_irqrestore(&iot->lock, flags); 70 71 return r; 72 } 73 74 static void iot_io_begin(struct io_tracker *iot, sector_t len) 75 { 76 unsigned long flags; 77 78 spin_lock_irqsave(&iot->lock, flags); 79 iot->in_flight += len; 80 spin_unlock_irqrestore(&iot->lock, flags); 81 } 82 83 static void __iot_io_end(struct io_tracker *iot, sector_t len) 84 { 85 iot->in_flight -= len; 86 if (!iot->in_flight) 87 
iot->idle_time = jiffies; 88 } 89 90 static void iot_io_end(struct io_tracker *iot, sector_t len) 91 { 92 unsigned long flags; 93 94 spin_lock_irqsave(&iot->lock, flags); 95 __iot_io_end(iot, len); 96 spin_unlock_irqrestore(&iot->lock, flags); 97 } 98 99 /*----------------------------------------------------------------*/ 100 101 /* 102 * Glossary: 103 * 104 * oblock: index of an origin block 105 * cblock: index of a cache block 106 * promotion: movement of a block from origin to cache 107 * demotion: movement of a block from cache to origin 108 * migration: movement of a block between the origin and cache device, 109 * either direction 110 */ 111 112 /*----------------------------------------------------------------*/ 113 114 /* 115 * There are a couple of places where we let a bio run, but want to do some 116 * work before calling its endio function. We do this by temporarily 117 * changing the endio fn. 118 */ 119 struct dm_hook_info { 120 bio_end_io_t *bi_end_io; 121 void *bi_private; 122 }; 123 124 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, 125 bio_end_io_t *bi_end_io, void *bi_private) 126 { 127 h->bi_end_io = bio->bi_end_io; 128 h->bi_private = bio->bi_private; 129 130 bio->bi_end_io = bi_end_io; 131 bio->bi_private = bi_private; 132 } 133 134 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) 135 { 136 bio->bi_end_io = h->bi_end_io; 137 bio->bi_private = h->bi_private; 138 } 139 140 /*----------------------------------------------------------------*/ 141 142 #define MIGRATION_POOL_SIZE 128 143 #define COMMIT_PERIOD HZ 144 #define MIGRATION_COUNT_WINDOW 10 145 146 /* 147 * The block size of the device holding cache data must be 148 * between 32KB and 1GB. 
149 */ 150 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 151 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 152 153 enum cache_metadata_mode { 154 CM_WRITE, /* metadata may be changed */ 155 CM_READ_ONLY, /* metadata may not be changed */ 156 CM_FAIL 157 }; 158 159 enum cache_io_mode { 160 /* 161 * Data is written to cached blocks only. These blocks are marked 162 * dirty. If you lose the cache device you will lose data. 163 * Potential performance increase for both reads and writes. 164 */ 165 CM_IO_WRITEBACK, 166 167 /* 168 * Data is written to both cache and origin. Blocks are never 169 * dirty. Potential performance benfit for reads only. 170 */ 171 CM_IO_WRITETHROUGH, 172 173 /* 174 * A degraded mode useful for various cache coherency situations 175 * (eg, rolling back snapshots). Reads and writes always go to the 176 * origin. If a write goes to a cached oblock, then the cache 177 * block is invalidated. 178 */ 179 CM_IO_PASSTHROUGH 180 }; 181 182 struct cache_features { 183 enum cache_metadata_mode mode; 184 enum cache_io_mode io_mode; 185 }; 186 187 struct cache_stats { 188 atomic_t read_hit; 189 atomic_t read_miss; 190 atomic_t write_hit; 191 atomic_t write_miss; 192 atomic_t demotion; 193 atomic_t promotion; 194 atomic_t copies_avoided; 195 atomic_t cache_cell_clash; 196 atomic_t commit_count; 197 atomic_t discard_count; 198 }; 199 200 /* 201 * Defines a range of cblocks, begin to (end - 1) are in the range. end is 202 * the one-past-the-end value. 203 */ 204 struct cblock_range { 205 dm_cblock_t begin; 206 dm_cblock_t end; 207 }; 208 209 struct invalidation_request { 210 struct list_head list; 211 struct cblock_range *cblocks; 212 213 atomic_t complete; 214 int err; 215 216 wait_queue_head_t result_wait; 217 }; 218 219 struct cache { 220 struct dm_target *ti; 221 struct dm_target_callbacks callbacks; 222 223 struct dm_cache_metadata *cmd; 224 225 /* 226 * Metadata is written to this device. 
227 */ 228 struct dm_dev *metadata_dev; 229 230 /* 231 * The slower of the two data devices. Typically a spindle. 232 */ 233 struct dm_dev *origin_dev; 234 235 /* 236 * The faster of the two data devices. Typically an SSD. 237 */ 238 struct dm_dev *cache_dev; 239 240 /* 241 * Size of the origin device in _complete_ blocks and native sectors. 242 */ 243 dm_oblock_t origin_blocks; 244 sector_t origin_sectors; 245 246 /* 247 * Size of the cache device in blocks. 248 */ 249 dm_cblock_t cache_size; 250 251 /* 252 * Fields for converting from sectors to blocks. 253 */ 254 uint32_t sectors_per_block; 255 int sectors_per_block_shift; 256 257 spinlock_t lock; 258 struct list_head deferred_cells; 259 struct bio_list deferred_bios; 260 struct bio_list deferred_flush_bios; 261 struct bio_list deferred_writethrough_bios; 262 struct list_head quiesced_migrations; 263 struct list_head completed_migrations; 264 struct list_head need_commit_migrations; 265 sector_t migration_threshold; 266 wait_queue_head_t migration_wait; 267 atomic_t nr_allocated_migrations; 268 269 /* 270 * The number of in flight migrations that are performing 271 * background io. eg, promotion, writeback. 272 */ 273 atomic_t nr_io_migrations; 274 275 wait_queue_head_t quiescing_wait; 276 atomic_t quiescing; 277 atomic_t quiescing_ack; 278 279 /* 280 * cache_size entries, dirty if set 281 */ 282 atomic_t nr_dirty; 283 unsigned long *dirty_bitset; 284 285 /* 286 * origin_blocks entries, discarded if set. 287 */ 288 dm_dblock_t discard_nr_blocks; 289 unsigned long *discard_bitset; 290 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 291 292 /* 293 * Rather than reconstructing the table line for the status we just 294 * save it and regurgitate. 
295 */ 296 unsigned nr_ctr_args; 297 const char **ctr_args; 298 299 struct dm_kcopyd_client *copier; 300 struct workqueue_struct *wq; 301 struct work_struct worker; 302 303 struct delayed_work waker; 304 unsigned long last_commit_jiffies; 305 306 struct dm_bio_prison *prison; 307 struct dm_deferred_set *all_io_ds; 308 309 mempool_t *migration_pool; 310 311 struct dm_cache_policy *policy; 312 unsigned policy_nr_args; 313 314 bool need_tick_bio:1; 315 bool sized:1; 316 bool invalidate:1; 317 bool commit_requested:1; 318 bool loaded_mappings:1; 319 bool loaded_discards:1; 320 321 /* 322 * Cache features such as write-through. 323 */ 324 struct cache_features features; 325 326 struct cache_stats stats; 327 328 /* 329 * Invalidation fields. 330 */ 331 spinlock_t invalidation_lock; 332 struct list_head invalidation_requests; 333 334 struct io_tracker origin_tracker; 335 }; 336 337 struct per_bio_data { 338 bool tick:1; 339 unsigned req_nr:2; 340 struct dm_deferred_entry *all_io_entry; 341 struct dm_hook_info hook_info; 342 sector_t len; 343 344 /* 345 * writethrough fields. These MUST remain at the end of this 346 * structure and the 'cache' member must be the first as it 347 * is used to determine the offset of the writethrough fields. 348 */ 349 struct cache *cache; 350 dm_cblock_t cblock; 351 struct dm_bio_details bio_details; 352 }; 353 354 struct dm_cache_migration { 355 struct list_head list; 356 struct cache *cache; 357 358 unsigned long start_jiffies; 359 dm_oblock_t old_oblock; 360 dm_oblock_t new_oblock; 361 dm_cblock_t cblock; 362 363 bool err:1; 364 bool discard:1; 365 bool writeback:1; 366 bool demote:1; 367 bool promote:1; 368 bool requeue_holder:1; 369 bool invalidate:1; 370 371 struct dm_bio_prison_cell *old_ocell; 372 struct dm_bio_prison_cell *new_ocell; 373 }; 374 375 /* 376 * Processing a bio in the worker thread may require these memory 377 * allocations. We prealloc to avoid deadlocks (the same worker thread 378 * frees them back to the mempool). 
379 */ 380 struct prealloc { 381 struct dm_cache_migration *mg; 382 struct dm_bio_prison_cell *cell1; 383 struct dm_bio_prison_cell *cell2; 384 }; 385 386 static enum cache_metadata_mode get_cache_mode(struct cache *cache); 387 388 static void wake_worker(struct cache *cache) 389 { 390 queue_work(cache->wq, &cache->worker); 391 } 392 393 /*----------------------------------------------------------------*/ 394 395 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 396 { 397 /* FIXME: change to use a local slab. */ 398 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 399 } 400 401 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 402 { 403 dm_bio_prison_free_cell(cache->prison, cell); 404 } 405 406 static struct dm_cache_migration *alloc_migration(struct cache *cache) 407 { 408 struct dm_cache_migration *mg; 409 410 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 411 if (mg) { 412 mg->cache = cache; 413 atomic_inc(&mg->cache->nr_allocated_migrations); 414 } 415 416 return mg; 417 } 418 419 static void free_migration(struct dm_cache_migration *mg) 420 { 421 struct cache *cache = mg->cache; 422 423 if (atomic_dec_and_test(&cache->nr_allocated_migrations)) 424 wake_up(&cache->migration_wait); 425 426 mempool_free(mg, cache->migration_pool); 427 wake_worker(cache); 428 } 429 430 static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 431 { 432 if (!p->mg) { 433 p->mg = alloc_migration(cache); 434 if (!p->mg) 435 return -ENOMEM; 436 } 437 438 if (!p->cell1) { 439 p->cell1 = alloc_prison_cell(cache); 440 if (!p->cell1) 441 return -ENOMEM; 442 } 443 444 if (!p->cell2) { 445 p->cell2 = alloc_prison_cell(cache); 446 if (!p->cell2) 447 return -ENOMEM; 448 } 449 450 return 0; 451 } 452 453 static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 454 { 455 if (p->cell2) 456 free_prison_cell(cache, p->cell2); 457 458 if (p->cell1) 459 free_prison_cell(cache, p->cell1); 460 
461 if (p->mg) 462 free_migration(p->mg); 463 } 464 465 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 466 { 467 struct dm_cache_migration *mg = p->mg; 468 469 BUG_ON(!mg); 470 p->mg = NULL; 471 472 return mg; 473 } 474 475 /* 476 * You must have a cell within the prealloc struct to return. If not this 477 * function will BUG() rather than returning NULL. 478 */ 479 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 480 { 481 struct dm_bio_prison_cell *r = NULL; 482 483 if (p->cell1) { 484 r = p->cell1; 485 p->cell1 = NULL; 486 487 } else if (p->cell2) { 488 r = p->cell2; 489 p->cell2 = NULL; 490 } else 491 BUG(); 492 493 return r; 494 } 495 496 /* 497 * You can't have more than two cells in a prealloc struct. BUG() will be 498 * called if you try and overfill. 499 */ 500 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 501 { 502 if (!p->cell2) 503 p->cell2 = cell; 504 505 else if (!p->cell1) 506 p->cell1 = cell; 507 508 else 509 BUG(); 510 } 511 512 /*----------------------------------------------------------------*/ 513 514 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) 515 { 516 key->virtual = 0; 517 key->dev = 0; 518 key->block_begin = from_oblock(begin); 519 key->block_end = from_oblock(end); 520 } 521 522 /* 523 * The caller hands in a preallocated cell, and a free function for it. 524 * The cell will be freed if there's an error, or if it wasn't used because 525 * a cell with that key already exists. 
526 */ 527 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 528 529 static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, 530 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 531 cell_free_fn free_fn, void *free_context, 532 struct dm_bio_prison_cell **cell_result) 533 { 534 int r; 535 struct dm_cell_key key; 536 537 build_key(oblock_begin, oblock_end, &key); 538 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 539 if (r) 540 free_fn(free_context, cell_prealloc); 541 542 return r; 543 } 544 545 static int bio_detain(struct cache *cache, dm_oblock_t oblock, 546 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 547 cell_free_fn free_fn, void *free_context, 548 struct dm_bio_prison_cell **cell_result) 549 { 550 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); 551 return bio_detain_range(cache, oblock, end, bio, 552 cell_prealloc, free_fn, free_context, cell_result); 553 } 554 555 static int get_cell(struct cache *cache, 556 dm_oblock_t oblock, 557 struct prealloc *structs, 558 struct dm_bio_prison_cell **cell_result) 559 { 560 int r; 561 struct dm_cell_key key; 562 struct dm_bio_prison_cell *cell_prealloc; 563 564 cell_prealloc = prealloc_get_cell(structs); 565 566 build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); 567 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 568 if (r) 569 prealloc_put_cell(structs, cell_prealloc); 570 571 return r; 572 } 573 574 /*----------------------------------------------------------------*/ 575 576 static bool is_dirty(struct cache *cache, dm_cblock_t b) 577 { 578 return test_bit(from_cblock(b), cache->dirty_bitset); 579 } 580 581 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 582 { 583 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 584 atomic_inc(&cache->nr_dirty); 585 policy_set_dirty(cache->policy, oblock); 586 } 587 } 588 
589 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 590 { 591 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 592 policy_clear_dirty(cache->policy, oblock); 593 if (atomic_dec_return(&cache->nr_dirty) == 0) 594 dm_table_event(cache->ti->table); 595 } 596 } 597 598 /*----------------------------------------------------------------*/ 599 600 static bool block_size_is_power_of_two(struct cache *cache) 601 { 602 return cache->sectors_per_block_shift >= 0; 603 } 604 605 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 606 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 607 __always_inline 608 #endif 609 static dm_block_t block_div(dm_block_t b, uint32_t n) 610 { 611 do_div(b, n); 612 613 return b; 614 } 615 616 static dm_block_t oblocks_per_dblock(struct cache *cache) 617 { 618 dm_block_t oblocks = cache->discard_block_size; 619 620 if (block_size_is_power_of_two(cache)) 621 oblocks >>= cache->sectors_per_block_shift; 622 else 623 oblocks = block_div(oblocks, cache->sectors_per_block); 624 625 return oblocks; 626 } 627 628 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 629 { 630 return to_dblock(block_div(from_oblock(oblock), 631 oblocks_per_dblock(cache))); 632 } 633 634 static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) 635 { 636 return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); 637 } 638 639 static void set_discard(struct cache *cache, dm_dblock_t b) 640 { 641 unsigned long flags; 642 643 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); 644 atomic_inc(&cache->stats.discard_count); 645 646 spin_lock_irqsave(&cache->lock, flags); 647 set_bit(from_dblock(b), cache->discard_bitset); 648 spin_unlock_irqrestore(&cache->lock, flags); 649 } 650 651 static void clear_discard(struct cache *cache, dm_dblock_t b) 652 { 653 unsigned long flags; 654 655 spin_lock_irqsave(&cache->lock, flags); 656 
clear_bit(from_dblock(b), cache->discard_bitset); 657 spin_unlock_irqrestore(&cache->lock, flags); 658 } 659 660 static bool is_discarded(struct cache *cache, dm_dblock_t b) 661 { 662 int r; 663 unsigned long flags; 664 665 spin_lock_irqsave(&cache->lock, flags); 666 r = test_bit(from_dblock(b), cache->discard_bitset); 667 spin_unlock_irqrestore(&cache->lock, flags); 668 669 return r; 670 } 671 672 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 673 { 674 int r; 675 unsigned long flags; 676 677 spin_lock_irqsave(&cache->lock, flags); 678 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 679 cache->discard_bitset); 680 spin_unlock_irqrestore(&cache->lock, flags); 681 682 return r; 683 } 684 685 /*----------------------------------------------------------------*/ 686 687 static void load_stats(struct cache *cache) 688 { 689 struct dm_cache_statistics stats; 690 691 dm_cache_metadata_get_stats(cache->cmd, &stats); 692 atomic_set(&cache->stats.read_hit, stats.read_hits); 693 atomic_set(&cache->stats.read_miss, stats.read_misses); 694 atomic_set(&cache->stats.write_hit, stats.write_hits); 695 atomic_set(&cache->stats.write_miss, stats.write_misses); 696 } 697 698 static void save_stats(struct cache *cache) 699 { 700 struct dm_cache_statistics stats; 701 702 if (get_cache_mode(cache) >= CM_READ_ONLY) 703 return; 704 705 stats.read_hits = atomic_read(&cache->stats.read_hit); 706 stats.read_misses = atomic_read(&cache->stats.read_miss); 707 stats.write_hits = atomic_read(&cache->stats.write_hit); 708 stats.write_misses = atomic_read(&cache->stats.write_miss); 709 710 dm_cache_metadata_set_stats(cache->cmd, &stats); 711 } 712 713 /*---------------------------------------------------------------- 714 * Per bio data 715 *--------------------------------------------------------------*/ 716 717 /* 718 * If using writeback, leave out struct per_bio_data's writethrough fields. 
719 */ 720 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 721 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 722 723 static bool writethrough_mode(struct cache_features *f) 724 { 725 return f->io_mode == CM_IO_WRITETHROUGH; 726 } 727 728 static bool writeback_mode(struct cache_features *f) 729 { 730 return f->io_mode == CM_IO_WRITEBACK; 731 } 732 733 static bool passthrough_mode(struct cache_features *f) 734 { 735 return f->io_mode == CM_IO_PASSTHROUGH; 736 } 737 738 static size_t get_per_bio_data_size(struct cache *cache) 739 { 740 return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 741 } 742 743 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 744 { 745 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 746 BUG_ON(!pb); 747 return pb; 748 } 749 750 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 751 { 752 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 753 754 pb->tick = false; 755 pb->req_nr = dm_bio_get_target_bio_nr(bio); 756 pb->all_io_entry = NULL; 757 pb->len = 0; 758 759 return pb; 760 } 761 762 /*---------------------------------------------------------------- 763 * Remapping 764 *--------------------------------------------------------------*/ 765 static void remap_to_origin(struct cache *cache, struct bio *bio) 766 { 767 bio->bi_bdev = cache->origin_dev->bdev; 768 } 769 770 static void remap_to_cache(struct cache *cache, struct bio *bio, 771 dm_cblock_t cblock) 772 { 773 sector_t bi_sector = bio->bi_iter.bi_sector; 774 sector_t block = from_cblock(cblock); 775 776 bio->bi_bdev = cache->cache_dev->bdev; 777 if (!block_size_is_power_of_two(cache)) 778 bio->bi_iter.bi_sector = 779 (block * cache->sectors_per_block) + 780 sector_div(bi_sector, cache->sectors_per_block); 781 else 782 bio->bi_iter.bi_sector = 783 (block << cache->sectors_per_block_shift) | 784 (bi_sector & (cache->sectors_per_block - 1)); 785 } 786 787 static 
void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 788 { 789 unsigned long flags; 790 size_t pb_data_size = get_per_bio_data_size(cache); 791 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 792 793 spin_lock_irqsave(&cache->lock, flags); 794 if (cache->need_tick_bio && 795 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) { 796 pb->tick = true; 797 cache->need_tick_bio = false; 798 } 799 spin_unlock_irqrestore(&cache->lock, flags); 800 } 801 802 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 803 dm_oblock_t oblock) 804 { 805 check_if_tick_bio_needed(cache, bio); 806 remap_to_origin(cache, bio); 807 if (bio_data_dir(bio) == WRITE) 808 clear_discard(cache, oblock_to_dblock(cache, oblock)); 809 } 810 811 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 812 dm_oblock_t oblock, dm_cblock_t cblock) 813 { 814 check_if_tick_bio_needed(cache, bio); 815 remap_to_cache(cache, bio, cblock); 816 if (bio_data_dir(bio) == WRITE) { 817 set_dirty(cache, oblock, cblock); 818 clear_discard(cache, oblock_to_dblock(cache, oblock)); 819 } 820 } 821 822 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 823 { 824 sector_t block_nr = bio->bi_iter.bi_sector; 825 826 if (!block_size_is_power_of_two(cache)) 827 (void) sector_div(block_nr, cache->sectors_per_block); 828 else 829 block_nr >>= cache->sectors_per_block_shift; 830 831 return to_oblock(block_nr); 832 } 833 834 static int bio_triggers_commit(struct cache *cache, struct bio *bio) 835 { 836 return bio->bi_rw & (REQ_FLUSH | REQ_FUA); 837 } 838 839 /* 840 * You must increment the deferred set whilst the prison cell is held. To 841 * encourage this, we ask for 'cell' to be passed in. 
842 */ 843 static void inc_ds(struct cache *cache, struct bio *bio, 844 struct dm_bio_prison_cell *cell) 845 { 846 size_t pb_data_size = get_per_bio_data_size(cache); 847 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 848 849 BUG_ON(!cell); 850 BUG_ON(pb->all_io_entry); 851 852 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 853 } 854 855 static bool accountable_bio(struct cache *cache, struct bio *bio) 856 { 857 return ((bio->bi_bdev == cache->origin_dev->bdev) && 858 !(bio->bi_rw & REQ_DISCARD)); 859 } 860 861 static void accounted_begin(struct cache *cache, struct bio *bio) 862 { 863 size_t pb_data_size = get_per_bio_data_size(cache); 864 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 865 866 if (accountable_bio(cache, bio)) { 867 pb->len = bio_sectors(bio); 868 iot_io_begin(&cache->origin_tracker, pb->len); 869 } 870 } 871 872 static void accounted_complete(struct cache *cache, struct bio *bio) 873 { 874 size_t pb_data_size = get_per_bio_data_size(cache); 875 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 876 877 iot_io_end(&cache->origin_tracker, pb->len); 878 } 879 880 static void accounted_request(struct cache *cache, struct bio *bio) 881 { 882 accounted_begin(cache, bio); 883 generic_make_request(bio); 884 } 885 886 static void issue(struct cache *cache, struct bio *bio) 887 { 888 unsigned long flags; 889 890 if (!bio_triggers_commit(cache, bio)) { 891 accounted_request(cache, bio); 892 return; 893 } 894 895 /* 896 * Batch together any bios that trigger commits and then issue a 897 * single commit for them in do_worker(). 
898 */ 899 spin_lock_irqsave(&cache->lock, flags); 900 cache->commit_requested = true; 901 bio_list_add(&cache->deferred_flush_bios, bio); 902 spin_unlock_irqrestore(&cache->lock, flags); 903 } 904 905 static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) 906 { 907 inc_ds(cache, bio, cell); 908 issue(cache, bio); 909 } 910 911 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 912 { 913 unsigned long flags; 914 915 spin_lock_irqsave(&cache->lock, flags); 916 bio_list_add(&cache->deferred_writethrough_bios, bio); 917 spin_unlock_irqrestore(&cache->lock, flags); 918 919 wake_worker(cache); 920 } 921 922 static void writethrough_endio(struct bio *bio, int err) 923 { 924 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 925 926 dm_unhook_bio(&pb->hook_info, bio); 927 928 if (err) { 929 bio_endio(bio, err); 930 return; 931 } 932 933 dm_bio_restore(&pb->bio_details, bio); 934 remap_to_cache(pb->cache, bio, pb->cblock); 935 936 /* 937 * We can't issue this bio directly, since we're in interrupt 938 * context. So it gets put on a bio list for processing by the 939 * worker thread. 940 */ 941 defer_writethrough_bio(pb->cache, bio); 942 } 943 944 /* 945 * When running in writethrough mode we need to send writes to clean blocks 946 * to both the cache and origin devices. In future we'd like to clone the 947 * bio and send them in parallel, but for now we're doing them in 948 * series as this is easier. 
949 */ 950 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 951 dm_oblock_t oblock, dm_cblock_t cblock) 952 { 953 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 954 955 pb->cache = cache; 956 pb->cblock = cblock; 957 dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); 958 dm_bio_record(&pb->bio_details, bio); 959 960 remap_to_origin_clear_discard(pb->cache, bio, oblock); 961 } 962 963 /*---------------------------------------------------------------- 964 * Failure modes 965 *--------------------------------------------------------------*/ 966 static enum cache_metadata_mode get_cache_mode(struct cache *cache) 967 { 968 return cache->features.mode; 969 } 970 971 static const char *cache_device_name(struct cache *cache) 972 { 973 return dm_device_name(dm_table_get_md(cache->ti->table)); 974 } 975 976 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode) 977 { 978 const char *descs[] = { 979 "write", 980 "read-only", 981 "fail" 982 }; 983 984 dm_table_event(cache->ti->table); 985 DMINFO("%s: switching cache to %s mode", 986 cache_device_name(cache), descs[(int)mode]); 987 } 988 989 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) 990 { 991 bool needs_check = dm_cache_metadata_needs_check(cache->cmd); 992 enum cache_metadata_mode old_mode = get_cache_mode(cache); 993 994 if (new_mode == CM_WRITE && needs_check) { 995 DMERR("%s: unable to switch cache to write mode until repaired.", 996 cache_device_name(cache)); 997 if (old_mode != new_mode) 998 new_mode = old_mode; 999 else 1000 new_mode = CM_READ_ONLY; 1001 } 1002 1003 /* Never move out of fail mode */ 1004 if (old_mode == CM_FAIL) 1005 new_mode = CM_FAIL; 1006 1007 switch (new_mode) { 1008 case CM_FAIL: 1009 case CM_READ_ONLY: 1010 dm_cache_metadata_set_read_only(cache->cmd); 1011 break; 1012 1013 case CM_WRITE: 1014 dm_cache_metadata_set_read_write(cache->cmd); 1015 break; 1016 } 1017 1018 
cache->features.mode = new_mode; 1019 1020 if (new_mode != old_mode) 1021 notify_mode_switch(cache, new_mode); 1022 } 1023 1024 static void abort_transaction(struct cache *cache) 1025 { 1026 const char *dev_name = cache_device_name(cache); 1027 1028 if (get_cache_mode(cache) >= CM_READ_ONLY) 1029 return; 1030 1031 if (dm_cache_metadata_set_needs_check(cache->cmd)) { 1032 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 1033 set_cache_mode(cache, CM_FAIL); 1034 } 1035 1036 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 1037 if (dm_cache_metadata_abort(cache->cmd)) { 1038 DMERR("%s: failed to abort metadata transaction", dev_name); 1039 set_cache_mode(cache, CM_FAIL); 1040 } 1041 } 1042 1043 static void metadata_operation_failed(struct cache *cache, const char *op, int r) 1044 { 1045 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 1046 cache_device_name(cache), op, r); 1047 abort_transaction(cache); 1048 set_cache_mode(cache, CM_READ_ONLY); 1049 } 1050 1051 /*---------------------------------------------------------------- 1052 * Migration processing 1053 * 1054 * Migration covers moving data from the origin device to the cache, or 1055 * vice versa. 1056 *--------------------------------------------------------------*/ 1057 static void inc_io_migrations(struct cache *cache) 1058 { 1059 atomic_inc(&cache->nr_io_migrations); 1060 } 1061 1062 static void dec_io_migrations(struct cache *cache) 1063 { 1064 atomic_dec(&cache->nr_io_migrations); 1065 } 1066 1067 static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell, 1068 bool holder, struct bio_list *bios) 1069 { 1070 (holder ? 
dm_cell_release : dm_cell_release_no_holder) 1071 (cache->prison, cell, bios); 1072 free_prison_cell(cache, cell); 1073 } 1074 1075 static bool discard_or_flush(struct bio *bio) 1076 { 1077 return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD); 1078 } 1079 1080 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) 1081 { 1082 if (discard_or_flush(cell->holder)) 1083 /* 1084 * We have to handle these bios 1085 * individually. 1086 */ 1087 __cell_release(cache, cell, true, &cache->deferred_bios); 1088 1089 else 1090 list_add_tail(&cell->user_list, &cache->deferred_cells); 1091 } 1092 1093 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) 1094 { 1095 unsigned long flags; 1096 1097 if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { 1098 /* 1099 * There was no prisoner to promote to holder, the 1100 * cell has been released. 1101 */ 1102 free_prison_cell(cache, cell); 1103 return; 1104 } 1105 1106 spin_lock_irqsave(&cache->lock, flags); 1107 __cell_defer(cache, cell); 1108 spin_unlock_irqrestore(&cache->lock, flags); 1109 1110 wake_worker(cache); 1111 } 1112 1113 static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) 1114 { 1115 dm_cell_error(cache->prison, cell, err); 1116 dm_bio_prison_free_cell(cache->prison, cell); 1117 } 1118 1119 static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) 1120 { 1121 cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); 1122 } 1123 1124 static void free_io_migration(struct dm_cache_migration *mg) 1125 { 1126 dec_io_migrations(mg->cache); 1127 free_migration(mg); 1128 } 1129 1130 static void migration_failure(struct dm_cache_migration *mg) 1131 { 1132 struct cache *cache = mg->cache; 1133 const char *dev_name = cache_device_name(cache); 1134 1135 if (mg->writeback) { 1136 DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); 1137 set_dirty(cache, mg->old_oblock, mg->cblock); 1138 
cell_defer(cache, mg->old_ocell, false); 1139 1140 } else if (mg->demote) { 1141 DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); 1142 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 1143 1144 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1145 if (mg->promote) 1146 cell_defer(cache, mg->new_ocell, true); 1147 } else { 1148 DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); 1149 policy_remove_mapping(cache->policy, mg->new_oblock); 1150 cell_defer(cache, mg->new_ocell, true); 1151 } 1152 1153 free_io_migration(mg); 1154 } 1155 1156 static void migration_success_pre_commit(struct dm_cache_migration *mg) 1157 { 1158 int r; 1159 unsigned long flags; 1160 struct cache *cache = mg->cache; 1161 1162 if (mg->writeback) { 1163 clear_dirty(cache, mg->old_oblock, mg->cblock); 1164 cell_defer(cache, mg->old_ocell, false); 1165 free_io_migration(mg); 1166 return; 1167 1168 } else if (mg->demote) { 1169 r = dm_cache_remove_mapping(cache->cmd, mg->cblock); 1170 if (r) { 1171 DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", 1172 cache_device_name(cache)); 1173 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 1174 policy_force_mapping(cache->policy, mg->new_oblock, 1175 mg->old_oblock); 1176 if (mg->promote) 1177 cell_defer(cache, mg->new_ocell, true); 1178 free_io_migration(mg); 1179 return; 1180 } 1181 } else { 1182 r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); 1183 if (r) { 1184 DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", 1185 cache_device_name(cache)); 1186 metadata_operation_failed(cache, "dm_cache_insert_mapping", r); 1187 policy_remove_mapping(cache->policy, mg->new_oblock); 1188 free_io_migration(mg); 1189 return; 1190 } 1191 } 1192 1193 spin_lock_irqsave(&cache->lock, flags); 1194 list_add_tail(&mg->list, &cache->need_commit_migrations); 1195 cache->commit_requested = true; 1196 spin_unlock_irqrestore(&cache->lock, 
flags); 1197 } 1198 1199 static void migration_success_post_commit(struct dm_cache_migration *mg) 1200 { 1201 unsigned long flags; 1202 struct cache *cache = mg->cache; 1203 1204 if (mg->writeback) { 1205 DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", 1206 cache_device_name(cache)); 1207 return; 1208 1209 } else if (mg->demote) { 1210 cell_defer(cache, mg->old_ocell, mg->promote ? false : true); 1211 1212 if (mg->promote) { 1213 mg->demote = false; 1214 1215 spin_lock_irqsave(&cache->lock, flags); 1216 list_add_tail(&mg->list, &cache->quiesced_migrations); 1217 spin_unlock_irqrestore(&cache->lock, flags); 1218 1219 } else { 1220 if (mg->invalidate) 1221 policy_remove_mapping(cache->policy, mg->old_oblock); 1222 free_io_migration(mg); 1223 } 1224 1225 } else { 1226 if (mg->requeue_holder) { 1227 clear_dirty(cache, mg->new_oblock, mg->cblock); 1228 cell_defer(cache, mg->new_ocell, true); 1229 } else { 1230 /* 1231 * The block was promoted via an overwrite, so it's dirty. 1232 */ 1233 set_dirty(cache, mg->new_oblock, mg->cblock); 1234 bio_endio(mg->new_ocell->holder, 0); 1235 cell_defer(cache, mg->new_ocell, false); 1236 } 1237 free_io_migration(mg); 1238 } 1239 } 1240 1241 static void copy_complete(int read_err, unsigned long write_err, void *context) 1242 { 1243 unsigned long flags; 1244 struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 1245 struct cache *cache = mg->cache; 1246 1247 if (read_err || write_err) 1248 mg->err = true; 1249 1250 spin_lock_irqsave(&cache->lock, flags); 1251 list_add_tail(&mg->list, &cache->completed_migrations); 1252 spin_unlock_irqrestore(&cache->lock, flags); 1253 1254 wake_worker(cache); 1255 } 1256 1257 static void issue_copy(struct dm_cache_migration *mg) 1258 { 1259 int r; 1260 struct dm_io_region o_region, c_region; 1261 struct cache *cache = mg->cache; 1262 sector_t cblock = from_cblock(mg->cblock); 1263 1264 o_region.bdev = cache->origin_dev->bdev; 1265 o_region.count = cache->sectors_per_block; 
1266 1267 c_region.bdev = cache->cache_dev->bdev; 1268 c_region.sector = cblock * cache->sectors_per_block; 1269 c_region.count = cache->sectors_per_block; 1270 1271 if (mg->writeback || mg->demote) { 1272 /* demote */ 1273 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 1274 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 1275 } else { 1276 /* promote */ 1277 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 1278 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 1279 } 1280 1281 if (r < 0) { 1282 DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); 1283 migration_failure(mg); 1284 } 1285 } 1286 1287 static void overwrite_endio(struct bio *bio, int err) 1288 { 1289 struct dm_cache_migration *mg = bio->bi_private; 1290 struct cache *cache = mg->cache; 1291 size_t pb_data_size = get_per_bio_data_size(cache); 1292 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1293 unsigned long flags; 1294 1295 dm_unhook_bio(&pb->hook_info, bio); 1296 1297 if (err) 1298 mg->err = true; 1299 1300 mg->requeue_holder = false; 1301 1302 spin_lock_irqsave(&cache->lock, flags); 1303 list_add_tail(&mg->list, &cache->completed_migrations); 1304 spin_unlock_irqrestore(&cache->lock, flags); 1305 1306 wake_worker(cache); 1307 } 1308 1309 static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) 1310 { 1311 size_t pb_data_size = get_per_bio_data_size(mg->cache); 1312 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1313 1314 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); 1315 remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); 1316 1317 /* 1318 * No need to inc_ds() here, since the cell will be held for the 1319 * duration of the io. 
1320 */ 1321 accounted_request(mg->cache, bio); 1322 } 1323 1324 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) 1325 { 1326 return (bio_data_dir(bio) == WRITE) && 1327 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); 1328 } 1329 1330 static void avoid_copy(struct dm_cache_migration *mg) 1331 { 1332 atomic_inc(&mg->cache->stats.copies_avoided); 1333 migration_success_pre_commit(mg); 1334 } 1335 1336 static void calc_discard_block_range(struct cache *cache, struct bio *bio, 1337 dm_dblock_t *b, dm_dblock_t *e) 1338 { 1339 sector_t sb = bio->bi_iter.bi_sector; 1340 sector_t se = bio_end_sector(bio); 1341 1342 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); 1343 1344 if (se - sb < cache->discard_block_size) 1345 *e = *b; 1346 else 1347 *e = to_dblock(block_div(se, cache->discard_block_size)); 1348 } 1349 1350 static void issue_discard(struct dm_cache_migration *mg) 1351 { 1352 dm_dblock_t b, e; 1353 struct bio *bio = mg->new_ocell->holder; 1354 1355 calc_discard_block_range(mg->cache, bio, &b, &e); 1356 while (b != e) { 1357 set_discard(mg->cache, b); 1358 b = to_dblock(from_dblock(b) + 1); 1359 } 1360 1361 bio_endio(bio, 0); 1362 cell_defer(mg->cache, mg->new_ocell, false); 1363 free_migration(mg); 1364 } 1365 1366 static void issue_copy_or_discard(struct dm_cache_migration *mg) 1367 { 1368 bool avoid; 1369 struct cache *cache = mg->cache; 1370 1371 if (mg->discard) { 1372 issue_discard(mg); 1373 return; 1374 } 1375 1376 if (mg->writeback || mg->demote) 1377 avoid = !is_dirty(cache, mg->cblock) || 1378 is_discarded_oblock(cache, mg->old_oblock); 1379 else { 1380 struct bio *bio = mg->new_ocell->holder; 1381 1382 avoid = is_discarded_oblock(cache, mg->new_oblock); 1383 1384 if (writeback_mode(&cache->features) && 1385 !avoid && bio_writes_complete_block(cache, bio)) { 1386 issue_overwrite(mg, bio); 1387 return; 1388 } 1389 } 1390 1391 avoid ? 
avoid_copy(mg) : issue_copy(mg); 1392 } 1393 1394 static void complete_migration(struct dm_cache_migration *mg) 1395 { 1396 if (mg->err) 1397 migration_failure(mg); 1398 else 1399 migration_success_pre_commit(mg); 1400 } 1401 1402 static void process_migrations(struct cache *cache, struct list_head *head, 1403 void (*fn)(struct dm_cache_migration *)) 1404 { 1405 unsigned long flags; 1406 struct list_head list; 1407 struct dm_cache_migration *mg, *tmp; 1408 1409 INIT_LIST_HEAD(&list); 1410 spin_lock_irqsave(&cache->lock, flags); 1411 list_splice_init(head, &list); 1412 spin_unlock_irqrestore(&cache->lock, flags); 1413 1414 list_for_each_entry_safe(mg, tmp, &list, list) 1415 fn(mg); 1416 } 1417 1418 static void __queue_quiesced_migration(struct dm_cache_migration *mg) 1419 { 1420 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 1421 } 1422 1423 static void queue_quiesced_migration(struct dm_cache_migration *mg) 1424 { 1425 unsigned long flags; 1426 struct cache *cache = mg->cache; 1427 1428 spin_lock_irqsave(&cache->lock, flags); 1429 __queue_quiesced_migration(mg); 1430 spin_unlock_irqrestore(&cache->lock, flags); 1431 1432 wake_worker(cache); 1433 } 1434 1435 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 1436 { 1437 unsigned long flags; 1438 struct dm_cache_migration *mg, *tmp; 1439 1440 spin_lock_irqsave(&cache->lock, flags); 1441 list_for_each_entry_safe(mg, tmp, work, list) 1442 __queue_quiesced_migration(mg); 1443 spin_unlock_irqrestore(&cache->lock, flags); 1444 1445 wake_worker(cache); 1446 } 1447 1448 static void check_for_quiesced_migrations(struct cache *cache, 1449 struct per_bio_data *pb) 1450 { 1451 struct list_head work; 1452 1453 if (!pb->all_io_entry) 1454 return; 1455 1456 INIT_LIST_HEAD(&work); 1457 dm_deferred_entry_dec(pb->all_io_entry, &work); 1458 1459 if (!list_empty(&work)) 1460 queue_quiesced_migrations(cache, &work); 1461 } 1462 1463 static void quiesce_migration(struct dm_cache_migration *mg) 
1464 { 1465 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 1466 queue_quiesced_migration(mg); 1467 } 1468 1469 static void promote(struct cache *cache, struct prealloc *structs, 1470 dm_oblock_t oblock, dm_cblock_t cblock, 1471 struct dm_bio_prison_cell *cell) 1472 { 1473 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1474 1475 mg->err = false; 1476 mg->discard = false; 1477 mg->writeback = false; 1478 mg->demote = false; 1479 mg->promote = true; 1480 mg->requeue_holder = true; 1481 mg->invalidate = false; 1482 mg->cache = cache; 1483 mg->new_oblock = oblock; 1484 mg->cblock = cblock; 1485 mg->old_ocell = NULL; 1486 mg->new_ocell = cell; 1487 mg->start_jiffies = jiffies; 1488 1489 inc_io_migrations(cache); 1490 quiesce_migration(mg); 1491 } 1492 1493 static void writeback(struct cache *cache, struct prealloc *structs, 1494 dm_oblock_t oblock, dm_cblock_t cblock, 1495 struct dm_bio_prison_cell *cell) 1496 { 1497 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1498 1499 mg->err = false; 1500 mg->discard = false; 1501 mg->writeback = true; 1502 mg->demote = false; 1503 mg->promote = false; 1504 mg->requeue_holder = true; 1505 mg->invalidate = false; 1506 mg->cache = cache; 1507 mg->old_oblock = oblock; 1508 mg->cblock = cblock; 1509 mg->old_ocell = cell; 1510 mg->new_ocell = NULL; 1511 mg->start_jiffies = jiffies; 1512 1513 inc_io_migrations(cache); 1514 quiesce_migration(mg); 1515 } 1516 1517 static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1518 dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1519 dm_cblock_t cblock, 1520 struct dm_bio_prison_cell *old_ocell, 1521 struct dm_bio_prison_cell *new_ocell) 1522 { 1523 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1524 1525 mg->err = false; 1526 mg->discard = false; 1527 mg->writeback = false; 1528 mg->demote = true; 1529 mg->promote = true; 1530 mg->requeue_holder = true; 1531 mg->invalidate = false; 1532 mg->cache = 
cache; 1533 mg->old_oblock = old_oblock; 1534 mg->new_oblock = new_oblock; 1535 mg->cblock = cblock; 1536 mg->old_ocell = old_ocell; 1537 mg->new_ocell = new_ocell; 1538 mg->start_jiffies = jiffies; 1539 1540 inc_io_migrations(cache); 1541 quiesce_migration(mg); 1542 } 1543 1544 /* 1545 * Invalidate a cache entry. No writeback occurs; any changes in the cache 1546 * block are thrown away. 1547 */ 1548 static void invalidate(struct cache *cache, struct prealloc *structs, 1549 dm_oblock_t oblock, dm_cblock_t cblock, 1550 struct dm_bio_prison_cell *cell) 1551 { 1552 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1553 1554 mg->err = false; 1555 mg->discard = false; 1556 mg->writeback = false; 1557 mg->demote = true; 1558 mg->promote = false; 1559 mg->requeue_holder = true; 1560 mg->invalidate = true; 1561 mg->cache = cache; 1562 mg->old_oblock = oblock; 1563 mg->cblock = cblock; 1564 mg->old_ocell = cell; 1565 mg->new_ocell = NULL; 1566 mg->start_jiffies = jiffies; 1567 1568 inc_io_migrations(cache); 1569 quiesce_migration(mg); 1570 } 1571 1572 static void discard(struct cache *cache, struct prealloc *structs, 1573 struct dm_bio_prison_cell *cell) 1574 { 1575 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1576 1577 mg->err = false; 1578 mg->discard = true; 1579 mg->writeback = false; 1580 mg->demote = false; 1581 mg->promote = false; 1582 mg->requeue_holder = false; 1583 mg->invalidate = false; 1584 mg->cache = cache; 1585 mg->old_ocell = NULL; 1586 mg->new_ocell = cell; 1587 mg->start_jiffies = jiffies; 1588 1589 quiesce_migration(mg); 1590 } 1591 1592 /*---------------------------------------------------------------- 1593 * bio processing 1594 *--------------------------------------------------------------*/ 1595 static void defer_bio(struct cache *cache, struct bio *bio) 1596 { 1597 unsigned long flags; 1598 1599 spin_lock_irqsave(&cache->lock, flags); 1600 bio_list_add(&cache->deferred_bios, bio); 1601 
spin_unlock_irqrestore(&cache->lock, flags); 1602 1603 wake_worker(cache); 1604 } 1605 1606 static void process_flush_bio(struct cache *cache, struct bio *bio) 1607 { 1608 size_t pb_data_size = get_per_bio_data_size(cache); 1609 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1610 1611 BUG_ON(bio->bi_iter.bi_size); 1612 if (!pb->req_nr) 1613 remap_to_origin(cache, bio); 1614 else 1615 remap_to_cache(cache, bio, 0); 1616 1617 /* 1618 * REQ_FLUSH is not directed at any particular block so we don't 1619 * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH 1620 * by dm-core. 1621 */ 1622 issue(cache, bio); 1623 } 1624 1625 static void process_discard_bio(struct cache *cache, struct prealloc *structs, 1626 struct bio *bio) 1627 { 1628 int r; 1629 dm_dblock_t b, e; 1630 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1631 1632 calc_discard_block_range(cache, bio, &b, &e); 1633 if (b == e) { 1634 bio_endio(bio, 0); 1635 return; 1636 } 1637 1638 cell_prealloc = prealloc_get_cell(structs); 1639 r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, 1640 (cell_free_fn) prealloc_put_cell, 1641 structs, &new_ocell); 1642 if (r > 0) 1643 return; 1644 1645 discard(cache, structs, new_ocell); 1646 } 1647 1648 static bool spare_migration_bandwidth(struct cache *cache) 1649 { 1650 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * 1651 cache->sectors_per_block; 1652 return current_volume < cache->migration_threshold; 1653 } 1654 1655 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1656 { 1657 atomic_inc(bio_data_dir(bio) == READ ? 1658 &cache->stats.read_hit : &cache->stats.write_hit); 1659 } 1660 1661 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1662 { 1663 atomic_inc(bio_data_dir(bio) == READ ? 
1664 &cache->stats.read_miss : &cache->stats.write_miss); 1665 } 1666 1667 /*----------------------------------------------------------------*/ 1668 1669 struct inc_detail { 1670 struct cache *cache; 1671 struct bio_list bios_for_issue; 1672 struct bio_list unhandled_bios; 1673 bool any_writes; 1674 }; 1675 1676 static void inc_fn(void *context, struct dm_bio_prison_cell *cell) 1677 { 1678 struct bio *bio; 1679 struct inc_detail *detail = context; 1680 struct cache *cache = detail->cache; 1681 1682 inc_ds(cache, cell->holder, cell); 1683 if (bio_data_dir(cell->holder) == WRITE) 1684 detail->any_writes = true; 1685 1686 while ((bio = bio_list_pop(&cell->bios))) { 1687 if (discard_or_flush(bio)) { 1688 bio_list_add(&detail->unhandled_bios, bio); 1689 continue; 1690 } 1691 1692 if (bio_data_dir(bio) == WRITE) 1693 detail->any_writes = true; 1694 1695 bio_list_add(&detail->bios_for_issue, bio); 1696 inc_ds(cache, bio, cell); 1697 } 1698 } 1699 1700 // FIXME: refactor these two 1701 static void remap_cell_to_origin_clear_discard(struct cache *cache, 1702 struct dm_bio_prison_cell *cell, 1703 dm_oblock_t oblock, bool issue_holder) 1704 { 1705 struct bio *bio; 1706 unsigned long flags; 1707 struct inc_detail detail; 1708 1709 detail.cache = cache; 1710 bio_list_init(&detail.bios_for_issue); 1711 bio_list_init(&detail.unhandled_bios); 1712 detail.any_writes = false; 1713 1714 spin_lock_irqsave(&cache->lock, flags); 1715 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1716 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1717 spin_unlock_irqrestore(&cache->lock, flags); 1718 1719 remap_to_origin(cache, cell->holder); 1720 if (issue_holder) 1721 issue(cache, cell->holder); 1722 else 1723 accounted_begin(cache, cell->holder); 1724 1725 if (detail.any_writes) 1726 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1727 1728 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1729 remap_to_origin(cache, bio); 1730 issue(cache, bio); 1731 } 1732 
} 1733 1734 static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, 1735 dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) 1736 { 1737 struct bio *bio; 1738 unsigned long flags; 1739 struct inc_detail detail; 1740 1741 detail.cache = cache; 1742 bio_list_init(&detail.bios_for_issue); 1743 bio_list_init(&detail.unhandled_bios); 1744 detail.any_writes = false; 1745 1746 spin_lock_irqsave(&cache->lock, flags); 1747 dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); 1748 bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); 1749 spin_unlock_irqrestore(&cache->lock, flags); 1750 1751 remap_to_cache(cache, cell->holder, cblock); 1752 if (issue_holder) 1753 issue(cache, cell->holder); 1754 else 1755 accounted_begin(cache, cell->holder); 1756 1757 if (detail.any_writes) { 1758 set_dirty(cache, oblock, cblock); 1759 clear_discard(cache, oblock_to_dblock(cache, oblock)); 1760 } 1761 1762 while ((bio = bio_list_pop(&detail.bios_for_issue))) { 1763 remap_to_cache(cache, bio, cblock); 1764 issue(cache, bio); 1765 } 1766 } 1767 1768 /*----------------------------------------------------------------*/ 1769 1770 struct old_oblock_lock { 1771 struct policy_locker locker; 1772 struct cache *cache; 1773 struct prealloc *structs; 1774 struct dm_bio_prison_cell *cell; 1775 }; 1776 1777 static int null_locker(struct policy_locker *locker, dm_oblock_t b) 1778 { 1779 /* This should never be called */ 1780 BUG(); 1781 return 0; 1782 } 1783 1784 static int cell_locker(struct policy_locker *locker, dm_oblock_t b) 1785 { 1786 struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); 1787 struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); 1788 1789 return bio_detain(l->cache, b, NULL, cell_prealloc, 1790 (cell_free_fn) prealloc_put_cell, 1791 l->structs, &l->cell); 1792 } 1793 1794 static void process_cell(struct cache *cache, struct prealloc *structs, 1795 struct dm_bio_prison_cell 
*new_ocell) 1796 { 1797 int r; 1798 bool release_cell = true; 1799 struct bio *bio = new_ocell->holder; 1800 dm_oblock_t block = get_bio_block(cache, bio); 1801 struct policy_result lookup_result; 1802 bool passthrough = passthrough_mode(&cache->features); 1803 bool fast_promotion, can_migrate; 1804 struct old_oblock_lock ool; 1805 1806 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 1807 can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); 1808 1809 ool.locker.fn = cell_locker; 1810 ool.cache = cache; 1811 ool.structs = structs; 1812 ool.cell = NULL; 1813 r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, 1814 bio, &ool.locker, &lookup_result); 1815 1816 if (r == -EWOULDBLOCK) 1817 /* migration has been denied */ 1818 lookup_result.op = POLICY_MISS; 1819 1820 switch (lookup_result.op) { 1821 case POLICY_HIT: 1822 if (passthrough) { 1823 inc_miss_counter(cache, bio); 1824 1825 /* 1826 * Passthrough always maps to the origin, 1827 * invalidating any cache blocks that are written 1828 * to. 
1829 */ 1830 1831 if (bio_data_dir(bio) == WRITE) { 1832 atomic_inc(&cache->stats.demotion); 1833 invalidate(cache, structs, block, lookup_result.cblock, new_ocell); 1834 release_cell = false; 1835 1836 } else { 1837 /* FIXME: factor out issue_origin() */ 1838 remap_to_origin_clear_discard(cache, bio, block); 1839 inc_and_issue(cache, bio, new_ocell); 1840 } 1841 } else { 1842 inc_hit_counter(cache, bio); 1843 1844 if (bio_data_dir(bio) == WRITE && 1845 writethrough_mode(&cache->features) && 1846 !is_dirty(cache, lookup_result.cblock)) { 1847 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1848 inc_and_issue(cache, bio, new_ocell); 1849 1850 } else { 1851 remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); 1852 release_cell = false; 1853 } 1854 } 1855 1856 break; 1857 1858 case POLICY_MISS: 1859 inc_miss_counter(cache, bio); 1860 remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); 1861 release_cell = false; 1862 break; 1863 1864 case POLICY_NEW: 1865 atomic_inc(&cache->stats.promotion); 1866 promote(cache, structs, block, lookup_result.cblock, new_ocell); 1867 release_cell = false; 1868 break; 1869 1870 case POLICY_REPLACE: 1871 atomic_inc(&cache->stats.demotion); 1872 atomic_inc(&cache->stats.promotion); 1873 demote_then_promote(cache, structs, lookup_result.old_oblock, 1874 block, lookup_result.cblock, 1875 ool.cell, new_ocell); 1876 release_cell = false; 1877 break; 1878 1879 default: 1880 DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", 1881 cache_device_name(cache), __func__, 1882 (unsigned) lookup_result.op); 1883 bio_io_error(bio); 1884 } 1885 1886 if (release_cell) 1887 cell_defer(cache, new_ocell, false); 1888 } 1889 1890 static void process_bio(struct cache *cache, struct prealloc *structs, 1891 struct bio *bio) 1892 { 1893 int r; 1894 dm_oblock_t block = get_bio_block(cache, bio); 1895 struct dm_bio_prison_cell *cell_prealloc, *new_ocell; 1896 1897 /* 1898 * Check to see if that 
block is currently migrating. 1899 */ 1900 cell_prealloc = prealloc_get_cell(structs); 1901 r = bio_detain(cache, block, bio, cell_prealloc, 1902 (cell_free_fn) prealloc_put_cell, 1903 structs, &new_ocell); 1904 if (r > 0) 1905 return; 1906 1907 process_cell(cache, structs, new_ocell); 1908 } 1909 1910 static int need_commit_due_to_time(struct cache *cache) 1911 { 1912 return jiffies < cache->last_commit_jiffies || 1913 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1914 } 1915 1916 /* 1917 * A non-zero return indicates read_only or fail_io mode. 1918 */ 1919 static int commit(struct cache *cache, bool clean_shutdown) 1920 { 1921 int r; 1922 1923 if (get_cache_mode(cache) >= CM_READ_ONLY) 1924 return -EINVAL; 1925 1926 atomic_inc(&cache->stats.commit_count); 1927 r = dm_cache_commit(cache->cmd, clean_shutdown); 1928 if (r) 1929 metadata_operation_failed(cache, "dm_cache_commit", r); 1930 1931 return r; 1932 } 1933 1934 static int commit_if_needed(struct cache *cache) 1935 { 1936 int r = 0; 1937 1938 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1939 dm_cache_changed_this_transaction(cache->cmd)) { 1940 r = commit(cache, false); 1941 cache->commit_requested = false; 1942 cache->last_commit_jiffies = jiffies; 1943 } 1944 1945 return r; 1946 } 1947 1948 static void process_deferred_bios(struct cache *cache) 1949 { 1950 unsigned long flags; 1951 struct bio_list bios; 1952 struct bio *bio; 1953 struct prealloc structs; 1954 1955 memset(&structs, 0, sizeof(structs)); 1956 bio_list_init(&bios); 1957 1958 spin_lock_irqsave(&cache->lock, flags); 1959 bio_list_merge(&bios, &cache->deferred_bios); 1960 bio_list_init(&cache->deferred_bios); 1961 spin_unlock_irqrestore(&cache->lock, flags); 1962 1963 while (!bio_list_empty(&bios)) { 1964 /* 1965 * If we've got no free migration structs, and processing 1966 * this bio might require one, we pause until there are some 1967 * prepared mappings to process. 
1968 */ 1969 if (prealloc_data_structs(cache, &structs)) { 1970 spin_lock_irqsave(&cache->lock, flags); 1971 bio_list_merge(&cache->deferred_bios, &bios); 1972 spin_unlock_irqrestore(&cache->lock, flags); 1973 break; 1974 } 1975 1976 bio = bio_list_pop(&bios); 1977 1978 if (bio->bi_rw & REQ_FLUSH) 1979 process_flush_bio(cache, bio); 1980 else if (bio->bi_rw & REQ_DISCARD) 1981 process_discard_bio(cache, &structs, bio); 1982 else 1983 process_bio(cache, &structs, bio); 1984 } 1985 1986 prealloc_free_structs(cache, &structs); 1987 } 1988 1989 static void process_deferred_cells(struct cache *cache) 1990 { 1991 unsigned long flags; 1992 struct dm_bio_prison_cell *cell, *tmp; 1993 struct list_head cells; 1994 struct prealloc structs; 1995 1996 memset(&structs, 0, sizeof(structs)); 1997 1998 INIT_LIST_HEAD(&cells); 1999 2000 spin_lock_irqsave(&cache->lock, flags); 2001 list_splice_init(&cache->deferred_cells, &cells); 2002 spin_unlock_irqrestore(&cache->lock, flags); 2003 2004 list_for_each_entry_safe(cell, tmp, &cells, user_list) { 2005 /* 2006 * If we've got no free migration structs, and processing 2007 * this bio might require one, we pause until there are some 2008 * prepared mappings to process. 
2009 */ 2010 if (prealloc_data_structs(cache, &structs)) { 2011 spin_lock_irqsave(&cache->lock, flags); 2012 list_splice(&cells, &cache->deferred_cells); 2013 spin_unlock_irqrestore(&cache->lock, flags); 2014 break; 2015 } 2016 2017 process_cell(cache, &structs, cell); 2018 } 2019 2020 prealloc_free_structs(cache, &structs); 2021 } 2022 2023 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2024 { 2025 unsigned long flags; 2026 struct bio_list bios; 2027 struct bio *bio; 2028 2029 bio_list_init(&bios); 2030 2031 spin_lock_irqsave(&cache->lock, flags); 2032 bio_list_merge(&bios, &cache->deferred_flush_bios); 2033 bio_list_init(&cache->deferred_flush_bios); 2034 spin_unlock_irqrestore(&cache->lock, flags); 2035 2036 /* 2037 * These bios have already been through inc_ds() 2038 */ 2039 while ((bio = bio_list_pop(&bios))) 2040 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2041 } 2042 2043 static void process_deferred_writethrough_bios(struct cache *cache) 2044 { 2045 unsigned long flags; 2046 struct bio_list bios; 2047 struct bio *bio; 2048 2049 bio_list_init(&bios); 2050 2051 spin_lock_irqsave(&cache->lock, flags); 2052 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 2053 bio_list_init(&cache->deferred_writethrough_bios); 2054 spin_unlock_irqrestore(&cache->lock, flags); 2055 2056 /* 2057 * These bios have already been through inc_ds() 2058 */ 2059 while ((bio = bio_list_pop(&bios))) 2060 accounted_request(cache, bio); 2061 } 2062 2063 static void writeback_some_dirty_blocks(struct cache *cache) 2064 { 2065 int r = 0; 2066 dm_oblock_t oblock; 2067 dm_cblock_t cblock; 2068 struct prealloc structs; 2069 struct dm_bio_prison_cell *old_ocell; 2070 bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2071 2072 memset(&structs, 0, sizeof(structs)); 2073 2074 while (spare_migration_bandwidth(cache)) { 2075 if (prealloc_data_structs(cache, &structs)) 2076 break; 2077 2078 r = policy_writeback_work(cache->policy, 
&oblock, &cblock, busy); 2079 if (r) 2080 break; 2081 2082 r = get_cell(cache, oblock, &structs, &old_ocell); 2083 if (r) { 2084 policy_set_dirty(cache->policy, oblock); 2085 break; 2086 } 2087 2088 writeback(cache, &structs, oblock, cblock, old_ocell); 2089 } 2090 2091 prealloc_free_structs(cache, &structs); 2092 } 2093 2094 /*---------------------------------------------------------------- 2095 * Invalidations. 2096 * Dropping something from the cache *without* writing back. 2097 *--------------------------------------------------------------*/ 2098 2099 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2100 { 2101 int r = 0; 2102 uint64_t begin = from_cblock(req->cblocks->begin); 2103 uint64_t end = from_cblock(req->cblocks->end); 2104 2105 while (begin != end) { 2106 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2107 if (!r) { 2108 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2109 if (r) { 2110 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2111 break; 2112 } 2113 2114 } else if (r == -ENODATA) { 2115 /* harmless, already unmapped */ 2116 r = 0; 2117 2118 } else { 2119 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2120 break; 2121 } 2122 2123 begin++; 2124 } 2125 2126 cache->commit_requested = true; 2127 2128 req->err = r; 2129 atomic_set(&req->complete, 1); 2130 2131 wake_up(&req->result_wait); 2132 } 2133 2134 static void process_invalidation_requests(struct cache *cache) 2135 { 2136 struct list_head list; 2137 struct invalidation_request *req, *tmp; 2138 2139 INIT_LIST_HEAD(&list); 2140 spin_lock(&cache->invalidation_lock); 2141 list_splice_init(&cache->invalidation_requests, &list); 2142 spin_unlock(&cache->invalidation_lock); 2143 2144 list_for_each_entry_safe (req, tmp, &list, list) 2145 process_invalidation_request(cache, req); 2146 } 2147 2148 /*---------------------------------------------------------------- 2149 * Main worker loop 2150 
*--------------------------------------------------------------*/ 2151 static bool is_quiescing(struct cache *cache) 2152 { 2153 return atomic_read(&cache->quiescing); 2154 } 2155 2156 static void ack_quiescing(struct cache *cache) 2157 { 2158 if (is_quiescing(cache)) { 2159 atomic_inc(&cache->quiescing_ack); 2160 wake_up(&cache->quiescing_wait); 2161 } 2162 } 2163 2164 static void wait_for_quiescing_ack(struct cache *cache) 2165 { 2166 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2167 } 2168 2169 static void start_quiescing(struct cache *cache) 2170 { 2171 atomic_inc(&cache->quiescing); 2172 wait_for_quiescing_ack(cache); 2173 } 2174 2175 static void stop_quiescing(struct cache *cache) 2176 { 2177 atomic_set(&cache->quiescing, 0); 2178 atomic_set(&cache->quiescing_ack, 0); 2179 } 2180 2181 static void wait_for_migrations(struct cache *cache) 2182 { 2183 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2184 } 2185 2186 static void stop_worker(struct cache *cache) 2187 { 2188 cancel_delayed_work(&cache->waker); 2189 flush_workqueue(cache->wq); 2190 } 2191 2192 static void requeue_deferred_cells(struct cache *cache) 2193 { 2194 unsigned long flags; 2195 struct list_head cells; 2196 struct dm_bio_prison_cell *cell, *tmp; 2197 2198 INIT_LIST_HEAD(&cells); 2199 spin_lock_irqsave(&cache->lock, flags); 2200 list_splice_init(&cache->deferred_cells, &cells); 2201 spin_unlock_irqrestore(&cache->lock, flags); 2202 2203 list_for_each_entry_safe(cell, tmp, &cells, user_list) 2204 cell_requeue(cache, cell); 2205 } 2206 2207 static void requeue_deferred_bios(struct cache *cache) 2208 { 2209 struct bio *bio; 2210 struct bio_list bios; 2211 2212 bio_list_init(&bios); 2213 bio_list_merge(&bios, &cache->deferred_bios); 2214 bio_list_init(&cache->deferred_bios); 2215 2216 while ((bio = bio_list_pop(&bios))) 2217 bio_endio(bio, DM_ENDIO_REQUEUE); 2218 } 2219 2220 static int more_work(struct cache *cache) 2221 { 2222 if 
(is_quiescing(cache)) 2223 return !list_empty(&cache->quiesced_migrations) || 2224 !list_empty(&cache->completed_migrations) || 2225 !list_empty(&cache->need_commit_migrations); 2226 else 2227 return !bio_list_empty(&cache->deferred_bios) || 2228 !list_empty(&cache->deferred_cells) || 2229 !bio_list_empty(&cache->deferred_flush_bios) || 2230 !bio_list_empty(&cache->deferred_writethrough_bios) || 2231 !list_empty(&cache->quiesced_migrations) || 2232 !list_empty(&cache->completed_migrations) || 2233 !list_empty(&cache->need_commit_migrations) || 2234 cache->invalidate; 2235 } 2236 2237 static void do_worker(struct work_struct *ws) 2238 { 2239 struct cache *cache = container_of(ws, struct cache, worker); 2240 2241 do { 2242 if (!is_quiescing(cache)) { 2243 writeback_some_dirty_blocks(cache); 2244 process_deferred_writethrough_bios(cache); 2245 process_deferred_bios(cache); 2246 process_deferred_cells(cache); 2247 process_invalidation_requests(cache); 2248 } 2249 2250 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2251 process_migrations(cache, &cache->completed_migrations, complete_migration); 2252 2253 if (commit_if_needed(cache)) { 2254 process_deferred_flush_bios(cache, false); 2255 process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2256 } else { 2257 process_deferred_flush_bios(cache, true); 2258 process_migrations(cache, &cache->need_commit_migrations, 2259 migration_success_post_commit); 2260 } 2261 2262 ack_quiescing(cache); 2263 2264 } while (more_work(cache)); 2265 } 2266 2267 /* 2268 * We want to commit periodically so that not too much 2269 * unwritten metadata builds up. 
2270 */ 2271 static void do_waker(struct work_struct *ws) 2272 { 2273 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2274 policy_tick(cache->policy, true); 2275 wake_worker(cache); 2276 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2277 } 2278 2279 /*----------------------------------------------------------------*/ 2280 2281 static int is_congested(struct dm_dev *dev, int bdi_bits) 2282 { 2283 struct request_queue *q = bdev_get_queue(dev->bdev); 2284 return bdi_congested(&q->backing_dev_info, bdi_bits); 2285 } 2286 2287 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2288 { 2289 struct cache *cache = container_of(cb, struct cache, callbacks); 2290 2291 return is_congested(cache->origin_dev, bdi_bits) || 2292 is_congested(cache->cache_dev, bdi_bits); 2293 } 2294 2295 /*---------------------------------------------------------------- 2296 * Target methods 2297 *--------------------------------------------------------------*/ 2298 2299 /* 2300 * This function gets called on the error paths of the constructor, so we 2301 * have to cope with a partially initialised struct. 
2302 */ 2303 static void destroy(struct cache *cache) 2304 { 2305 unsigned i; 2306 2307 if (cache->migration_pool) 2308 mempool_destroy(cache->migration_pool); 2309 2310 if (cache->all_io_ds) 2311 dm_deferred_set_destroy(cache->all_io_ds); 2312 2313 if (cache->prison) 2314 dm_bio_prison_destroy(cache->prison); 2315 2316 if (cache->wq) 2317 destroy_workqueue(cache->wq); 2318 2319 if (cache->dirty_bitset) 2320 free_bitset(cache->dirty_bitset); 2321 2322 if (cache->discard_bitset) 2323 free_bitset(cache->discard_bitset); 2324 2325 if (cache->copier) 2326 dm_kcopyd_client_destroy(cache->copier); 2327 2328 if (cache->cmd) 2329 dm_cache_metadata_close(cache->cmd); 2330 2331 if (cache->metadata_dev) 2332 dm_put_device(cache->ti, cache->metadata_dev); 2333 2334 if (cache->origin_dev) 2335 dm_put_device(cache->ti, cache->origin_dev); 2336 2337 if (cache->cache_dev) 2338 dm_put_device(cache->ti, cache->cache_dev); 2339 2340 if (cache->policy) 2341 dm_cache_policy_destroy(cache->policy); 2342 2343 for (i = 0; i < cache->nr_ctr_args ; i++) 2344 kfree(cache->ctr_args[i]); 2345 kfree(cache->ctr_args); 2346 2347 kfree(cache); 2348 } 2349 2350 static void cache_dtr(struct dm_target *ti) 2351 { 2352 struct cache *cache = ti->private; 2353 2354 destroy(cache); 2355 } 2356 2357 static sector_t get_dev_size(struct dm_dev *dev) 2358 { 2359 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2360 } 2361 2362 /*----------------------------------------------------------------*/ 2363 2364 /* 2365 * Construct a cache device mapping. 
 *
 * cache <metadata dev> <cache dev> <origin dev> <block size>
 *       <#feature args> [<feature arg>]*
 *       <policy> <#policy args> [<policy arg>]*
 *
 * metadata dev    : fast device holding the persistent metadata
 * cache dev       : fast device holding cached data blocks
 * origin dev      : slow device holding original data blocks
 * block size      : cache unit size in sectors
 *
 * #feature args   : number of feature arguments passed
 * feature args    : writeback, writethrough or passthrough.
 *                   (The default is writeback.)
 *
 * policy          : the replacement policy to use
 * #policy args    : an even number of policy arguments corresponding
 *                   to key/value pairs passed to the policy
 * policy args     : key/value pairs passed to the policy
 *                   E.g. 'sequential_threshold 1024'
 *                   See cache-policies.txt for details.
 *
 * Optional feature arguments are:
 *   writeback     : the default; named explicitly it selects the same
 *                   behaviour as passing no feature argument.
 *   passthrough   : reads are remapped to the origin even when the block
 *                   is cached, and a write that hits a cached block causes
 *                   that block to be invalidated.  The constructor refuses
 *                   this mode unless every cache block is clean.
 *   writethrough  : write through caching that prohibits cache block
 *                   content from being different from origin block content.
 *                   Without this argument, the default behaviour is to write
 *                   back cache block contents later for performance reasons,
 *                   so they may differ from the corresponding origin blocks.
2392 */ 2393 struct cache_args { 2394 struct dm_target *ti; 2395 2396 struct dm_dev *metadata_dev; 2397 2398 struct dm_dev *cache_dev; 2399 sector_t cache_sectors; 2400 2401 struct dm_dev *origin_dev; 2402 sector_t origin_sectors; 2403 2404 uint32_t block_size; 2405 2406 const char *policy_name; 2407 int policy_argc; 2408 const char **policy_argv; 2409 2410 struct cache_features features; 2411 }; 2412 2413 static void destroy_cache_args(struct cache_args *ca) 2414 { 2415 if (ca->metadata_dev) 2416 dm_put_device(ca->ti, ca->metadata_dev); 2417 2418 if (ca->cache_dev) 2419 dm_put_device(ca->ti, ca->cache_dev); 2420 2421 if (ca->origin_dev) 2422 dm_put_device(ca->ti, ca->origin_dev); 2423 2424 kfree(ca); 2425 } 2426 2427 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2428 { 2429 if (!as->argc) { 2430 *error = "Insufficient args"; 2431 return false; 2432 } 2433 2434 return true; 2435 } 2436 2437 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2438 char **error) 2439 { 2440 int r; 2441 sector_t metadata_dev_size; 2442 char b[BDEVNAME_SIZE]; 2443 2444 if (!at_least_one_arg(as, error)) 2445 return -EINVAL; 2446 2447 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2448 &ca->metadata_dev); 2449 if (r) { 2450 *error = "Error opening metadata device"; 2451 return r; 2452 } 2453 2454 metadata_dev_size = get_dev_size(ca->metadata_dev); 2455 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2456 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2457 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2458 2459 return 0; 2460 } 2461 2462 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2463 char **error) 2464 { 2465 int r; 2466 2467 if (!at_least_one_arg(as, error)) 2468 return -EINVAL; 2469 2470 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2471 &ca->cache_dev); 2472 if (r) { 2473 *error = "Error 
opening cache device"; 2474 return r; 2475 } 2476 ca->cache_sectors = get_dev_size(ca->cache_dev); 2477 2478 return 0; 2479 } 2480 2481 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2482 char **error) 2483 { 2484 int r; 2485 2486 if (!at_least_one_arg(as, error)) 2487 return -EINVAL; 2488 2489 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2490 &ca->origin_dev); 2491 if (r) { 2492 *error = "Error opening origin device"; 2493 return r; 2494 } 2495 2496 ca->origin_sectors = get_dev_size(ca->origin_dev); 2497 if (ca->ti->len > ca->origin_sectors) { 2498 *error = "Device size larger than cached device"; 2499 return -EINVAL; 2500 } 2501 2502 return 0; 2503 } 2504 2505 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2506 char **error) 2507 { 2508 unsigned long block_size; 2509 2510 if (!at_least_one_arg(as, error)) 2511 return -EINVAL; 2512 2513 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2514 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2515 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2516 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2517 *error = "Invalid data block size"; 2518 return -EINVAL; 2519 } 2520 2521 if (block_size > ca->cache_sectors) { 2522 *error = "Data block size is larger than the cache device"; 2523 return -EINVAL; 2524 } 2525 2526 ca->block_size = block_size; 2527 2528 return 0; 2529 } 2530 2531 static void init_features(struct cache_features *cf) 2532 { 2533 cf->mode = CM_WRITE; 2534 cf->io_mode = CM_IO_WRITEBACK; 2535 } 2536 2537 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2538 char **error) 2539 { 2540 static struct dm_arg _args[] = { 2541 {0, 1, "Invalid number of cache feature arguments"}, 2542 }; 2543 2544 int r; 2545 unsigned argc; 2546 const char *arg; 2547 struct cache_features *cf = &ca->features; 2548 2549 init_features(cf); 2550 2551 r = dm_read_arg_group(_args, as, &argc, error); 2552 if (r) 2553 
return -EINVAL; 2554 2555 while (argc--) { 2556 arg = dm_shift_arg(as); 2557 2558 if (!strcasecmp(arg, "writeback")) 2559 cf->io_mode = CM_IO_WRITEBACK; 2560 2561 else if (!strcasecmp(arg, "writethrough")) 2562 cf->io_mode = CM_IO_WRITETHROUGH; 2563 2564 else if (!strcasecmp(arg, "passthrough")) 2565 cf->io_mode = CM_IO_PASSTHROUGH; 2566 2567 else { 2568 *error = "Unrecognised cache feature requested"; 2569 return -EINVAL; 2570 } 2571 } 2572 2573 return 0; 2574 } 2575 2576 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2577 char **error) 2578 { 2579 static struct dm_arg _args[] = { 2580 {0, 1024, "Invalid number of policy arguments"}, 2581 }; 2582 2583 int r; 2584 2585 if (!at_least_one_arg(as, error)) 2586 return -EINVAL; 2587 2588 ca->policy_name = dm_shift_arg(as); 2589 2590 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2591 if (r) 2592 return -EINVAL; 2593 2594 ca->policy_argv = (const char **)as->argv; 2595 dm_consume_args(as, ca->policy_argc); 2596 2597 return 0; 2598 } 2599 2600 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2601 char **error) 2602 { 2603 int r; 2604 struct dm_arg_set as; 2605 2606 as.argc = argc; 2607 as.argv = argv; 2608 2609 r = parse_metadata_dev(ca, &as, error); 2610 if (r) 2611 return r; 2612 2613 r = parse_cache_dev(ca, &as, error); 2614 if (r) 2615 return r; 2616 2617 r = parse_origin_dev(ca, &as, error); 2618 if (r) 2619 return r; 2620 2621 r = parse_block_size(ca, &as, error); 2622 if (r) 2623 return r; 2624 2625 r = parse_features(ca, &as, error); 2626 if (r) 2627 return r; 2628 2629 r = parse_policy(ca, &as, error); 2630 if (r) 2631 return r; 2632 2633 return 0; 2634 } 2635 2636 /*----------------------------------------------------------------*/ 2637 2638 static struct kmem_cache *migration_cache; 2639 2640 #define NOT_CORE_OPTION 1 2641 2642 static int process_config_option(struct cache *cache, const char *key, const char *value) 2643 { 2644 unsigned long tmp; 2645 
2646 if (!strcasecmp(key, "migration_threshold")) { 2647 if (kstrtoul(value, 10, &tmp)) 2648 return -EINVAL; 2649 2650 cache->migration_threshold = tmp; 2651 return 0; 2652 } 2653 2654 return NOT_CORE_OPTION; 2655 } 2656 2657 static int set_config_value(struct cache *cache, const char *key, const char *value) 2658 { 2659 int r = process_config_option(cache, key, value); 2660 2661 if (r == NOT_CORE_OPTION) 2662 r = policy_set_config_value(cache->policy, key, value); 2663 2664 if (r) 2665 DMWARN("bad config value for %s: %s", key, value); 2666 2667 return r; 2668 } 2669 2670 static int set_config_values(struct cache *cache, int argc, const char **argv) 2671 { 2672 int r = 0; 2673 2674 if (argc & 1) { 2675 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2676 return -EINVAL; 2677 } 2678 2679 while (argc) { 2680 r = set_config_value(cache, argv[0], argv[1]); 2681 if (r) 2682 break; 2683 2684 argc -= 2; 2685 argv += 2; 2686 } 2687 2688 return r; 2689 } 2690 2691 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2692 char **error) 2693 { 2694 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2695 cache->cache_size, 2696 cache->origin_sectors, 2697 cache->sectors_per_block); 2698 if (IS_ERR(p)) { 2699 *error = "Error creating cache's policy"; 2700 return PTR_ERR(p); 2701 } 2702 cache->policy = p; 2703 2704 return 0; 2705 } 2706 2707 /* 2708 * We want the discard block size to be at least the size of the cache 2709 * block size and have no more than 2^14 discard blocks across the origin. 
2710 */ 2711 #define MAX_DISCARD_BLOCKS (1 << 14) 2712 2713 static bool too_many_discard_blocks(sector_t discard_block_size, 2714 sector_t origin_size) 2715 { 2716 (void) sector_div(origin_size, discard_block_size); 2717 2718 return origin_size > MAX_DISCARD_BLOCKS; 2719 } 2720 2721 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2722 sector_t origin_size) 2723 { 2724 sector_t discard_block_size = cache_block_size; 2725 2726 if (origin_size) 2727 while (too_many_discard_blocks(discard_block_size, origin_size)) 2728 discard_block_size *= 2; 2729 2730 return discard_block_size; 2731 } 2732 2733 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2734 { 2735 dm_block_t nr_blocks = from_cblock(size); 2736 2737 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2738 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2739 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2740 "Please consider increasing the cache block size to reduce the overall cache block count.", 2741 (unsigned long long) nr_blocks); 2742 2743 cache->cache_size = size; 2744 } 2745 2746 #define DEFAULT_MIGRATION_THRESHOLD 2048 2747 2748 static int cache_create(struct cache_args *ca, struct cache **result) 2749 { 2750 int r = 0; 2751 char **error = &ca->ti->error; 2752 struct cache *cache; 2753 struct dm_target *ti = ca->ti; 2754 dm_block_t origin_blocks; 2755 struct dm_cache_metadata *cmd; 2756 bool may_format = ca->features.mode == CM_WRITE; 2757 2758 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2759 if (!cache) 2760 return -ENOMEM; 2761 2762 cache->ti = ca->ti; 2763 ti->private = cache; 2764 ti->num_flush_bios = 2; 2765 ti->flush_supported = true; 2766 2767 ti->num_discard_bios = 1; 2768 ti->discards_supported = true; 2769 ti->discard_zeroes_data_unsupported = true; 2770 ti->split_discard_bios = false; 2771 2772 cache->features = ca->features; 2773 ti->per_bio_data_size = 
get_per_bio_data_size(cache); 2774 2775 cache->callbacks.congested_fn = cache_is_congested; 2776 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2777 2778 cache->metadata_dev = ca->metadata_dev; 2779 cache->origin_dev = ca->origin_dev; 2780 cache->cache_dev = ca->cache_dev; 2781 2782 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2783 2784 /* FIXME: factor out this whole section */ 2785 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2786 origin_blocks = block_div(origin_blocks, ca->block_size); 2787 cache->origin_blocks = to_oblock(origin_blocks); 2788 2789 cache->sectors_per_block = ca->block_size; 2790 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2791 r = -EINVAL; 2792 goto bad; 2793 } 2794 2795 if (ca->block_size & (ca->block_size - 1)) { 2796 dm_block_t cache_size = ca->cache_sectors; 2797 2798 cache->sectors_per_block_shift = -1; 2799 cache_size = block_div(cache_size, ca->block_size); 2800 set_cache_size(cache, to_cblock(cache_size)); 2801 } else { 2802 cache->sectors_per_block_shift = __ffs(ca->block_size); 2803 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2804 } 2805 2806 r = create_cache_policy(cache, ca, error); 2807 if (r) 2808 goto bad; 2809 2810 cache->policy_nr_args = ca->policy_argc; 2811 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2812 2813 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2814 if (r) { 2815 *error = "Error setting cache policy's config values"; 2816 goto bad; 2817 } 2818 2819 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2820 ca->block_size, may_format, 2821 dm_cache_policy_get_hint_size(cache->policy)); 2822 if (IS_ERR(cmd)) { 2823 *error = "Error creating metadata object"; 2824 r = PTR_ERR(cmd); 2825 goto bad; 2826 } 2827 cache->cmd = cmd; 2828 set_cache_mode(cache, CM_WRITE); 2829 if (get_cache_mode(cache) != CM_WRITE) { 2830 *error = "Unable to get write access to metadata, please check/repair 
metadata."; 2831 r = -EINVAL; 2832 goto bad; 2833 } 2834 2835 if (passthrough_mode(&cache->features)) { 2836 bool all_clean; 2837 2838 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2839 if (r) { 2840 *error = "dm_cache_metadata_all_clean() failed"; 2841 goto bad; 2842 } 2843 2844 if (!all_clean) { 2845 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2846 r = -EINVAL; 2847 goto bad; 2848 } 2849 } 2850 2851 spin_lock_init(&cache->lock); 2852 INIT_LIST_HEAD(&cache->deferred_cells); 2853 bio_list_init(&cache->deferred_bios); 2854 bio_list_init(&cache->deferred_flush_bios); 2855 bio_list_init(&cache->deferred_writethrough_bios); 2856 INIT_LIST_HEAD(&cache->quiesced_migrations); 2857 INIT_LIST_HEAD(&cache->completed_migrations); 2858 INIT_LIST_HEAD(&cache->need_commit_migrations); 2859 atomic_set(&cache->nr_allocated_migrations, 0); 2860 atomic_set(&cache->nr_io_migrations, 0); 2861 init_waitqueue_head(&cache->migration_wait); 2862 2863 init_waitqueue_head(&cache->quiescing_wait); 2864 atomic_set(&cache->quiescing, 0); 2865 atomic_set(&cache->quiescing_ack, 0); 2866 2867 r = -ENOMEM; 2868 atomic_set(&cache->nr_dirty, 0); 2869 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2870 if (!cache->dirty_bitset) { 2871 *error = "could not allocate dirty bitset"; 2872 goto bad; 2873 } 2874 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2875 2876 cache->discard_block_size = 2877 calculate_discard_block_size(cache->sectors_per_block, 2878 cache->origin_sectors); 2879 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2880 cache->discard_block_size)); 2881 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2882 if (!cache->discard_bitset) { 2883 *error = "could not allocate discard bitset"; 2884 goto bad; 2885 } 2886 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2887 2888 cache->copier = 
dm_kcopyd_client_create(&dm_kcopyd_throttle); 2889 if (IS_ERR(cache->copier)) { 2890 *error = "could not create kcopyd client"; 2891 r = PTR_ERR(cache->copier); 2892 goto bad; 2893 } 2894 2895 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2896 if (!cache->wq) { 2897 *error = "could not create workqueue for metadata object"; 2898 goto bad; 2899 } 2900 INIT_WORK(&cache->worker, do_worker); 2901 INIT_DELAYED_WORK(&cache->waker, do_waker); 2902 cache->last_commit_jiffies = jiffies; 2903 2904 cache->prison = dm_bio_prison_create(); 2905 if (!cache->prison) { 2906 *error = "could not create bio prison"; 2907 goto bad; 2908 } 2909 2910 cache->all_io_ds = dm_deferred_set_create(); 2911 if (!cache->all_io_ds) { 2912 *error = "could not create all_io deferred set"; 2913 goto bad; 2914 } 2915 2916 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2917 migration_cache); 2918 if (!cache->migration_pool) { 2919 *error = "Error creating cache's migration mempool"; 2920 goto bad; 2921 } 2922 2923 cache->need_tick_bio = true; 2924 cache->sized = false; 2925 cache->invalidate = false; 2926 cache->commit_requested = false; 2927 cache->loaded_mappings = false; 2928 cache->loaded_discards = false; 2929 2930 load_stats(cache); 2931 2932 atomic_set(&cache->stats.demotion, 0); 2933 atomic_set(&cache->stats.promotion, 0); 2934 atomic_set(&cache->stats.copies_avoided, 0); 2935 atomic_set(&cache->stats.cache_cell_clash, 0); 2936 atomic_set(&cache->stats.commit_count, 0); 2937 atomic_set(&cache->stats.discard_count, 0); 2938 2939 spin_lock_init(&cache->invalidation_lock); 2940 INIT_LIST_HEAD(&cache->invalidation_requests); 2941 2942 iot_init(&cache->origin_tracker); 2943 2944 *result = cache; 2945 return 0; 2946 2947 bad: 2948 destroy(cache); 2949 return r; 2950 } 2951 2952 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2953 { 2954 unsigned i; 2955 const char **copy; 2956 2957 copy = kcalloc(argc, sizeof(*copy), 
GFP_KERNEL); 2958 if (!copy) 2959 return -ENOMEM; 2960 for (i = 0; i < argc; i++) { 2961 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2962 if (!copy[i]) { 2963 while (i--) 2964 kfree(copy[i]); 2965 kfree(copy); 2966 return -ENOMEM; 2967 } 2968 } 2969 2970 cache->nr_ctr_args = argc; 2971 cache->ctr_args = copy; 2972 2973 return 0; 2974 } 2975 2976 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2977 { 2978 int r = -EINVAL; 2979 struct cache_args *ca; 2980 struct cache *cache = NULL; 2981 2982 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2983 if (!ca) { 2984 ti->error = "Error allocating memory for cache"; 2985 return -ENOMEM; 2986 } 2987 ca->ti = ti; 2988 2989 r = parse_cache_args(ca, argc, argv, &ti->error); 2990 if (r) 2991 goto out; 2992 2993 r = cache_create(ca, &cache); 2994 if (r) 2995 goto out; 2996 2997 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2998 if (r) { 2999 destroy(cache); 3000 goto out; 3001 } 3002 3003 ti->private = cache; 3004 3005 out: 3006 destroy_cache_args(ca); 3007 return r; 3008 } 3009 3010 /*----------------------------------------------------------------*/ 3011 3012 static int cache_map(struct dm_target *ti, struct bio *bio) 3013 { 3014 struct cache *cache = ti->private; 3015 3016 int r; 3017 struct dm_bio_prison_cell *cell = NULL; 3018 dm_oblock_t block = get_bio_block(cache, bio); 3019 size_t pb_data_size = get_per_bio_data_size(cache); 3020 bool can_migrate = false; 3021 bool fast_promotion; 3022 struct policy_result lookup_result; 3023 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 3024 struct old_oblock_lock ool; 3025 3026 ool.locker.fn = null_locker; 3027 3028 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 3029 /* 3030 * This can only occur if the io goes to a partial block at 3031 * the end of the origin device. We don't cache these. 3032 * Just remap to the origin and carry on. 
3033 */ 3034 remap_to_origin(cache, bio); 3035 accounted_begin(cache, bio); 3036 return DM_MAPIO_REMAPPED; 3037 } 3038 3039 if (discard_or_flush(bio)) { 3040 defer_bio(cache, bio); 3041 return DM_MAPIO_SUBMITTED; 3042 } 3043 3044 /* 3045 * Check to see if that block is currently migrating. 3046 */ 3047 cell = alloc_prison_cell(cache); 3048 if (!cell) { 3049 defer_bio(cache, bio); 3050 return DM_MAPIO_SUBMITTED; 3051 } 3052 3053 r = bio_detain(cache, block, bio, cell, 3054 (cell_free_fn) free_prison_cell, 3055 cache, &cell); 3056 if (r) { 3057 if (r < 0) 3058 defer_bio(cache, bio); 3059 3060 return DM_MAPIO_SUBMITTED; 3061 } 3062 3063 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 3064 3065 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 3066 bio, &ool.locker, &lookup_result); 3067 if (r == -EWOULDBLOCK) { 3068 cell_defer(cache, cell, true); 3069 return DM_MAPIO_SUBMITTED; 3070 3071 } else if (r) { 3072 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 3073 cache_device_name(cache), r); 3074 cell_defer(cache, cell, false); 3075 bio_io_error(bio); 3076 return DM_MAPIO_SUBMITTED; 3077 } 3078 3079 r = DM_MAPIO_REMAPPED; 3080 switch (lookup_result.op) { 3081 case POLICY_HIT: 3082 if (passthrough_mode(&cache->features)) { 3083 if (bio_data_dir(bio) == WRITE) { 3084 /* 3085 * We need to invalidate this block, so 3086 * defer for the worker thread. 3087 */ 3088 cell_defer(cache, cell, true); 3089 r = DM_MAPIO_SUBMITTED; 3090 3091 } else { 3092 inc_miss_counter(cache, bio); 3093 remap_to_origin_clear_discard(cache, bio, block); 3094 accounted_begin(cache, bio); 3095 inc_ds(cache, bio, cell); 3096 // FIXME: we want to remap hits or misses straight 3097 // away rather than passing over to the worker. 
3098 cell_defer(cache, cell, false); 3099 } 3100 3101 } else { 3102 inc_hit_counter(cache, bio); 3103 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 3104 !is_dirty(cache, lookup_result.cblock)) { 3105 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 3106 accounted_begin(cache, bio); 3107 inc_ds(cache, bio, cell); 3108 cell_defer(cache, cell, false); 3109 3110 } else 3111 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 3112 } 3113 break; 3114 3115 case POLICY_MISS: 3116 inc_miss_counter(cache, bio); 3117 if (pb->req_nr != 0) { 3118 /* 3119 * This is a duplicate writethrough io that is no 3120 * longer needed because the block has been demoted. 3121 */ 3122 bio_endio(bio, 0); 3123 // FIXME: remap everything as a miss 3124 cell_defer(cache, cell, false); 3125 r = DM_MAPIO_SUBMITTED; 3126 3127 } else 3128 remap_cell_to_origin_clear_discard(cache, cell, block, false); 3129 break; 3130 3131 default: 3132 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 3133 cache_device_name(cache), __func__, 3134 (unsigned) lookup_result.op); 3135 cell_defer(cache, cell, false); 3136 bio_io_error(bio); 3137 r = DM_MAPIO_SUBMITTED; 3138 } 3139 3140 return r; 3141 } 3142 3143 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 3144 { 3145 struct cache *cache = ti->private; 3146 unsigned long flags; 3147 size_t pb_data_size = get_per_bio_data_size(cache); 3148 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 3149 3150 if (pb->tick) { 3151 policy_tick(cache->policy, false); 3152 3153 spin_lock_irqsave(&cache->lock, flags); 3154 cache->need_tick_bio = true; 3155 spin_unlock_irqrestore(&cache->lock, flags); 3156 } 3157 3158 check_for_quiesced_migrations(cache, pb); 3159 accounted_complete(cache, bio); 3160 3161 return 0; 3162 } 3163 3164 static int write_dirty_bitset(struct cache *cache) 3165 { 3166 unsigned i, r; 3167 3168 if (get_cache_mode(cache) >= CM_READ_ONLY) 3169 
return -EINVAL; 3170 3171 for (i = 0; i < from_cblock(cache->cache_size); i++) { 3172 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 3173 is_dirty(cache, to_cblock(i))); 3174 if (r) { 3175 metadata_operation_failed(cache, "dm_cache_set_dirty", r); 3176 return r; 3177 } 3178 } 3179 3180 return 0; 3181 } 3182 3183 static int write_discard_bitset(struct cache *cache) 3184 { 3185 unsigned i, r; 3186 3187 if (get_cache_mode(cache) >= CM_READ_ONLY) 3188 return -EINVAL; 3189 3190 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 3191 cache->discard_nr_blocks); 3192 if (r) { 3193 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 3194 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 3195 return r; 3196 } 3197 3198 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 3199 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 3200 is_discarded(cache, to_dblock(i))); 3201 if (r) { 3202 metadata_operation_failed(cache, "dm_cache_set_discard", r); 3203 return r; 3204 } 3205 } 3206 3207 return 0; 3208 } 3209 3210 static int write_hints(struct cache *cache) 3211 { 3212 int r; 3213 3214 if (get_cache_mode(cache) >= CM_READ_ONLY) 3215 return -EINVAL; 3216 3217 r = dm_cache_write_hints(cache->cmd, cache->policy); 3218 if (r) { 3219 metadata_operation_failed(cache, "dm_cache_write_hints", r); 3220 return r; 3221 } 3222 3223 return 0; 3224 } 3225 3226 /* 3227 * returns true on success 3228 */ 3229 static bool sync_metadata(struct cache *cache) 3230 { 3231 int r1, r2, r3, r4; 3232 3233 r1 = write_dirty_bitset(cache); 3234 if (r1) 3235 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 3236 3237 r2 = write_discard_bitset(cache); 3238 if (r2) 3239 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 3240 3241 save_stats(cache); 3242 3243 r3 = write_hints(cache); 3244 if (r3) 3245 DMERR("%s: could not write hints", cache_device_name(cache)); 3246 3247 /* 3248 * If writing 
the above metadata failed, we still commit, but don't 3249 * set the clean shutdown flag. This will effectively force every 3250 * dirty bit to be set on reload. 3251 */ 3252 r4 = commit(cache, !r1 && !r2 && !r3); 3253 if (r4) 3254 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 3255 3256 return !r1 && !r2 && !r3 && !r4; 3257 } 3258 3259 static void cache_postsuspend(struct dm_target *ti) 3260 { 3261 struct cache *cache = ti->private; 3262 3263 start_quiescing(cache); 3264 wait_for_migrations(cache); 3265 stop_worker(cache); 3266 requeue_deferred_bios(cache); 3267 requeue_deferred_cells(cache); 3268 stop_quiescing(cache); 3269 3270 if (get_cache_mode(cache) == CM_WRITE) 3271 (void) sync_metadata(cache); 3272 } 3273 3274 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 3275 bool dirty, uint32_t hint, bool hint_valid) 3276 { 3277 int r; 3278 struct cache *cache = context; 3279 3280 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3281 if (r) 3282 return r; 3283 3284 if (dirty) 3285 set_dirty(cache, oblock, cblock); 3286 else 3287 clear_dirty(cache, oblock, cblock); 3288 3289 return 0; 3290 } 3291 3292 /* 3293 * The discard block size in the on disk metadata is not 3294 * neccessarily the same as we're currently using. So we have to 3295 * be careful to only set the discarded attribute if we know it 3296 * covers a complete block of the new size. 3297 */ 3298 struct discard_load_info { 3299 struct cache *cache; 3300 3301 /* 3302 * These blocks are sized using the on disk dblock size, rather 3303 * than the current one. 
3304 */ 3305 dm_block_t block_size; 3306 dm_block_t discard_begin, discard_end; 3307 }; 3308 3309 static void discard_load_info_init(struct cache *cache, 3310 struct discard_load_info *li) 3311 { 3312 li->cache = cache; 3313 li->discard_begin = li->discard_end = 0; 3314 } 3315 3316 static void set_discard_range(struct discard_load_info *li) 3317 { 3318 sector_t b, e; 3319 3320 if (li->discard_begin == li->discard_end) 3321 return; 3322 3323 /* 3324 * Convert to sectors. 3325 */ 3326 b = li->discard_begin * li->block_size; 3327 e = li->discard_end * li->block_size; 3328 3329 /* 3330 * Then convert back to the current dblock size. 3331 */ 3332 b = dm_sector_div_up(b, li->cache->discard_block_size); 3333 sector_div(e, li->cache->discard_block_size); 3334 3335 /* 3336 * The origin may have shrunk, so we need to check we're still in 3337 * bounds. 3338 */ 3339 if (e > from_dblock(li->cache->discard_nr_blocks)) 3340 e = from_dblock(li->cache->discard_nr_blocks); 3341 3342 for (; b < e; b++) 3343 set_discard(li->cache, to_dblock(b)); 3344 } 3345 3346 static int load_discard(void *context, sector_t discard_block_size, 3347 dm_dblock_t dblock, bool discard) 3348 { 3349 struct discard_load_info *li = context; 3350 3351 li->block_size = discard_block_size; 3352 3353 if (discard) { 3354 if (from_dblock(dblock) == li->discard_end) 3355 /* 3356 * We're already in a discard range, just extend it. 3357 */ 3358 li->discard_end = li->discard_end + 1ULL; 3359 3360 else { 3361 /* 3362 * Emit the old range and start a new one. 
3363 */ 3364 set_discard_range(li); 3365 li->discard_begin = from_dblock(dblock); 3366 li->discard_end = li->discard_begin + 1ULL; 3367 } 3368 } else { 3369 set_discard_range(li); 3370 li->discard_begin = li->discard_end = 0; 3371 } 3372 3373 return 0; 3374 } 3375 3376 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3377 { 3378 sector_t size = get_dev_size(cache->cache_dev); 3379 (void) sector_div(size, cache->sectors_per_block); 3380 return to_cblock(size); 3381 } 3382 3383 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3384 { 3385 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3386 return true; 3387 3388 /* 3389 * We can't drop a dirty block when shrinking the cache. 3390 */ 3391 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3392 new_size = to_cblock(from_cblock(new_size) + 1); 3393 if (is_dirty(cache, new_size)) { 3394 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3395 cache_device_name(cache), 3396 (unsigned long long) from_cblock(new_size)); 3397 return false; 3398 } 3399 } 3400 3401 return true; 3402 } 3403 3404 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3405 { 3406 int r; 3407 3408 r = dm_cache_resize(cache->cmd, new_size); 3409 if (r) { 3410 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3411 metadata_operation_failed(cache, "dm_cache_resize", r); 3412 return r; 3413 } 3414 3415 set_cache_size(cache, new_size); 3416 3417 return 0; 3418 } 3419 3420 static int cache_preresume(struct dm_target *ti) 3421 { 3422 int r = 0; 3423 struct cache *cache = ti->private; 3424 dm_cblock_t csize = get_cache_dev_size(cache); 3425 3426 /* 3427 * Check to see if the cache has resized. 
3428 */ 3429 if (!cache->sized) { 3430 r = resize_cache_dev(cache, csize); 3431 if (r) 3432 return r; 3433 3434 cache->sized = true; 3435 3436 } else if (csize != cache->cache_size) { 3437 if (!can_resize(cache, csize)) 3438 return -EINVAL; 3439 3440 r = resize_cache_dev(cache, csize); 3441 if (r) 3442 return r; 3443 } 3444 3445 if (!cache->loaded_mappings) { 3446 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3447 load_mapping, cache); 3448 if (r) { 3449 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3450 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3451 return r; 3452 } 3453 3454 cache->loaded_mappings = true; 3455 } 3456 3457 if (!cache->loaded_discards) { 3458 struct discard_load_info li; 3459 3460 /* 3461 * The discard bitset could have been resized, or the 3462 * discard block size changed. To be safe we start by 3463 * setting every dblock to not discarded. 3464 */ 3465 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3466 3467 discard_load_info_init(cache, &li); 3468 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3469 if (r) { 3470 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3471 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3472 return r; 3473 } 3474 set_discard_range(&li); 3475 3476 cache->loaded_discards = true; 3477 } 3478 3479 return r; 3480 } 3481 3482 static void cache_resume(struct dm_target *ti) 3483 { 3484 struct cache *cache = ti->private; 3485 3486 cache->need_tick_bio = true; 3487 do_waker(&cache->waker.work); 3488 } 3489 3490 /* 3491 * Status format: 3492 * 3493 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3494 * <cache block size> <#used cache blocks>/<#total cache blocks> 3495 * <#read hits> <#read misses> <#write hits> <#write misses> 3496 * <#demotions> <#promotions> <#dirty> 3497 * <#features> <features>* 3498 * <#core args> <core args> 3499 * <policy name> <#policy args> <policy 
args>* <cache metadata mode> 3500 */ 3501 static void cache_status(struct dm_target *ti, status_type_t type, 3502 unsigned status_flags, char *result, unsigned maxlen) 3503 { 3504 int r = 0; 3505 unsigned i; 3506 ssize_t sz = 0; 3507 dm_block_t nr_free_blocks_metadata = 0; 3508 dm_block_t nr_blocks_metadata = 0; 3509 char buf[BDEVNAME_SIZE]; 3510 struct cache *cache = ti->private; 3511 dm_cblock_t residency; 3512 3513 switch (type) { 3514 case STATUSTYPE_INFO: 3515 if (get_cache_mode(cache) == CM_FAIL) { 3516 DMEMIT("Fail"); 3517 break; 3518 } 3519 3520 /* Commit to ensure statistics aren't out-of-date */ 3521 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3522 (void) commit(cache, false); 3523 3524 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3525 if (r) { 3526 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3527 cache_device_name(cache), r); 3528 goto err; 3529 } 3530 3531 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3532 if (r) { 3533 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3534 cache_device_name(cache), r); 3535 goto err; 3536 } 3537 3538 residency = policy_residency(cache->policy); 3539 3540 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3541 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3542 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3543 (unsigned long long)nr_blocks_metadata, 3544 cache->sectors_per_block, 3545 (unsigned long long) from_cblock(residency), 3546 (unsigned long long) from_cblock(cache->cache_size), 3547 (unsigned) atomic_read(&cache->stats.read_hit), 3548 (unsigned) atomic_read(&cache->stats.read_miss), 3549 (unsigned) atomic_read(&cache->stats.write_hit), 3550 (unsigned) atomic_read(&cache->stats.write_miss), 3551 (unsigned) atomic_read(&cache->stats.demotion), 3552 (unsigned) atomic_read(&cache->stats.promotion), 3553 (unsigned long) atomic_read(&cache->nr_dirty)); 3554 3555 if 
(writethrough_mode(&cache->features)) 3556 DMEMIT("1 writethrough "); 3557 3558 else if (passthrough_mode(&cache->features)) 3559 DMEMIT("1 passthrough "); 3560 3561 else if (writeback_mode(&cache->features)) 3562 DMEMIT("1 writeback "); 3563 3564 else { 3565 DMERR("%s: internal error: unknown io mode: %d", 3566 cache_device_name(cache), (int) cache->features.io_mode); 3567 goto err; 3568 } 3569 3570 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 3571 3572 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3573 if (sz < maxlen) { 3574 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3575 if (r) 3576 DMERR("%s: policy_emit_config_values returned %d", 3577 cache_device_name(cache), r); 3578 } 3579 3580 if (get_cache_mode(cache) == CM_READ_ONLY) 3581 DMEMIT("ro "); 3582 else 3583 DMEMIT("rw "); 3584 3585 break; 3586 3587 case STATUSTYPE_TABLE: 3588 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3589 DMEMIT("%s ", buf); 3590 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3591 DMEMIT("%s ", buf); 3592 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3593 DMEMIT("%s", buf); 3594 3595 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3596 DMEMIT(" %s", cache->ctr_args[i]); 3597 if (cache->nr_ctr_args) 3598 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3599 } 3600 3601 return; 3602 3603 err: 3604 DMEMIT("Error"); 3605 } 3606 3607 /* 3608 * A cache block range can take two forms: 3609 * 3610 * i) A single cblock, eg. '3456' 3611 * ii) A begin and end cblock with dots between, eg. 123-234 3612 */ 3613 static int parse_cblock_range(struct cache *cache, const char *str, 3614 struct cblock_range *result) 3615 { 3616 char dummy; 3617 uint64_t b, e; 3618 int r; 3619 3620 /* 3621 * Try and parse form (ii) first. 
3622 */ 3623 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3624 if (r < 0) 3625 return r; 3626 3627 if (r == 2) { 3628 result->begin = to_cblock(b); 3629 result->end = to_cblock(e); 3630 return 0; 3631 } 3632 3633 /* 3634 * That didn't work, try form (i). 3635 */ 3636 r = sscanf(str, "%llu%c", &b, &dummy); 3637 if (r < 0) 3638 return r; 3639 3640 if (r == 1) { 3641 result->begin = to_cblock(b); 3642 result->end = to_cblock(from_cblock(result->begin) + 1u); 3643 return 0; 3644 } 3645 3646 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3647 return -EINVAL; 3648 } 3649 3650 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3651 { 3652 uint64_t b = from_cblock(range->begin); 3653 uint64_t e = from_cblock(range->end); 3654 uint64_t n = from_cblock(cache->cache_size); 3655 3656 if (b >= n) { 3657 DMERR("%s: begin cblock out of range: %llu >= %llu", 3658 cache_device_name(cache), b, n); 3659 return -EINVAL; 3660 } 3661 3662 if (e > n) { 3663 DMERR("%s: end cblock out of range: %llu > %llu", 3664 cache_device_name(cache), e, n); 3665 return -EINVAL; 3666 } 3667 3668 if (b >= e) { 3669 DMERR("%s: invalid cblock range: %llu >= %llu", 3670 cache_device_name(cache), b, e); 3671 return -EINVAL; 3672 } 3673 3674 return 0; 3675 } 3676 3677 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3678 { 3679 struct invalidation_request req; 3680 3681 INIT_LIST_HEAD(&req.list); 3682 req.cblocks = range; 3683 atomic_set(&req.complete, 0); 3684 req.err = 0; 3685 init_waitqueue_head(&req.result_wait); 3686 3687 spin_lock(&cache->invalidation_lock); 3688 list_add(&req.list, &cache->invalidation_requests); 3689 spin_unlock(&cache->invalidation_lock); 3690 wake_worker(cache); 3691 3692 wait_event(req.result_wait, atomic_read(&req.complete)); 3693 return req.err; 3694 } 3695 3696 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3697 const char **cblock_ranges) 3698 { 3699 
int r = 0; 3700 unsigned i; 3701 struct cblock_range range; 3702 3703 if (!passthrough_mode(&cache->features)) { 3704 DMERR("%s: cache has to be in passthrough mode for invalidation", 3705 cache_device_name(cache)); 3706 return -EPERM; 3707 } 3708 3709 for (i = 0; i < count; i++) { 3710 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3711 if (r) 3712 break; 3713 3714 r = validate_cblock_range(cache, &range); 3715 if (r) 3716 break; 3717 3718 /* 3719 * Pass begin and end origin blocks to the worker and wake it. 3720 */ 3721 r = request_invalidation(cache, &range); 3722 if (r) 3723 break; 3724 } 3725 3726 return r; 3727 } 3728 3729 /* 3730 * Supports 3731 * "<key> <value>" 3732 * and 3733 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3734 * 3735 * The key migration_threshold is supported by the cache target core. 3736 */ 3737 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3738 { 3739 struct cache *cache = ti->private; 3740 3741 if (!argc) 3742 return -EINVAL; 3743 3744 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3745 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3746 cache_device_name(cache)); 3747 return -EOPNOTSUPP; 3748 } 3749 3750 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3751 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3752 3753 if (argc != 2) 3754 return -EINVAL; 3755 3756 return set_config_value(cache, argv[0], argv[1]); 3757 } 3758 3759 static int cache_iterate_devices(struct dm_target *ti, 3760 iterate_devices_callout_fn fn, void *data) 3761 { 3762 int r = 0; 3763 struct cache *cache = ti->private; 3764 3765 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3766 if (!r) 3767 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3768 3769 return r; 3770 } 3771 3772 /* 3773 * We assume I/O is going to the origin (which is the volume 3774 * more likely to have restrictions e.g. by being striped). 
3775 * (Looking up the exact location of the data would be expensive 3776 * and could always be out of date by the time the bio is submitted.) 3777 */ 3778 static int cache_bvec_merge(struct dm_target *ti, 3779 struct bvec_merge_data *bvm, 3780 struct bio_vec *biovec, int max_size) 3781 { 3782 struct cache *cache = ti->private; 3783 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); 3784 3785 if (!q->merge_bvec_fn) 3786 return max_size; 3787 3788 bvm->bi_bdev = cache->origin_dev->bdev; 3789 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 3790 } 3791 3792 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3793 { 3794 /* 3795 * FIXME: these limits may be incompatible with the cache device 3796 */ 3797 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3798 cache->origin_sectors); 3799 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3800 } 3801 3802 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3803 { 3804 struct cache *cache = ti->private; 3805 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3806 3807 /* 3808 * If the system-determined stacked limits are compatible with the 3809 * cache's blocksize (io_opt is a factor) do not override them. 
3810 */ 3811 if (io_opt_sectors < cache->sectors_per_block || 3812 do_div(io_opt_sectors, cache->sectors_per_block)) { 3813 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3814 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3815 } 3816 set_discard_limits(cache, limits); 3817 } 3818 3819 /*----------------------------------------------------------------*/ 3820 3821 static struct target_type cache_target = { 3822 .name = "cache", 3823 .version = {1, 7, 0}, 3824 .module = THIS_MODULE, 3825 .ctr = cache_ctr, 3826 .dtr = cache_dtr, 3827 .map = cache_map, 3828 .end_io = cache_end_io, 3829 .postsuspend = cache_postsuspend, 3830 .preresume = cache_preresume, 3831 .resume = cache_resume, 3832 .status = cache_status, 3833 .message = cache_message, 3834 .iterate_devices = cache_iterate_devices, 3835 .merge = cache_bvec_merge, 3836 .io_hints = cache_io_hints, 3837 }; 3838 3839 static int __init dm_cache_init(void) 3840 { 3841 int r; 3842 3843 r = dm_register_target(&cache_target); 3844 if (r) { 3845 DMERR("cache target registration failed: %d", r); 3846 return r; 3847 } 3848 3849 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3850 if (!migration_cache) { 3851 dm_unregister_target(&cache_target); 3852 return -ENOMEM; 3853 } 3854 3855 return 0; 3856 } 3857 3858 static void __exit dm_cache_exit(void) 3859 { 3860 dm_unregister_target(&cache_target); 3861 kmem_cache_destroy(migration_cache); 3862 } 3863 3864 module_init(dm_cache_init); 3865 module_exit(dm_cache_exit); 3866 3867 MODULE_DESCRIPTION(DM_NAME " cache target"); 3868 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3869 MODULE_LICENSE("GPL"); 3870