/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

#define IOT_RESOLUTION 4

struct io_tracker {
	spinlock_t lock;

	/*
	 * Sectors of in-flight IO.
	 */
	sector_t in_flight;

	/*
	 * The time, in jiffies, when this device became idle (if it is
	 * indeed idle).
	 */
	unsigned long idle_time;
	unsigned long last_update_time;
};

static void iot_init(struct io_tracker *iot)
{
	spin_lock_init(&iot->lock);
	iot->in_flight = 0ul;
	iot->idle_time = 0ul;
	iot->last_update_time = jiffies;
}

static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	if (iot->in_flight)
		return false;

	return time_after(jiffies, iot->idle_time + jifs);
}

static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	bool r;
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	r = __iot_idle_for(iot, jifs);
	spin_unlock_irqrestore(&iot->lock, flags);

	return r;
}

static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	iot->in_flight += len;
	spin_unlock_irqrestore(&iot->lock, flags);
}

static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
	iot->in_flight -= len;
	if (!iot->in_flight)
		iot->idle_time = jiffies;
}

static void iot_io_end(struct io_tracker *iot, sector_t len)
{
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	__iot_io_end(iot, len);
	spin_unlock_irqrestore(&iot->lock, flags);
}
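
/*
 * Usage sketch (illustrative, not part of the original source): the
 * tracker above is driven from the accounting helpers further down,
 * roughly:
 *
 *	iot_io_begin(&cache->origin_tracker, bio_sectors(bio));
 *	...submit the bio...
 *	iot_io_end(&cache->origin_tracker, pb->len);
 *
 * and the writeback path asks iot_idle_for(&cache->origin_tracker, HZ)
 * to decide whether the origin has been idle for at least a second.
 */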

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function. We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
	void *bi_private;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;
	h->bi_private = bio->bi_private;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
	bio->bi_private = h->bi_private;
}

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only. These blocks are marked
	 * dirty. If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin. Blocks are never
	 * dirty. Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots). Reads and writes always go to the
	 * origin. If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

/*
 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
 * the one-past-the-end value.
 */
struct cblock_range {
	dm_cblock_t begin;
	dm_cblock_t end;
};

struct invalidation_request {
	struct list_head list;
	struct cblock_range *cblocks;

	atomic_t complete;
	int err;

	wait_queue_head_t result_wait;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices. Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices. Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_writethrough_bios;
	struct list_head quiesced_migrations;
	struct list_head completed_migrations;
	struct list_head need_commit_migrations;
	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	wait_queue_head_t quiescing_wait;
	atomic_t quiescing;
	atomic_t quiescing_ack;

	/*
	 * cache_size entries, dirty if set
	 */
	atomic_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct worker;

	struct delayed_work waker;
	unsigned long last_commit_jiffies;

	struct dm_bio_prison *prison;
	struct dm_deferred_set *all_io_ds;

	mempool_t *migration_pool;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool invalidate:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	struct cache_stats stats;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	struct io_tracker origin_tracker;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_deferred_entry *all_io_entry;
	struct dm_hook_info hook_info;
	sector_t len;

	/*
	 * writethrough fields. These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offset of the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	struct dm_bio_details bio_details;
};

struct dm_cache_migration {
	struct list_head list;
	struct cache *cache;

	unsigned long start_jiffies;
	dm_oblock_t old_oblock;
	dm_oblock_t new_oblock;
	dm_cblock_t cblock;

	bool err:1;
	bool discard:1;
	bool writeback:1;
	bool demote:1;
	bool promote:1;
	bool requeue_holder:1;
	bool invalidate:1;

	struct dm_bio_prison_cell *old_ocell;
	struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations. We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 */
struct prealloc {
	struct dm_cache_migration *mg;
	struct dm_bio_prison_cell *cell1;
	struct dm_bio_prison_cell *cell2;
};

static enum cache_metadata_mode get_cache_mode(struct cache *cache);

static void wake_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
{
	/* FIXME: change to use a local slab. */
	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	dm_bio_prison_free_cell(cache->prison, cell);
}

static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
	struct dm_cache_migration *mg;

	mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
	if (mg) {
		mg->cache = cache;
		atomic_inc(&mg->cache->nr_allocated_migrations);
	}

	return mg;
}

static void free_migration(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
		wake_up(&cache->migration_wait);

	mempool_free(mg, cache->migration_pool);
}

static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
	if (!p->mg) {
		p->mg = alloc_migration(cache);
		if (!p->mg)
			return -ENOMEM;
	}

	if (!p->cell1) {
		p->cell1 = alloc_prison_cell(cache);
		if (!p->cell1)
			return -ENOMEM;
	}

	if (!p->cell2) {
		p->cell2 = alloc_prison_cell(cache);
		if (!p->cell2)
			return -ENOMEM;
	}

	return 0;
}

static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
{
	if (p->cell2)
		free_prison_cell(cache, p->cell2);

	if (p->cell1)
		free_prison_cell(cache, p->cell1);

	if (p->mg)
		free_migration(p->mg);
}

static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
{
	struct dm_cache_migration *mg = p->mg;

	BUG_ON(!mg);
	p->mg = NULL;

	return mg;
}

/*
 * You must have a cell within the prealloc struct to return. If not this
 * function will BUG() rather than returning NULL.
 */
static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
{
	struct dm_bio_prison_cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;

	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else
		BUG();

	return r;
}

/*
 * You can't have more than two cells in a prealloc struct. BUG() will be
 * called if you try and overfill.
 */
static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
{
	if (!p->cell2)
		p->cell2 = cell;

	else if (!p->cell1)
		p->cell1 = cell;

	else
		BUG();
}

/*----------------------------------------------------------------*/

static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block_begin = from_oblock(begin);
	key->block_end = from_oblock(end);
}

/*
 * The caller hands in a preallocated cell, and a free function for it.
 * The cell will be freed if there's an error, or if it wasn't used because
 * a cell with that key already exists.
 */
typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);

static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
			    struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
			    cell_free_fn free_fn, void *free_context,
			    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;

	build_key(oblock_begin, oblock_end, &key);
	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
	if (r)
		free_fn(free_context, cell_prealloc);

	return r;
}

static int bio_detain(struct cache *cache, dm_oblock_t oblock,
		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
		      cell_free_fn free_fn, void *free_context,
		      struct dm_bio_prison_cell **cell_result)
{
	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
	return bio_detain_range(cache, oblock, end, bio,
				cell_prealloc, free_fn, free_context, cell_result);
}

static int get_cell(struct cache *cache,
		    dm_oblock_t oblock,
		    struct prealloc *structs,
		    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;
	struct dm_bio_prison_cell *cell_prealloc;

	cell_prealloc = prealloc_get_cell(structs);

	build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
	if (r)
		prealloc_put_cell(structs, cell_prealloc);

	return r;
}

/*----------------------------------------------------------------*/

static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
	return test_bit(from_cblock(b), cache->dirty_bitset);
}

static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
		atomic_inc(&cache->nr_dirty);
		policy_set_dirty(cache->policy, oblock);
	}
}

static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
		policy_clear_dirty(cache->policy, oblock);
		if (atomic_dec_return(&cache->nr_dirty) == 0)
			dm_table_event(cache->ti->table);
	}
}

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct cache *cache)
{
	return cache->sectors_per_block_shift >= 0;
}

/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
__always_inline
#endif
static dm_block_t block_div(dm_block_t b, uint32_t n)
{
	do_div(b, n);

	return b;
}

static dm_block_t oblocks_per_dblock(struct cache *cache)
{
	dm_block_t oblocks = cache->discard_block_size;

	if (block_size_is_power_of_two(cache))
		oblocks >>= cache->sectors_per_block_shift;
	else
		oblocks = block_div(oblocks, cache->sectors_per_block);

	return oblocks;
}

static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
	return to_dblock(block_div(from_oblock(oblock),
				   oblocks_per_dblock(cache)));
}

static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
{
	return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
}

static void set_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
	atomic_inc(&cache->stats.discard_count);

	spin_lock_irqsave(&cache->lock, flags);
	set_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void clear_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	clear_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
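
/*
 * Illustrative note: in writeback and passthrough modes only the fields
 * before 'cache' are allocated per bio, so PB_DATA_SIZE_WB is the offset
 * of 'cache' within struct per_bio_data. Writethrough mode needs the
 * whole structure, since writethrough_endio() retrieves pb->cache,
 * pb->cblock and pb->bio_details after the origin write completes.
 */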

static bool writethrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITETHROUGH;
}

static bool writeback_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_WRITEBACK;
}

static bool passthrough_mode(struct cache_features *f)
{
	return f->io_mode == CM_IO_PASSTHROUGH;
}

static size_t get_per_bio_data_size(struct cache *cache)
{
	return writethrough_mode(&cache->features) ?
		PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}

static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = get_per_bio_data(bio, data_size);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->all_io_entry = NULL;
	pb->len = 0;

	return pb;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio->bi_bdev = cache->origin_dev->bdev;
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_iter.bi_sector;
	sector_t block = from_cblock(cblock);

	bio->bi_bdev = cache->cache_dev->bdev;
	if (!block_size_is_power_of_two(cache))
		bio->bi_iter.bi_sector =
			(block * cache->sectors_per_block) +
			sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_iter.bi_sector =
			(block << cache->sectors_per_block_shift) |
			(bi_sector & (cache->sectors_per_block - 1));
}
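
/*
 * Example (illustrative): with sectors_per_block = 64 (shift 6), a bio at
 * origin sector 1000 that maps to cblock 5 is remapped to cache sector
 * (5 << 6) | (1000 & 63) = 320 + 40 = 360, i.e. the same offset within
 * the block. The non-power-of-two branch computes the same thing with a
 * multiply and sector_div().
 */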

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	spin_lock_irqsave(&cache->lock, flags);
	if (cache->need_tick_bio &&
	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}

static int bio_triggers_commit(struct cache *cache, struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}

/*
 * You must increment the deferred set whilst the prison cell is held. To
 * encourage this, we ask for 'cell' to be passed in.
 */
static void inc_ds(struct cache *cache, struct bio *bio,
		   struct dm_bio_prison_cell *cell)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(!cell);
	BUG_ON(pb->all_io_entry);

	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
}

static bool accountable_bio(struct cache *cache, struct bio *bio)
{
	return ((bio->bi_bdev == cache->origin_dev->bdev) &&
		!(bio->bi_rw & REQ_DISCARD));
}

static void accounted_begin(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (accountable_bio(cache, bio)) {
		pb->len = bio_sectors(bio);
		iot_io_begin(&cache->origin_tracker, pb->len);
	}
}

static void accounted_complete(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	iot_io_end(&cache->origin_tracker, pb->len);
}

static void accounted_request(struct cache *cache, struct bio *bio)
{
	accounted_begin(cache, bio);
	generic_make_request(bio);
}

static void issue(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	if (!bio_triggers_commit(cache, bio)) {
		accounted_request(cache, bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in do_worker().
	 */
	spin_lock_irqsave(&cache->lock, flags);
	cache->commit_requested = true;
	bio_list_add(&cache->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
{
	inc_ds(cache, bio, cell);
	issue(cache, bio);
}

static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_writethrough_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void writethrough_endio(struct bio *bio, int err)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	dm_unhook_bio(&pb->hook_info, bio);

	if (err) {
		bio_endio(bio, err);
		return;
	}

	dm_bio_restore(&pb->bio_details, bio);
	remap_to_cache(pb->cache, bio, pb->cblock);

	/*
	 * We can't issue this bio directly, since we're in interrupt
	 * context. So it gets put on a bio list for processing by the
	 * worker thread.
	 */
	defer_writethrough_bio(pb->cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices. In future we'd like to clone the
 * bio and send them in parallel, but for now we're doing them in
 * series as this is easier.
 */
static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
				       dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	pb->cache = cache;
	pb->cblock = cblock;
	dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
	dm_bio_record(&pb->bio_details, bio);

	remap_to_origin_clear_discard(pb->cache, bio, oblock);
}

/*----------------------------------------------------------------
 * Failure modes
 *--------------------------------------------------------------*/
static enum cache_metadata_mode get_cache_mode(struct cache *cache)
{
	return cache->features.mode;
}

static const char *cache_device_name(struct cache *cache)
{
	return dm_device_name(dm_table_get_md(cache->ti->table));
}

static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
{
	const char *descs[] = {
		"write",
		"read-only",
		"fail"
	};

	dm_table_event(cache->ti->table);
	DMINFO("%s: switching cache to %s mode",
	       cache_device_name(cache), descs[(int)mode]);
}

static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
{
	bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
	enum cache_metadata_mode old_mode = get_cache_mode(cache);

	if (new_mode == CM_WRITE && needs_check) {
		DMERR("%s: unable to switch cache to write mode until repaired.",
		      cache_device_name(cache));
		if (old_mode != new_mode)
			new_mode = old_mode;
		else
			new_mode = CM_READ_ONLY;
	}

	/* Never move out of fail mode */
	if (old_mode == CM_FAIL)
		new_mode = CM_FAIL;

	switch (new_mode) {
	case CM_FAIL:
	case CM_READ_ONLY:
		dm_cache_metadata_set_read_only(cache->cmd);
		break;

	case CM_WRITE:
		dm_cache_metadata_set_read_write(cache->cmd);
		break;
	}

	cache->features.mode = new_mode;

	if (new_mode != old_mode)
		notify_mode_switch(cache, new_mode);
}

static void abort_transaction(struct cache *cache)
{
	const char *dev_name = cache_device_name(cache);

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}

	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
	if (dm_cache_metadata_abort(cache->cmd)) {
		DMERR("%s: failed to abort metadata transaction", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}
}

static void metadata_operation_failed(struct cache *cache, const char *op, int r)
{
	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
		    cache_device_name(cache), op, r);
	abort_transaction(cache);
	set_cache_mode(cache, CM_READ_ONLY);
}

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
static void inc_io_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_io_migrations);
}

static void dec_io_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_io_migrations);
}

static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
			   bool holder, struct bio_list *bios)
{
	(holder ? dm_cell_release : dm_cell_release_no_holder)
		(cache->prison, cell, bios);
	free_prison_cell(cache, cell);
}

static bool discard_or_flush(struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
}

static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	if (discard_or_flush(cell->holder))
		/*
		 * We have to handle these bios individually.
		 */
		__cell_release(cache, cell, true, &cache->deferred_bios);

	else
		list_add_tail(&cell->user_list, &cache->deferred_cells);
}

static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
{
	unsigned long flags;

	if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
		/*
		 * There was no prisoner to promote to holder, the
		 * cell has been released.
		 */
		free_prison_cell(cache, cell);
		return;
	}

	spin_lock_irqsave(&cache->lock, flags);
	__cell_defer(cache, cell);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
{
	dm_cell_error(cache->prison, cell, err);
	dm_bio_prison_free_cell(cache->prison, cell);
}

static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
}

static void free_io_migration(struct dm_cache_migration *mg)
{
	dec_io_migrations(mg->cache);
	free_migration(mg);
}

static void migration_failure(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;
	const char *dev_name = cache_device_name(cache);

	if (mg->writeback) {
		DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
		set_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);

	} else if (mg->demote) {
		DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);

		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
		if (mg->promote)
			cell_defer(cache, mg->new_ocell, true);
	} else {
		DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
		policy_remove_mapping(cache->policy, mg->new_oblock);
		cell_defer(cache, mg->new_ocell, true);
	}

	free_io_migration(mg);
}

static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
	int r;
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		clear_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);
		free_io_migration(mg);
		return;

	} else if (mg->demote) {
		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
		if (r) {
			DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
			policy_force_mapping(cache->policy, mg->new_oblock,
					     mg->old_oblock);
			if (mg->promote)
				cell_defer(cache, mg->new_ocell, true);
			free_io_migration(mg);
			return;
		}
	} else {
		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
		if (r) {
			DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
				    cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
			policy_remove_mapping(cache->policy, mg->new_oblock);
			free_io_migration(mg);
			return;
		}
	}

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->need_commit_migrations);
	cache->commit_requested = true;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void migration_success_post_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
			     cache_device_name(cache));
		return;

	} else if (mg->demote) {
		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);

		if (mg->promote) {
			mg->demote = false;

			spin_lock_irqsave(&cache->lock, flags);
			list_add_tail(&mg->list, &cache->quiesced_migrations);
			spin_unlock_irqrestore(&cache->lock, flags);

		} else {
			if (mg->invalidate)
				policy_remove_mapping(cache->policy, mg->old_oblock);
			free_io_migration(mg);
		}

	} else {
		if (mg->requeue_holder) {
			clear_dirty(cache, mg->new_oblock, mg->cblock);
			cell_defer(cache, mg->new_ocell, true);
		} else {
			/*
			 * The block was promoted via an overwrite, so it's dirty.
			 */
			set_dirty(cache, mg->new_oblock, mg->cblock);
			bio_endio(mg->new_ocell->holder, 0);
			cell_defer(cache, mg->new_ocell, false);
		}
		free_io_migration(mg);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
	struct cache *cache = mg->cache;

	if (read_err || write_err)
		mg->err = true;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_copy(struct dm_cache_migration *mg)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;
	sector_t cblock = from_cblock(mg->cblock);

	o_region.bdev = cache->origin_dev->bdev;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = cblock * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (mg->writeback || mg->demote) {
		/* demote */
		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
	} else {
		/* promote */
		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
	}

	if (r < 0) {
		DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
		migration_failure(mg);
	}
}

static void overwrite_endio(struct bio *bio, int err)
{
	struct dm_cache_migration *mg = bio->bi_private;
	struct cache *cache = mg->cache;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
	unsigned long flags;

	dm_unhook_bio(&pb->hook_info, bio);

	if (err)
		mg->err = true;

	mg->requeue_holder = false;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(mg->cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);

	/*
	 * No need to inc_ds() here, since the cell will be held for the
	 * duration of the io.
	 */
	accounted_request(mg->cache, bio);
}

static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}

static void avoid_copy(struct dm_cache_migration *mg)
{
	atomic_inc(&mg->cache->stats.copies_avoided);
	migration_success_pre_commit(mg);
}

static void calc_discard_block_range(struct cache *cache, struct bio *bio,
				     dm_dblock_t *b, dm_dblock_t *e)
{
	sector_t sb = bio->bi_iter.bi_sector;
	sector_t se = bio_end_sector(bio);

	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));

	if (se - sb < cache->discard_block_size)
		*e = *b;
	else
		*e = to_dblock(block_div(se, cache->discard_block_size));
}
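
/*
 * Example (illustrative): with a discard_block_size of 128 sectors, a
 * discard bio covering sectors 100..599 yields b = 1 (start rounded up)
 * and e = 4 (end rounded down), so only discard blocks 1-3, which are
 * wholly covered, get marked. A bio smaller than one discard block gives
 * b == e, i.e. an empty range.
 */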

static void issue_discard(struct dm_cache_migration *mg)
{
	dm_dblock_t b, e;
	struct bio *bio = mg->new_ocell->holder;

	calc_discard_block_range(mg->cache, bio, &b, &e);
	while (b != e) {
		set_discard(mg->cache, b);
		b = to_dblock(from_dblock(b) + 1);
	}

	bio_endio(bio, 0);
	cell_defer(mg->cache, mg->new_ocell, false);
	free_migration(mg);
}

static void issue_copy_or_discard(struct dm_cache_migration *mg)
{
	bool avoid;
	struct cache *cache = mg->cache;

	if (mg->discard) {
		issue_discard(mg);
		return;
	}

	if (mg->writeback || mg->demote)
		avoid = !is_dirty(cache, mg->cblock) ||
			is_discarded_oblock(cache, mg->old_oblock);
	else {
		struct bio *bio = mg->new_ocell->holder;

		avoid = is_discarded_oblock(cache, mg->new_oblock);

		if (writeback_mode(&cache->features) &&
		    !avoid && bio_writes_complete_block(cache, bio)) {
			issue_overwrite(mg, bio);
			return;
		}
	}

	avoid ? avoid_copy(mg) : issue_copy(mg);
}

static void complete_migration(struct dm_cache_migration *mg)
{
	if (mg->err)
		migration_failure(mg);
	else
		migration_success_pre_commit(mg);
}

static void process_migrations(struct cache *cache, struct list_head *head,
			       void (*fn)(struct dm_cache_migration *))
{
	unsigned long flags;
	struct list_head list;
	struct dm_cache_migration *mg, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(head, &list);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(mg, tmp, &list, list)
		fn(mg);
}

static void __queue_quiesced_migration(struct dm_cache_migration *mg)
{
	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
}

static void queue_quiesced_migration(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	spin_lock_irqsave(&cache->lock, flags);
	__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
{
	unsigned long flags;
	struct dm_cache_migration *mg, *tmp;

	spin_lock_irqsave(&cache->lock, flags);
	list_for_each_entry_safe(mg, tmp, work, list)
		__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void check_for_quiesced_migrations(struct cache *cache,
					  struct per_bio_data *pb)
{
	struct list_head work;

	if (!pb->all_io_entry)
		return;

	INIT_LIST_HEAD(&work);
	dm_deferred_entry_dec(pb->all_io_entry, &work);

	if (!list_empty(&work))
		queue_quiesced_migrations(cache, &work);
}

static void quiesce_migration(struct dm_cache_migration *mg)
{
	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
		queue_quiesced_migration(mg);
}

static void promote(struct cache *cache, struct prealloc *structs,
		    dm_oblock_t oblock, dm_cblock_t cblock,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->new_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void writeback(struct cache *cache, struct prealloc *structs,
		      dm_oblock_t oblock, dm_cblock_t cblock,
		      struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = true;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				dm_cblock_t cblock,
				struct dm_bio_prison_cell *old_ocell,
				struct dm_bio_prison_cell *new_ocell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->requeue_holder = true;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

/*
 * Invalidate a cache entry. No writeback occurs; any changes in the cache
 * block are thrown away.
 */
static void invalidate(struct cache *cache, struct prealloc *structs,
		       dm_oblock_t oblock, dm_cblock_t cblock,
		       struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = false;
	mg->requeue_holder = true;
	mg->invalidate = true;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_io_migrations(cache);
	quiesce_migration(mg);
}

static void discard(struct cache *cache, struct prealloc *structs,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->discard = true;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = false;
	mg->requeue_holder = false;
	mg->invalidate = false;
	mg->cache = cache;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(bio->bi_iter.bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	/*
	 * REQ_FLUSH is not directed at any particular block so we don't
	 * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH
	 * by dm-core.
	 */
	issue(cache, bio);
}

static void process_discard_bio(struct cache *cache, struct prealloc *structs,
				struct bio *bio)
{
	int r;
	dm_dblock_t b, e;
	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;

	calc_discard_block_range(cache, bio, &b, &e);
	if (b == e) {
		bio_endio(bio, 0);
		return;
	}

	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
			     (cell_free_fn) prealloc_put_cell,
			     structs, &new_ocell);
	if (r > 0)
		return;

	discard(cache, structs, new_ocell);
}

static bool spare_migration_bandwidth(struct cache *cache)
{
	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
		cache->sectors_per_block;
	return current_volume < cache->migration_threshold;
}
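
/*
 * Note (illustrative): migration_threshold is in sectors, so with 64
 * sector blocks and n migrations already doing background io, a new
 * migration is only allowed while (n + 1) * 64 stays below the threshold.
 */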

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_miss : &cache->stats.write_miss);
}

/*----------------------------------------------------------------*/

struct inc_detail {
	struct cache *cache;
	struct bio_list bios_for_issue;
	struct bio_list unhandled_bios;
	bool any_writes;
};

static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct inc_detail *detail = context;
	struct cache *cache = detail->cache;

	inc_ds(cache, cell->holder, cell);
	if (bio_data_dir(cell->holder) == WRITE)
		detail->any_writes = true;

	while ((bio = bio_list_pop(&cell->bios))) {
		if (discard_or_flush(bio)) {
			bio_list_add(&detail->unhandled_bios, bio);
			continue;
		}

		if (bio_data_dir(bio) == WRITE)
			detail->any_writes = true;

		bio_list_add(&detail->bios_for_issue, bio);
		inc_ds(cache, bio, cell);
	}
}

// FIXME: refactor these two
static void remap_cell_to_origin_clear_discard(struct cache *cache,
					       struct dm_bio_prison_cell *cell,
					       dm_oblock_t oblock, bool issue_holder)
{
	struct bio *bio;
	unsigned long flags;
	struct inc_detail detail;

	detail.cache = cache;
	bio_list_init(&detail.bios_for_issue);
	bio_list_init(&detail.unhandled_bios);
	detail.any_writes = false;

	spin_lock_irqsave(&cache->lock, flags);
	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	remap_to_origin(cache, cell->holder);
	if (issue_holder)
		issue(cache, cell->holder);
	else
		accounted_begin(cache, cell->holder);

	if (detail.any_writes)
		clear_discard(cache, oblock_to_dblock(cache, oblock));

	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
		remap_to_origin(cache, bio);
		issue(cache, bio);
	}
}

static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
				      dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
{
	struct bio *bio;
	unsigned long flags;
	struct inc_detail detail;

	detail.cache = cache;
	bio_list_init(&detail.bios_for_issue);
	bio_list_init(&detail.unhandled_bios);
	detail.any_writes = false;

	spin_lock_irqsave(&cache->lock, flags);
	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	remap_to_cache(cache, cell->holder, cblock);
	if (issue_holder)
		issue(cache, cell->holder);
	else
		accounted_begin(cache, cell->holder);

	if (detail.any_writes) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}

	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
		remap_to_cache(cache, bio, cblock);
		issue(cache, bio);
	}
}

/*----------------------------------------------------------------*/

struct old_oblock_lock {
	struct policy_locker locker;
	struct cache *cache;
	struct prealloc *structs;
	struct dm_bio_prison_cell *cell;
};

static int null_locker(struct policy_locker *locker, dm_oblock_t b)
{
	/* This should never be called */
	BUG();
	return 0;
}

static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
{
	struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
	struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);

	return bio_detain(l->cache, b, NULL, cell_prealloc,
			  (cell_free_fn) prealloc_put_cell,
			  l->structs, &l->cell);
}

static void process_cell(struct cache *cache, struct prealloc *structs,
			 struct dm_bio_prison_cell *new_ocell)
{
	int r;
	bool release_cell = true;
	struct bio *bio = new_ocell->holder;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct policy_result lookup_result;
	bool passthrough = passthrough_mode(&cache->features);
	bool fast_promotion, can_migrate;
	struct old_oblock_lock ool;

	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
	can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));

	ool.locker.fn = cell_locker;
	ool.cache = cache;
	ool.structs = structs;
	ool.cell = NULL;
	r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
		       bio, &ool.locker, &lookup_result);

	if (r == -EWOULDBLOCK)
		/* migration has been denied */
		lookup_result.op = POLICY_MISS;

	switch (lookup_result.op) {
	case POLICY_HIT:
		if (passthrough) {
			inc_miss_counter(cache, bio);

			/*
			 * Passthrough always maps to the origin,
			 * invalidating any cache blocks that are written
			 * to.
			 */

			if (bio_data_dir(bio) == WRITE) {
				atomic_inc(&cache->stats.demotion);
				invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
				release_cell = false;

			} else {
				/* FIXME: factor out issue_origin() */
				remap_to_origin_clear_discard(cache, bio, block);
				inc_and_issue(cache, bio, new_ocell);
			}
		} else {
			inc_hit_counter(cache, bio);

			if (bio_data_dir(bio) == WRITE &&
			    writethrough_mode(&cache->features) &&
			    !is_dirty(cache, lookup_result.cblock)) {
				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
				inc_and_issue(cache, bio, new_ocell);

			} else {
				remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
				release_cell = false;
			}
		}

		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
		release_cell = false;
		break;

	case POLICY_NEW:
		atomic_inc(&cache->stats.promotion);
		promote(cache, structs, block, lookup_result.cblock, new_ocell);
		release_cell = false;
		break;

	case POLICY_REPLACE:
		atomic_inc(&cache->stats.demotion);
		atomic_inc(&cache->stats.promotion);
		demote_then_promote(cache, structs, lookup_result.old_oblock,
				    block, lookup_result.cblock,
				    ool.cell, new_ocell);
		release_cell = false;
		break;

	default:
		DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
			    cache_device_name(cache), __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
	}

	if (release_cell)
		cell_defer(cache, new_ocell, false);
}

static void process_bio(struct cache *cache, struct prealloc *structs,
			struct bio *bio)
{
	int r;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain(cache, block, bio, cell_prealloc,
		       (cell_free_fn) prealloc_put_cell,
		       structs, &new_ocell);
	if (r > 0)
		return;

	process_cell(cache, structs, new_ocell);
}

static int need_commit_due_to_time(struct cache *cache)
{
	return jiffies < cache->last_commit_jiffies ||
	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
}
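
/*
 * Note (illustrative): the first comparison catches jiffies wrapping (or
 * otherwise running behind last_commit_jiffies); the second enforces a
 * commit at least once every COMMIT_PERIOD (one second).
 */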
1917 */ 1918 static int commit(struct cache *cache, bool clean_shutdown) 1919 { 1920 int r; 1921 1922 if (get_cache_mode(cache) >= CM_READ_ONLY) 1923 return -EINVAL; 1924 1925 atomic_inc(&cache->stats.commit_count); 1926 r = dm_cache_commit(cache->cmd, clean_shutdown); 1927 if (r) 1928 metadata_operation_failed(cache, "dm_cache_commit", r); 1929 1930 return r; 1931 } 1932 1933 static int commit_if_needed(struct cache *cache) 1934 { 1935 int r = 0; 1936 1937 if ((cache->commit_requested || need_commit_due_to_time(cache)) && 1938 dm_cache_changed_this_transaction(cache->cmd)) { 1939 r = commit(cache, false); 1940 cache->commit_requested = false; 1941 cache->last_commit_jiffies = jiffies; 1942 } 1943 1944 return r; 1945 } 1946 1947 static void process_deferred_bios(struct cache *cache) 1948 { 1949 bool prealloc_used = false; 1950 unsigned long flags; 1951 struct bio_list bios; 1952 struct bio *bio; 1953 struct prealloc structs; 1954 1955 memset(&structs, 0, sizeof(structs)); 1956 bio_list_init(&bios); 1957 1958 spin_lock_irqsave(&cache->lock, flags); 1959 bio_list_merge(&bios, &cache->deferred_bios); 1960 bio_list_init(&cache->deferred_bios); 1961 spin_unlock_irqrestore(&cache->lock, flags); 1962 1963 while (!bio_list_empty(&bios)) { 1964 /* 1965 * If we've got no free migration structs, and processing 1966 * this bio might require one, we pause until there are some 1967 * prepared mappings to process. 1968 */ 1969 if (prealloc_data_structs(cache, &structs)) { 1970 spin_lock_irqsave(&cache->lock, flags); 1971 bio_list_merge(&cache->deferred_bios, &bios); 1972 spin_unlock_irqrestore(&cache->lock, flags); 1973 break; 1974 } 1975 1976 bio = bio_list_pop(&bios); 1977 1978 if (bio->bi_rw & REQ_FLUSH) 1979 process_flush_bio(cache, bio); 1980 else if (bio->bi_rw & REQ_DISCARD) 1981 process_discard_bio(cache, &structs, bio); 1982 else 1983 process_bio(cache, &structs, bio); 1984 prealloc_used = true; 1985 } 1986 1987 if (prealloc_used) 1988 prealloc_free_structs(cache, &structs); 1989 } 1990 1991 static void process_deferred_cells(struct cache *cache) 1992 { 1993 bool prealloc_used = false; 1994 unsigned long flags; 1995 struct dm_bio_prison_cell *cell, *tmp; 1996 struct list_head cells; 1997 struct prealloc structs; 1998 1999 memset(&structs, 0, sizeof(structs)); 2000 2001 INIT_LIST_HEAD(&cells); 2002 2003 spin_lock_irqsave(&cache->lock, flags); 2004 list_splice_init(&cache->deferred_cells, &cells); 2005 spin_unlock_irqrestore(&cache->lock, flags); 2006 2007 list_for_each_entry_safe(cell, tmp, &cells, user_list) { 2008 /* 2009 * If we've got no free migration structs, and processing 2010 * this bio might require one, we pause until there are some 2011 * prepared mappings to process. 
2012 */ 2013 if (prealloc_data_structs(cache, &structs)) { 2014 spin_lock_irqsave(&cache->lock, flags); 2015 list_splice(&cells, &cache->deferred_cells); 2016 spin_unlock_irqrestore(&cache->lock, flags); 2017 break; 2018 } 2019 2020 process_cell(cache, &structs, cell); 2021 prealloc_used = true; 2022 } 2023 2024 if (prealloc_used) 2025 prealloc_free_structs(cache, &structs); 2026 } 2027 2028 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 2029 { 2030 unsigned long flags; 2031 struct bio_list bios; 2032 struct bio *bio; 2033 2034 bio_list_init(&bios); 2035 2036 spin_lock_irqsave(&cache->lock, flags); 2037 bio_list_merge(&bios, &cache->deferred_flush_bios); 2038 bio_list_init(&cache->deferred_flush_bios); 2039 spin_unlock_irqrestore(&cache->lock, flags); 2040 2041 /* 2042 * These bios have already been through inc_ds() 2043 */ 2044 while ((bio = bio_list_pop(&bios))) 2045 submit_bios ? accounted_request(cache, bio) : bio_io_error(bio); 2046 } 2047 2048 static void process_deferred_writethrough_bios(struct cache *cache) 2049 { 2050 unsigned long flags; 2051 struct bio_list bios; 2052 struct bio *bio; 2053 2054 bio_list_init(&bios); 2055 2056 spin_lock_irqsave(&cache->lock, flags); 2057 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 2058 bio_list_init(&cache->deferred_writethrough_bios); 2059 spin_unlock_irqrestore(&cache->lock, flags); 2060 2061 /* 2062 * These bios have already been through inc_ds() 2063 */ 2064 while ((bio = bio_list_pop(&bios))) 2065 accounted_request(cache, bio); 2066 } 2067 2068 static void writeback_some_dirty_blocks(struct cache *cache) 2069 { 2070 bool prealloc_used = false; 2071 dm_oblock_t oblock; 2072 dm_cblock_t cblock; 2073 struct prealloc structs; 2074 struct dm_bio_prison_cell *old_ocell; 2075 bool busy = !iot_idle_for(&cache->origin_tracker, HZ); 2076 2077 memset(&structs, 0, sizeof(structs)); 2078 2079 while (spare_migration_bandwidth(cache)) { 2080 if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) 2081 break; /* no work to do */ 2082 2083 if (prealloc_data_structs(cache, &structs) || 2084 get_cell(cache, oblock, &structs, &old_ocell)) { 2085 policy_set_dirty(cache->policy, oblock); 2086 break; 2087 } 2088 2089 writeback(cache, &structs, oblock, cblock, old_ocell); 2090 prealloc_used = true; 2091 } 2092 2093 if (prealloc_used) 2094 prealloc_free_structs(cache, &structs); 2095 } 2096 2097 /*---------------------------------------------------------------- 2098 * Invalidations. 2099 * Dropping something from the cache *without* writing back. 
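 * Requests arrive via the "invalidate_cblocks" target message and are
 * queued on cache->invalidation_requests for the worker to service.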
2100 *--------------------------------------------------------------*/ 2101 2102 static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) 2103 { 2104 int r = 0; 2105 uint64_t begin = from_cblock(req->cblocks->begin); 2106 uint64_t end = from_cblock(req->cblocks->end); 2107 2108 while (begin != end) { 2109 r = policy_remove_cblock(cache->policy, to_cblock(begin)); 2110 if (!r) { 2111 r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); 2112 if (r) { 2113 metadata_operation_failed(cache, "dm_cache_remove_mapping", r); 2114 break; 2115 } 2116 2117 } else if (r == -ENODATA) { 2118 /* harmless, already unmapped */ 2119 r = 0; 2120 2121 } else { 2122 DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); 2123 break; 2124 } 2125 2126 begin++; 2127 } 2128 2129 cache->commit_requested = true; 2130 2131 req->err = r; 2132 atomic_set(&req->complete, 1); 2133 2134 wake_up(&req->result_wait); 2135 } 2136 2137 static void process_invalidation_requests(struct cache *cache) 2138 { 2139 struct list_head list; 2140 struct invalidation_request *req, *tmp; 2141 2142 INIT_LIST_HEAD(&list); 2143 spin_lock(&cache->invalidation_lock); 2144 list_splice_init(&cache->invalidation_requests, &list); 2145 spin_unlock(&cache->invalidation_lock); 2146 2147 list_for_each_entry_safe (req, tmp, &list, list) 2148 process_invalidation_request(cache, req); 2149 } 2150 2151 /*---------------------------------------------------------------- 2152 * Main worker loop 2153 *--------------------------------------------------------------*/ 2154 static bool is_quiescing(struct cache *cache) 2155 { 2156 return atomic_read(&cache->quiescing); 2157 } 2158 2159 static void ack_quiescing(struct cache *cache) 2160 { 2161 if (is_quiescing(cache)) { 2162 atomic_inc(&cache->quiescing_ack); 2163 wake_up(&cache->quiescing_wait); 2164 } 2165 } 2166 2167 static void wait_for_quiescing_ack(struct cache *cache) 2168 { 2169 wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); 2170 } 2171 2172 static void start_quiescing(struct cache *cache) 2173 { 2174 atomic_inc(&cache->quiescing); 2175 wait_for_quiescing_ack(cache); 2176 } 2177 2178 static void stop_quiescing(struct cache *cache) 2179 { 2180 atomic_set(&cache->quiescing, 0); 2181 atomic_set(&cache->quiescing_ack, 0); 2182 } 2183 2184 static void wait_for_migrations(struct cache *cache) 2185 { 2186 wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); 2187 } 2188 2189 static void stop_worker(struct cache *cache) 2190 { 2191 cancel_delayed_work(&cache->waker); 2192 flush_workqueue(cache->wq); 2193 } 2194 2195 static void requeue_deferred_cells(struct cache *cache) 2196 { 2197 unsigned long flags; 2198 struct list_head cells; 2199 struct dm_bio_prison_cell *cell, *tmp; 2200 2201 INIT_LIST_HEAD(&cells); 2202 spin_lock_irqsave(&cache->lock, flags); 2203 list_splice_init(&cache->deferred_cells, &cells); 2204 spin_unlock_irqrestore(&cache->lock, flags); 2205 2206 list_for_each_entry_safe(cell, tmp, &cells, user_list) 2207 cell_requeue(cache, cell); 2208 } 2209 2210 static void requeue_deferred_bios(struct cache *cache) 2211 { 2212 struct bio *bio; 2213 struct bio_list bios; 2214 2215 bio_list_init(&bios); 2216 bio_list_merge(&bios, &cache->deferred_bios); 2217 bio_list_init(&cache->deferred_bios); 2218 2219 while ((bio = bio_list_pop(&bios))) 2220 bio_endio(bio, DM_ENDIO_REQUEUE); 2221 } 2222 2223 static int more_work(struct cache *cache) 2224 { 2225 if (is_quiescing(cache)) 2226 return 
!list_empty(&cache->quiesced_migrations) || 2227 !list_empty(&cache->completed_migrations) || 2228 !list_empty(&cache->need_commit_migrations); 2229 else 2230 return !bio_list_empty(&cache->deferred_bios) || 2231 !list_empty(&cache->deferred_cells) || 2232 !bio_list_empty(&cache->deferred_flush_bios) || 2233 !bio_list_empty(&cache->deferred_writethrough_bios) || 2234 !list_empty(&cache->quiesced_migrations) || 2235 !list_empty(&cache->completed_migrations) || 2236 !list_empty(&cache->need_commit_migrations) || 2237 cache->invalidate; 2238 } 2239 2240 static void do_worker(struct work_struct *ws) 2241 { 2242 struct cache *cache = container_of(ws, struct cache, worker); 2243 2244 do { 2245 if (!is_quiescing(cache)) { 2246 writeback_some_dirty_blocks(cache); 2247 process_deferred_writethrough_bios(cache); 2248 process_deferred_bios(cache); 2249 process_deferred_cells(cache); 2250 process_invalidation_requests(cache); 2251 } 2252 2253 process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); 2254 process_migrations(cache, &cache->completed_migrations, complete_migration); 2255 2256 if (commit_if_needed(cache)) { 2257 process_deferred_flush_bios(cache, false); 2258 process_migrations(cache, &cache->need_commit_migrations, migration_failure); 2259 } else { 2260 process_deferred_flush_bios(cache, true); 2261 process_migrations(cache, &cache->need_commit_migrations, 2262 migration_success_post_commit); 2263 } 2264 2265 ack_quiescing(cache); 2266 2267 } while (more_work(cache)); 2268 } 2269 2270 /* 2271 * We want to commit periodically so that not too much 2272 * unwritten metadata builds up. 2273 */ 2274 static void do_waker(struct work_struct *ws) 2275 { 2276 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 2277 policy_tick(cache->policy, true); 2278 wake_worker(cache); 2279 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 2280 } 2281 2282 /*----------------------------------------------------------------*/ 2283 2284 static int is_congested(struct dm_dev *dev, int bdi_bits) 2285 { 2286 struct request_queue *q = bdev_get_queue(dev->bdev); 2287 return bdi_congested(&q->backing_dev_info, bdi_bits); 2288 } 2289 2290 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2291 { 2292 struct cache *cache = container_of(cb, struct cache, callbacks); 2293 2294 return is_congested(cache->origin_dev, bdi_bits) || 2295 is_congested(cache->cache_dev, bdi_bits); 2296 } 2297 2298 /*---------------------------------------------------------------- 2299 * Target methods 2300 *--------------------------------------------------------------*/ 2301 2302 /* 2303 * This function gets called on the error paths of the constructor, so we 2304 * have to cope with a partially initialised struct. 
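 * Every resource is therefore checked before being released, and the
 * copied constructor arguments are freed last.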
2305 */ 2306 static void destroy(struct cache *cache) 2307 { 2308 unsigned i; 2309 2310 if (cache->migration_pool) 2311 mempool_destroy(cache->migration_pool); 2312 2313 if (cache->all_io_ds) 2314 dm_deferred_set_destroy(cache->all_io_ds); 2315 2316 if (cache->prison) 2317 dm_bio_prison_destroy(cache->prison); 2318 2319 if (cache->wq) 2320 destroy_workqueue(cache->wq); 2321 2322 if (cache->dirty_bitset) 2323 free_bitset(cache->dirty_bitset); 2324 2325 if (cache->discard_bitset) 2326 free_bitset(cache->discard_bitset); 2327 2328 if (cache->copier) 2329 dm_kcopyd_client_destroy(cache->copier); 2330 2331 if (cache->cmd) 2332 dm_cache_metadata_close(cache->cmd); 2333 2334 if (cache->metadata_dev) 2335 dm_put_device(cache->ti, cache->metadata_dev); 2336 2337 if (cache->origin_dev) 2338 dm_put_device(cache->ti, cache->origin_dev); 2339 2340 if (cache->cache_dev) 2341 dm_put_device(cache->ti, cache->cache_dev); 2342 2343 if (cache->policy) 2344 dm_cache_policy_destroy(cache->policy); 2345 2346 for (i = 0; i < cache->nr_ctr_args ; i++) 2347 kfree(cache->ctr_args[i]); 2348 kfree(cache->ctr_args); 2349 2350 kfree(cache); 2351 } 2352 2353 static void cache_dtr(struct dm_target *ti) 2354 { 2355 struct cache *cache = ti->private; 2356 2357 destroy(cache); 2358 } 2359 2360 static sector_t get_dev_size(struct dm_dev *dev) 2361 { 2362 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 2363 } 2364 2365 /*----------------------------------------------------------------*/ 2366 2367 /* 2368 * Construct a cache device mapping. 2369 * 2370 * cache <metadata dev> <cache dev> <origin dev> <block size> 2371 * <#feature args> [<feature arg>]* 2372 * <policy> <#policy args> [<policy arg>]* 2373 * 2374 * metadata dev : fast device holding the persistent metadata 2375 * cache dev : fast device holding cached data blocks 2376 * origin dev : slow device holding original data blocks 2377 * block size : cache unit size in sectors 2378 * 2379 * #feature args : number of feature arguments passed 2380 * feature args : writethrough. (The default is writeback.) 2381 * 2382 * policy : the replacement policy to use 2383 * #policy args : an even number of policy arguments corresponding 2384 * to key/value pairs passed to the policy 2385 * policy args : key/value pairs passed to the policy 2386 * E.g. 'sequential_threshold 1024' 2387 * See cache-policies.txt for details. 2388 * 2389 * Optional feature arguments are: 2390 * writethrough : write through caching that prohibits cache block 2391 * content from being different from origin block content. 2392 * Without this argument, the default behaviour is to write 2393 * back cache block contents later for performance reasons, 2394 * so they may differ from the corresponding origin blocks. 
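 *
 * parse_features() also accepts the explicit 'writeback' keyword and
 * 'passthrough'; see the cache_io_mode enum for what each mode implies.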
2395 */ 2396 struct cache_args { 2397 struct dm_target *ti; 2398 2399 struct dm_dev *metadata_dev; 2400 2401 struct dm_dev *cache_dev; 2402 sector_t cache_sectors; 2403 2404 struct dm_dev *origin_dev; 2405 sector_t origin_sectors; 2406 2407 uint32_t block_size; 2408 2409 const char *policy_name; 2410 int policy_argc; 2411 const char **policy_argv; 2412 2413 struct cache_features features; 2414 }; 2415 2416 static void destroy_cache_args(struct cache_args *ca) 2417 { 2418 if (ca->metadata_dev) 2419 dm_put_device(ca->ti, ca->metadata_dev); 2420 2421 if (ca->cache_dev) 2422 dm_put_device(ca->ti, ca->cache_dev); 2423 2424 if (ca->origin_dev) 2425 dm_put_device(ca->ti, ca->origin_dev); 2426 2427 kfree(ca); 2428 } 2429 2430 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 2431 { 2432 if (!as->argc) { 2433 *error = "Insufficient args"; 2434 return false; 2435 } 2436 2437 return true; 2438 } 2439 2440 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 2441 char **error) 2442 { 2443 int r; 2444 sector_t metadata_dev_size; 2445 char b[BDEVNAME_SIZE]; 2446 2447 if (!at_least_one_arg(as, error)) 2448 return -EINVAL; 2449 2450 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2451 &ca->metadata_dev); 2452 if (r) { 2453 *error = "Error opening metadata device"; 2454 return r; 2455 } 2456 2457 metadata_dev_size = get_dev_size(ca->metadata_dev); 2458 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 2459 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2460 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 2461 2462 return 0; 2463 } 2464 2465 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 2466 char **error) 2467 { 2468 int r; 2469 2470 if (!at_least_one_arg(as, error)) 2471 return -EINVAL; 2472 2473 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2474 &ca->cache_dev); 2475 if (r) { 2476 *error = "Error opening cache device"; 2477 return r; 2478 } 2479 ca->cache_sectors = get_dev_size(ca->cache_dev); 2480 2481 return 0; 2482 } 2483 2484 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 2485 char **error) 2486 { 2487 int r; 2488 2489 if (!at_least_one_arg(as, error)) 2490 return -EINVAL; 2491 2492 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 2493 &ca->origin_dev); 2494 if (r) { 2495 *error = "Error opening origin device"; 2496 return r; 2497 } 2498 2499 ca->origin_sectors = get_dev_size(ca->origin_dev); 2500 if (ca->ti->len > ca->origin_sectors) { 2501 *error = "Device size larger than cached device"; 2502 return -EINVAL; 2503 } 2504 2505 return 0; 2506 } 2507 2508 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 2509 char **error) 2510 { 2511 unsigned long block_size; 2512 2513 if (!at_least_one_arg(as, error)) 2514 return -EINVAL; 2515 2516 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 2517 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2518 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2519 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2520 *error = "Invalid data block size"; 2521 return -EINVAL; 2522 } 2523 2524 if (block_size > ca->cache_sectors) { 2525 *error = "Data block size is larger than the cache device"; 2526 return -EINVAL; 2527 } 2528 2529 ca->block_size = block_size; 2530 2531 return 0; 2532 } 2533 2534 static void init_features(struct cache_features *cf) 2535 { 2536 cf->mode = CM_WRITE; 2537 cf->io_mode = 
CM_IO_WRITEBACK; 2538 } 2539 2540 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 2541 char **error) 2542 { 2543 static struct dm_arg _args[] = { 2544 {0, 1, "Invalid number of cache feature arguments"}, 2545 }; 2546 2547 int r; 2548 unsigned argc; 2549 const char *arg; 2550 struct cache_features *cf = &ca->features; 2551 2552 init_features(cf); 2553 2554 r = dm_read_arg_group(_args, as, &argc, error); 2555 if (r) 2556 return -EINVAL; 2557 2558 while (argc--) { 2559 arg = dm_shift_arg(as); 2560 2561 if (!strcasecmp(arg, "writeback")) 2562 cf->io_mode = CM_IO_WRITEBACK; 2563 2564 else if (!strcasecmp(arg, "writethrough")) 2565 cf->io_mode = CM_IO_WRITETHROUGH; 2566 2567 else if (!strcasecmp(arg, "passthrough")) 2568 cf->io_mode = CM_IO_PASSTHROUGH; 2569 2570 else { 2571 *error = "Unrecognised cache feature requested"; 2572 return -EINVAL; 2573 } 2574 } 2575 2576 return 0; 2577 } 2578 2579 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 2580 char **error) 2581 { 2582 static struct dm_arg _args[] = { 2583 {0, 1024, "Invalid number of policy arguments"}, 2584 }; 2585 2586 int r; 2587 2588 if (!at_least_one_arg(as, error)) 2589 return -EINVAL; 2590 2591 ca->policy_name = dm_shift_arg(as); 2592 2593 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 2594 if (r) 2595 return -EINVAL; 2596 2597 ca->policy_argv = (const char **)as->argv; 2598 dm_consume_args(as, ca->policy_argc); 2599 2600 return 0; 2601 } 2602 2603 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 2604 char **error) 2605 { 2606 int r; 2607 struct dm_arg_set as; 2608 2609 as.argc = argc; 2610 as.argv = argv; 2611 2612 r = parse_metadata_dev(ca, &as, error); 2613 if (r) 2614 return r; 2615 2616 r = parse_cache_dev(ca, &as, error); 2617 if (r) 2618 return r; 2619 2620 r = parse_origin_dev(ca, &as, error); 2621 if (r) 2622 return r; 2623 2624 r = parse_block_size(ca, &as, error); 2625 if (r) 2626 return r; 2627 2628 r = parse_features(ca, &as, error); 2629 if (r) 2630 return r; 2631 2632 r = parse_policy(ca, &as, error); 2633 if (r) 2634 return r; 2635 2636 return 0; 2637 } 2638 2639 /*----------------------------------------------------------------*/ 2640 2641 static struct kmem_cache *migration_cache; 2642 2643 #define NOT_CORE_OPTION 1 2644 2645 static int process_config_option(struct cache *cache, const char *key, const char *value) 2646 { 2647 unsigned long tmp; 2648 2649 if (!strcasecmp(key, "migration_threshold")) { 2650 if (kstrtoul(value, 10, &tmp)) 2651 return -EINVAL; 2652 2653 cache->migration_threshold = tmp; 2654 return 0; 2655 } 2656 2657 return NOT_CORE_OPTION; 2658 } 2659 2660 static int set_config_value(struct cache *cache, const char *key, const char *value) 2661 { 2662 int r = process_config_option(cache, key, value); 2663 2664 if (r == NOT_CORE_OPTION) 2665 r = policy_set_config_value(cache->policy, key, value); 2666 2667 if (r) 2668 DMWARN("bad config value for %s: %s", key, value); 2669 2670 return r; 2671 } 2672 2673 static int set_config_values(struct cache *cache, int argc, const char **argv) 2674 { 2675 int r = 0; 2676 2677 if (argc & 1) { 2678 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 2679 return -EINVAL; 2680 } 2681 2682 while (argc) { 2683 r = set_config_value(cache, argv[0], argv[1]); 2684 if (r) 2685 break; 2686 2687 argc -= 2; 2688 argv += 2; 2689 } 2690 2691 return r; 2692 } 2693 2694 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 2695 char **error) 
2696 { 2697 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, 2698 cache->cache_size, 2699 cache->origin_sectors, 2700 cache->sectors_per_block); 2701 if (IS_ERR(p)) { 2702 *error = "Error creating cache's policy"; 2703 return PTR_ERR(p); 2704 } 2705 cache->policy = p; 2706 2707 return 0; 2708 } 2709 2710 /* 2711 * We want the discard block size to be at least the size of the cache 2712 * block size and have no more than 2^14 discard blocks across the origin. 2713 */ 2714 #define MAX_DISCARD_BLOCKS (1 << 14) 2715 2716 static bool too_many_discard_blocks(sector_t discard_block_size, 2717 sector_t origin_size) 2718 { 2719 (void) sector_div(origin_size, discard_block_size); 2720 2721 return origin_size > MAX_DISCARD_BLOCKS; 2722 } 2723 2724 static sector_t calculate_discard_block_size(sector_t cache_block_size, 2725 sector_t origin_size) 2726 { 2727 sector_t discard_block_size = cache_block_size; 2728 2729 if (origin_size) 2730 while (too_many_discard_blocks(discard_block_size, origin_size)) 2731 discard_block_size *= 2; 2732 2733 return discard_block_size; 2734 } 2735 2736 static void set_cache_size(struct cache *cache, dm_cblock_t size) 2737 { 2738 dm_block_t nr_blocks = from_cblock(size); 2739 2740 if (nr_blocks > (1 << 20) && cache->cache_size != size) 2741 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" 2742 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" 2743 "Please consider increasing the cache block size to reduce the overall cache block count.", 2744 (unsigned long long) nr_blocks); 2745 2746 cache->cache_size = size; 2747 } 2748 2749 #define DEFAULT_MIGRATION_THRESHOLD 2048 2750 2751 static int cache_create(struct cache_args *ca, struct cache **result) 2752 { 2753 int r = 0; 2754 char **error = &ca->ti->error; 2755 struct cache *cache; 2756 struct dm_target *ti = ca->ti; 2757 dm_block_t origin_blocks; 2758 struct dm_cache_metadata *cmd; 2759 bool may_format = ca->features.mode == CM_WRITE; 2760 2761 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 2762 if (!cache) 2763 return -ENOMEM; 2764 2765 cache->ti = ca->ti; 2766 ti->private = cache; 2767 ti->num_flush_bios = 2; 2768 ti->flush_supported = true; 2769 2770 ti->num_discard_bios = 1; 2771 ti->discards_supported = true; 2772 ti->discard_zeroes_data_unsupported = true; 2773 ti->split_discard_bios = false; 2774 2775 cache->features = ca->features; 2776 ti->per_bio_data_size = get_per_bio_data_size(cache); 2777 2778 cache->callbacks.congested_fn = cache_is_congested; 2779 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 2780 2781 cache->metadata_dev = ca->metadata_dev; 2782 cache->origin_dev = ca->origin_dev; 2783 cache->cache_dev = ca->cache_dev; 2784 2785 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 2786 2787 /* FIXME: factor out this whole section */ 2788 origin_blocks = cache->origin_sectors = ca->origin_sectors; 2789 origin_blocks = block_div(origin_blocks, ca->block_size); 2790 cache->origin_blocks = to_oblock(origin_blocks); 2791 2792 cache->sectors_per_block = ca->block_size; 2793 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 2794 r = -EINVAL; 2795 goto bad; 2796 } 2797 2798 if (ca->block_size & (ca->block_size - 1)) { 2799 dm_block_t cache_size = ca->cache_sectors; 2800 2801 cache->sectors_per_block_shift = -1; 2802 cache_size = block_div(cache_size, ca->block_size); 2803 set_cache_size(cache, to_cblock(cache_size)); 2804 } else { 2805 cache->sectors_per_block_shift = 
__ffs(ca->block_size); 2806 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); 2807 } 2808 2809 r = create_cache_policy(cache, ca, error); 2810 if (r) 2811 goto bad; 2812 2813 cache->policy_nr_args = ca->policy_argc; 2814 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 2815 2816 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 2817 if (r) { 2818 *error = "Error setting cache policy's config values"; 2819 goto bad; 2820 } 2821 2822 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 2823 ca->block_size, may_format, 2824 dm_cache_policy_get_hint_size(cache->policy)); 2825 if (IS_ERR(cmd)) { 2826 *error = "Error creating metadata object"; 2827 r = PTR_ERR(cmd); 2828 goto bad; 2829 } 2830 cache->cmd = cmd; 2831 set_cache_mode(cache, CM_WRITE); 2832 if (get_cache_mode(cache) != CM_WRITE) { 2833 *error = "Unable to get write access to metadata, please check/repair metadata."; 2834 r = -EINVAL; 2835 goto bad; 2836 } 2837 2838 if (passthrough_mode(&cache->features)) { 2839 bool all_clean; 2840 2841 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); 2842 if (r) { 2843 *error = "dm_cache_metadata_all_clean() failed"; 2844 goto bad; 2845 } 2846 2847 if (!all_clean) { 2848 *error = "Cannot enter passthrough mode unless all blocks are clean"; 2849 r = -EINVAL; 2850 goto bad; 2851 } 2852 } 2853 2854 spin_lock_init(&cache->lock); 2855 INIT_LIST_HEAD(&cache->deferred_cells); 2856 bio_list_init(&cache->deferred_bios); 2857 bio_list_init(&cache->deferred_flush_bios); 2858 bio_list_init(&cache->deferred_writethrough_bios); 2859 INIT_LIST_HEAD(&cache->quiesced_migrations); 2860 INIT_LIST_HEAD(&cache->completed_migrations); 2861 INIT_LIST_HEAD(&cache->need_commit_migrations); 2862 atomic_set(&cache->nr_allocated_migrations, 0); 2863 atomic_set(&cache->nr_io_migrations, 0); 2864 init_waitqueue_head(&cache->migration_wait); 2865 2866 init_waitqueue_head(&cache->quiescing_wait); 2867 atomic_set(&cache->quiescing, 0); 2868 atomic_set(&cache->quiescing_ack, 0); 2869 2870 r = -ENOMEM; 2871 atomic_set(&cache->nr_dirty, 0); 2872 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2873 if (!cache->dirty_bitset) { 2874 *error = "could not allocate dirty bitset"; 2875 goto bad; 2876 } 2877 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2878 2879 cache->discard_block_size = 2880 calculate_discard_block_size(cache->sectors_per_block, 2881 cache->origin_sectors); 2882 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, 2883 cache->discard_block_size)); 2884 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2885 if (!cache->discard_bitset) { 2886 *error = "could not allocate discard bitset"; 2887 goto bad; 2888 } 2889 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2890 2891 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2892 if (IS_ERR(cache->copier)) { 2893 *error = "could not create kcopyd client"; 2894 r = PTR_ERR(cache->copier); 2895 goto bad; 2896 } 2897 2898 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2899 if (!cache->wq) { 2900 *error = "could not create workqueue for metadata object"; 2901 goto bad; 2902 } 2903 INIT_WORK(&cache->worker, do_worker); 2904 INIT_DELAYED_WORK(&cache->waker, do_waker); 2905 cache->last_commit_jiffies = jiffies; 2906 2907 cache->prison = dm_bio_prison_create(); 2908 if (!cache->prison) { 2909 *error = "could not create bio prison"; 2910 goto bad; 2911 } 2912 
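	/*
	 * Remaining resources: the all_io deferred set and the migration
	 * mempool, then the runtime flags, statistics and invalidation state.
	 */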
2913 cache->all_io_ds = dm_deferred_set_create(); 2914 if (!cache->all_io_ds) { 2915 *error = "could not create all_io deferred set"; 2916 goto bad; 2917 } 2918 2919 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2920 migration_cache); 2921 if (!cache->migration_pool) { 2922 *error = "Error creating cache's migration mempool"; 2923 goto bad; 2924 } 2925 2926 cache->need_tick_bio = true; 2927 cache->sized = false; 2928 cache->invalidate = false; 2929 cache->commit_requested = false; 2930 cache->loaded_mappings = false; 2931 cache->loaded_discards = false; 2932 2933 load_stats(cache); 2934 2935 atomic_set(&cache->stats.demotion, 0); 2936 atomic_set(&cache->stats.promotion, 0); 2937 atomic_set(&cache->stats.copies_avoided, 0); 2938 atomic_set(&cache->stats.cache_cell_clash, 0); 2939 atomic_set(&cache->stats.commit_count, 0); 2940 atomic_set(&cache->stats.discard_count, 0); 2941 2942 spin_lock_init(&cache->invalidation_lock); 2943 INIT_LIST_HEAD(&cache->invalidation_requests); 2944 2945 iot_init(&cache->origin_tracker); 2946 2947 *result = cache; 2948 return 0; 2949 2950 bad: 2951 destroy(cache); 2952 return r; 2953 } 2954 2955 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2956 { 2957 unsigned i; 2958 const char **copy; 2959 2960 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2961 if (!copy) 2962 return -ENOMEM; 2963 for (i = 0; i < argc; i++) { 2964 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2965 if (!copy[i]) { 2966 while (i--) 2967 kfree(copy[i]); 2968 kfree(copy); 2969 return -ENOMEM; 2970 } 2971 } 2972 2973 cache->nr_ctr_args = argc; 2974 cache->ctr_args = copy; 2975 2976 return 0; 2977 } 2978 2979 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2980 { 2981 int r = -EINVAL; 2982 struct cache_args *ca; 2983 struct cache *cache = NULL; 2984 2985 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2986 if (!ca) { 2987 ti->error = "Error allocating memory for cache"; 2988 return -ENOMEM; 2989 } 2990 ca->ti = ti; 2991 2992 r = parse_cache_args(ca, argc, argv, &ti->error); 2993 if (r) 2994 goto out; 2995 2996 r = cache_create(ca, &cache); 2997 if (r) 2998 goto out; 2999 3000 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 3001 if (r) { 3002 destroy(cache); 3003 goto out; 3004 } 3005 3006 ti->private = cache; 3007 3008 out: 3009 destroy_cache_args(ca); 3010 return r; 3011 } 3012 3013 /*----------------------------------------------------------------*/ 3014 3015 static int cache_map(struct dm_target *ti, struct bio *bio) 3016 { 3017 struct cache *cache = ti->private; 3018 3019 int r; 3020 struct dm_bio_prison_cell *cell = NULL; 3021 dm_oblock_t block = get_bio_block(cache, bio); 3022 size_t pb_data_size = get_per_bio_data_size(cache); 3023 bool can_migrate = false; 3024 bool fast_promotion; 3025 struct policy_result lookup_result; 3026 struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); 3027 struct old_oblock_lock ool; 3028 3029 ool.locker.fn = null_locker; 3030 3031 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { 3032 /* 3033 * This can only occur if the io goes to a partial block at 3034 * the end of the origin device. We don't cache these. 3035 * Just remap to the origin and carry on. 3036 */ 3037 remap_to_origin(cache, bio); 3038 accounted_begin(cache, bio); 3039 return DM_MAPIO_REMAPPED; 3040 } 3041 3042 if (discard_or_flush(bio)) { 3043 defer_bio(cache, bio); 3044 return DM_MAPIO_SUBMITTED; 3045 } 3046 3047 /* 3048 * Check to see if that block is currently migrating. 
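 * If a prison cell can't be allocated the bio is simply deferred; if the
 * bio is detained behind an existing cell it is held there until that
 * cell is released. In both cases DM_MAPIO_SUBMITTED is returned.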
3049 */ 3050 cell = alloc_prison_cell(cache); 3051 if (!cell) { 3052 defer_bio(cache, bio); 3053 return DM_MAPIO_SUBMITTED; 3054 } 3055 3056 r = bio_detain(cache, block, bio, cell, 3057 (cell_free_fn) free_prison_cell, 3058 cache, &cell); 3059 if (r) { 3060 if (r < 0) 3061 defer_bio(cache, bio); 3062 3063 return DM_MAPIO_SUBMITTED; 3064 } 3065 3066 fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); 3067 3068 r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, 3069 bio, &ool.locker, &lookup_result); 3070 if (r == -EWOULDBLOCK) { 3071 cell_defer(cache, cell, true); 3072 return DM_MAPIO_SUBMITTED; 3073 3074 } else if (r) { 3075 DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", 3076 cache_device_name(cache), r); 3077 cell_defer(cache, cell, false); 3078 bio_io_error(bio); 3079 return DM_MAPIO_SUBMITTED; 3080 } 3081 3082 r = DM_MAPIO_REMAPPED; 3083 switch (lookup_result.op) { 3084 case POLICY_HIT: 3085 if (passthrough_mode(&cache->features)) { 3086 if (bio_data_dir(bio) == WRITE) { 3087 /* 3088 * We need to invalidate this block, so 3089 * defer for the worker thread. 3090 */ 3091 cell_defer(cache, cell, true); 3092 r = DM_MAPIO_SUBMITTED; 3093 3094 } else { 3095 inc_miss_counter(cache, bio); 3096 remap_to_origin_clear_discard(cache, bio, block); 3097 accounted_begin(cache, bio); 3098 inc_ds(cache, bio, cell); 3099 // FIXME: we want to remap hits or misses straight 3100 // away rather than passing over to the worker. 3101 cell_defer(cache, cell, false); 3102 } 3103 3104 } else { 3105 inc_hit_counter(cache, bio); 3106 if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && 3107 !is_dirty(cache, lookup_result.cblock)) { 3108 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 3109 accounted_begin(cache, bio); 3110 inc_ds(cache, bio, cell); 3111 cell_defer(cache, cell, false); 3112 3113 } else 3114 remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); 3115 } 3116 break; 3117 3118 case POLICY_MISS: 3119 inc_miss_counter(cache, bio); 3120 if (pb->req_nr != 0) { 3121 /* 3122 * This is a duplicate writethrough io that is no 3123 * longer needed because the block has been demoted. 
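 * It can simply be completed here; nothing needs to reach either device.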
3124 */ 3125 bio_endio(bio, 0); 3126 // FIXME: remap everything as a miss 3127 cell_defer(cache, cell, false); 3128 r = DM_MAPIO_SUBMITTED; 3129 3130 } else 3131 remap_cell_to_origin_clear_discard(cache, cell, block, false); 3132 break; 3133 3134 default: 3135 DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", 3136 cache_device_name(cache), __func__, 3137 (unsigned) lookup_result.op); 3138 cell_defer(cache, cell, false); 3139 bio_io_error(bio); 3140 r = DM_MAPIO_SUBMITTED; 3141 } 3142 3143 return r; 3144 } 3145 3146 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 3147 { 3148 struct cache *cache = ti->private; 3149 unsigned long flags; 3150 size_t pb_data_size = get_per_bio_data_size(cache); 3151 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 3152 3153 if (pb->tick) { 3154 policy_tick(cache->policy, false); 3155 3156 spin_lock_irqsave(&cache->lock, flags); 3157 cache->need_tick_bio = true; 3158 spin_unlock_irqrestore(&cache->lock, flags); 3159 } 3160 3161 check_for_quiesced_migrations(cache, pb); 3162 accounted_complete(cache, bio); 3163 3164 return 0; 3165 } 3166 3167 static int write_dirty_bitset(struct cache *cache) 3168 { 3169 unsigned i, r; 3170 3171 if (get_cache_mode(cache) >= CM_READ_ONLY) 3172 return -EINVAL; 3173 3174 for (i = 0; i < from_cblock(cache->cache_size); i++) { 3175 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 3176 is_dirty(cache, to_cblock(i))); 3177 if (r) { 3178 metadata_operation_failed(cache, "dm_cache_set_dirty", r); 3179 return r; 3180 } 3181 } 3182 3183 return 0; 3184 } 3185 3186 static int write_discard_bitset(struct cache *cache) 3187 { 3188 unsigned i, r; 3189 3190 if (get_cache_mode(cache) >= CM_READ_ONLY) 3191 return -EINVAL; 3192 3193 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 3194 cache->discard_nr_blocks); 3195 if (r) { 3196 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache)); 3197 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r); 3198 return r; 3199 } 3200 3201 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 3202 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 3203 is_discarded(cache, to_dblock(i))); 3204 if (r) { 3205 metadata_operation_failed(cache, "dm_cache_set_discard", r); 3206 return r; 3207 } 3208 } 3209 3210 return 0; 3211 } 3212 3213 static int write_hints(struct cache *cache) 3214 { 3215 int r; 3216 3217 if (get_cache_mode(cache) >= CM_READ_ONLY) 3218 return -EINVAL; 3219 3220 r = dm_cache_write_hints(cache->cmd, cache->policy); 3221 if (r) { 3222 metadata_operation_failed(cache, "dm_cache_write_hints", r); 3223 return r; 3224 } 3225 3226 return 0; 3227 } 3228 3229 /* 3230 * returns true on success 3231 */ 3232 static bool sync_metadata(struct cache *cache) 3233 { 3234 int r1, r2, r3, r4; 3235 3236 r1 = write_dirty_bitset(cache); 3237 if (r1) 3238 DMERR("%s: could not write dirty bitset", cache_device_name(cache)); 3239 3240 r2 = write_discard_bitset(cache); 3241 if (r2) 3242 DMERR("%s: could not write discard bitset", cache_device_name(cache)); 3243 3244 save_stats(cache); 3245 3246 r3 = write_hints(cache); 3247 if (r3) 3248 DMERR("%s: could not write hints", cache_device_name(cache)); 3249 3250 /* 3251 * If writing the above metadata failed, we still commit, but don't 3252 * set the clean shutdown flag. This will effectively force every 3253 * dirty bit to be set on reload. 
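 * Rewriting clean blocks on the next activation is preferable to
 * trusting dirty state we failed to persist.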
3254 */ 3255 r4 = commit(cache, !r1 && !r2 && !r3); 3256 if (r4) 3257 DMERR("%s: could not write cache metadata", cache_device_name(cache)); 3258 3259 return !r1 && !r2 && !r3 && !r4; 3260 } 3261 3262 static void cache_postsuspend(struct dm_target *ti) 3263 { 3264 struct cache *cache = ti->private; 3265 3266 start_quiescing(cache); 3267 wait_for_migrations(cache); 3268 stop_worker(cache); 3269 requeue_deferred_bios(cache); 3270 requeue_deferred_cells(cache); 3271 stop_quiescing(cache); 3272 3273 if (get_cache_mode(cache) == CM_WRITE) 3274 (void) sync_metadata(cache); 3275 } 3276 3277 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 3278 bool dirty, uint32_t hint, bool hint_valid) 3279 { 3280 int r; 3281 struct cache *cache = context; 3282 3283 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 3284 if (r) 3285 return r; 3286 3287 if (dirty) 3288 set_dirty(cache, oblock, cblock); 3289 else 3290 clear_dirty(cache, oblock, cblock); 3291 3292 return 0; 3293 } 3294 3295 /* 3296 * The discard block size in the on disk metadata is not 3297 * necessarily the same as we're currently using. So we have to 3298 * be careful to only set the discarded attribute if we know it 3299 * covers a complete block of the new size. 3300 */ 3301 struct discard_load_info { 3302 struct cache *cache; 3303 3304 /* 3305 * These blocks are sized using the on disk dblock size, rather 3306 * than the current one. 3307 */ 3308 dm_block_t block_size; 3309 dm_block_t discard_begin, discard_end; 3310 }; 3311 3312 static void discard_load_info_init(struct cache *cache, 3313 struct discard_load_info *li) 3314 { 3315 li->cache = cache; 3316 li->discard_begin = li->discard_end = 0; 3317 } 3318 3319 static void set_discard_range(struct discard_load_info *li) 3320 { 3321 sector_t b, e; 3322 3323 if (li->discard_begin == li->discard_end) 3324 return; 3325 3326 /* 3327 * Convert to sectors. 3328 */ 3329 b = li->discard_begin * li->block_size; 3330 e = li->discard_end * li->block_size; 3331 3332 /* 3333 * Then convert back to the current dblock size. 3334 */ 3335 b = dm_sector_div_up(b, li->cache->discard_block_size); 3336 sector_div(e, li->cache->discard_block_size); 3337 3338 /* 3339 * The origin may have shrunk, so we need to check we're still in 3340 * bounds. 3341 */ 3342 if (e > from_dblock(li->cache->discard_nr_blocks)) 3343 e = from_dblock(li->cache->discard_nr_blocks); 3344 3345 for (; b < e; b++) 3346 set_discard(li->cache, to_dblock(b)); 3347 } 3348 3349 static int load_discard(void *context, sector_t discard_block_size, 3350 dm_dblock_t dblock, bool discard) 3351 { 3352 struct discard_load_info *li = context; 3353 3354 li->block_size = discard_block_size; 3355 3356 if (discard) { 3357 if (from_dblock(dblock) == li->discard_end) 3358 /* 3359 * We're already in a discard range, just extend it. 3360 */ 3361 li->discard_end = li->discard_end + 1ULL; 3362 3363 else { 3364 /* 3365 * Emit the old range and start a new one.
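 * set_discard_range() rescales the accumulated [begin, end) run from
 * the on-disk dblock size to the current discard block size before
 * marking the blocks.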
3366 */ 3367 set_discard_range(li); 3368 li->discard_begin = from_dblock(dblock); 3369 li->discard_end = li->discard_begin + 1ULL; 3370 } 3371 } else { 3372 set_discard_range(li); 3373 li->discard_begin = li->discard_end = 0; 3374 } 3375 3376 return 0; 3377 } 3378 3379 static dm_cblock_t get_cache_dev_size(struct cache *cache) 3380 { 3381 sector_t size = get_dev_size(cache->cache_dev); 3382 (void) sector_div(size, cache->sectors_per_block); 3383 return to_cblock(size); 3384 } 3385 3386 static bool can_resize(struct cache *cache, dm_cblock_t new_size) 3387 { 3388 if (from_cblock(new_size) > from_cblock(cache->cache_size)) 3389 return true; 3390 3391 /* 3392 * We can't drop a dirty block when shrinking the cache. 3393 */ 3394 while (from_cblock(new_size) < from_cblock(cache->cache_size)) { 3395 new_size = to_cblock(from_cblock(new_size) + 1); 3396 if (is_dirty(cache, new_size)) { 3397 DMERR("%s: unable to shrink cache; cache block %llu is dirty", 3398 cache_device_name(cache), 3399 (unsigned long long) from_cblock(new_size)); 3400 return false; 3401 } 3402 } 3403 3404 return true; 3405 } 3406 3407 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) 3408 { 3409 int r; 3410 3411 r = dm_cache_resize(cache->cmd, new_size); 3412 if (r) { 3413 DMERR("%s: could not resize cache metadata", cache_device_name(cache)); 3414 metadata_operation_failed(cache, "dm_cache_resize", r); 3415 return r; 3416 } 3417 3418 set_cache_size(cache, new_size); 3419 3420 return 0; 3421 } 3422 3423 static int cache_preresume(struct dm_target *ti) 3424 { 3425 int r = 0; 3426 struct cache *cache = ti->private; 3427 dm_cblock_t csize = get_cache_dev_size(cache); 3428 3429 /* 3430 * Check to see if the cache has resized. 3431 */ 3432 if (!cache->sized) { 3433 r = resize_cache_dev(cache, csize); 3434 if (r) 3435 return r; 3436 3437 cache->sized = true; 3438 3439 } else if (csize != cache->cache_size) { 3440 if (!can_resize(cache, csize)) 3441 return -EINVAL; 3442 3443 r = resize_cache_dev(cache, csize); 3444 if (r) 3445 return r; 3446 } 3447 3448 if (!cache->loaded_mappings) { 3449 r = dm_cache_load_mappings(cache->cmd, cache->policy, 3450 load_mapping, cache); 3451 if (r) { 3452 DMERR("%s: could not load cache mappings", cache_device_name(cache)); 3453 metadata_operation_failed(cache, "dm_cache_load_mappings", r); 3454 return r; 3455 } 3456 3457 cache->loaded_mappings = true; 3458 } 3459 3460 if (!cache->loaded_discards) { 3461 struct discard_load_info li; 3462 3463 /* 3464 * The discard bitset could have been resized, or the 3465 * discard block size changed. To be safe we start by 3466 * setting every dblock to not discarded. 
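 * The ranges held in the metadata are then reapplied below via
 * load_discard() and a final set_discard_range().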
3467 */ 3468 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 3469 3470 discard_load_info_init(cache, &li); 3471 r = dm_cache_load_discards(cache->cmd, load_discard, &li); 3472 if (r) { 3473 DMERR("%s: could not load origin discards", cache_device_name(cache)); 3474 metadata_operation_failed(cache, "dm_cache_load_discards", r); 3475 return r; 3476 } 3477 set_discard_range(&li); 3478 3479 cache->loaded_discards = true; 3480 } 3481 3482 return r; 3483 } 3484 3485 static void cache_resume(struct dm_target *ti) 3486 { 3487 struct cache *cache = ti->private; 3488 3489 cache->need_tick_bio = true; 3490 do_waker(&cache->waker.work); 3491 } 3492 3493 /* 3494 * Status format: 3495 * 3496 * <metadata block size> <#used metadata blocks>/<#total metadata blocks> 3497 * <cache block size> <#used cache blocks>/<#total cache blocks> 3498 * <#read hits> <#read misses> <#write hits> <#write misses> 3499 * <#demotions> <#promotions> <#dirty> 3500 * <#features> <features>* 3501 * <#core args> <core args> 3502 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> 3503 */ 3504 static void cache_status(struct dm_target *ti, status_type_t type, 3505 unsigned status_flags, char *result, unsigned maxlen) 3506 { 3507 int r = 0; 3508 unsigned i; 3509 ssize_t sz = 0; 3510 dm_block_t nr_free_blocks_metadata = 0; 3511 dm_block_t nr_blocks_metadata = 0; 3512 char buf[BDEVNAME_SIZE]; 3513 struct cache *cache = ti->private; 3514 dm_cblock_t residency; 3515 3516 switch (type) { 3517 case STATUSTYPE_INFO: 3518 if (get_cache_mode(cache) == CM_FAIL) { 3519 DMEMIT("Fail"); 3520 break; 3521 } 3522 3523 /* Commit to ensure statistics aren't out-of-date */ 3524 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3525 (void) commit(cache, false); 3526 3527 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata); 3528 if (r) { 3529 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d", 3530 cache_device_name(cache), r); 3531 goto err; 3532 } 3533 3534 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 3535 if (r) { 3536 DMERR("%s: dm_cache_get_metadata_dev_size returned %d", 3537 cache_device_name(cache), r); 3538 goto err; 3539 } 3540 3541 residency = policy_residency(cache->policy); 3542 3543 DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", 3544 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, 3545 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3546 (unsigned long long)nr_blocks_metadata, 3547 cache->sectors_per_block, 3548 (unsigned long long) from_cblock(residency), 3549 (unsigned long long) from_cblock(cache->cache_size), 3550 (unsigned) atomic_read(&cache->stats.read_hit), 3551 (unsigned) atomic_read(&cache->stats.read_miss), 3552 (unsigned) atomic_read(&cache->stats.write_hit), 3553 (unsigned) atomic_read(&cache->stats.write_miss), 3554 (unsigned) atomic_read(&cache->stats.demotion), 3555 (unsigned) atomic_read(&cache->stats.promotion), 3556 (unsigned long) atomic_read(&cache->nr_dirty)); 3557 3558 if (writethrough_mode(&cache->features)) 3559 DMEMIT("1 writethrough "); 3560 3561 else if (passthrough_mode(&cache->features)) 3562 DMEMIT("1 passthrough "); 3563 3564 else if (writeback_mode(&cache->features)) 3565 DMEMIT("1 writeback "); 3566 3567 else { 3568 DMERR("%s: internal error: unknown io mode: %d", 3569 cache_device_name(cache), (int) cache->features.io_mode); 3570 goto err; 3571 } 3572 3573 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 
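		/*
		 * Finally the policy name, its config values, the metadata
		 * mode (rw/ro) and the needs_check flag.
		 */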
3574 3575 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); 3576 if (sz < maxlen) { 3577 r = policy_emit_config_values(cache->policy, result, maxlen, &sz); 3578 if (r) 3579 DMERR("%s: policy_emit_config_values returned %d", 3580 cache_device_name(cache), r); 3581 } 3582 3583 if (get_cache_mode(cache) == CM_READ_ONLY) 3584 DMEMIT("ro "); 3585 else 3586 DMEMIT("rw "); 3587 3588 if (dm_cache_metadata_needs_check(cache->cmd)) 3589 DMEMIT("needs_check "); 3590 else 3591 DMEMIT("- "); 3592 3593 break; 3594 3595 case STATUSTYPE_TABLE: 3596 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 3597 DMEMIT("%s ", buf); 3598 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 3599 DMEMIT("%s ", buf); 3600 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 3601 DMEMIT("%s", buf); 3602 3603 for (i = 0; i < cache->nr_ctr_args - 1; i++) 3604 DMEMIT(" %s", cache->ctr_args[i]); 3605 if (cache->nr_ctr_args) 3606 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 3607 } 3608 3609 return; 3610 3611 err: 3612 DMEMIT("Error"); 3613 } 3614 3615 /* 3616 * A cache block range can take two forms: 3617 * 3618 * i) A single cblock, eg. '3456' 3619 * ii) A begin and end cblock with dots between, eg. 123-234 3620 */ 3621 static int parse_cblock_range(struct cache *cache, const char *str, 3622 struct cblock_range *result) 3623 { 3624 char dummy; 3625 uint64_t b, e; 3626 int r; 3627 3628 /* 3629 * Try and parse form (ii) first. 3630 */ 3631 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); 3632 if (r < 0) 3633 return r; 3634 3635 if (r == 2) { 3636 result->begin = to_cblock(b); 3637 result->end = to_cblock(e); 3638 return 0; 3639 } 3640 3641 /* 3642 * That didn't work, try form (i). 3643 */ 3644 r = sscanf(str, "%llu%c", &b, &dummy); 3645 if (r < 0) 3646 return r; 3647 3648 if (r == 1) { 3649 result->begin = to_cblock(b); 3650 result->end = to_cblock(from_cblock(result->begin) + 1u); 3651 return 0; 3652 } 3653 3654 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str); 3655 return -EINVAL; 3656 } 3657 3658 static int validate_cblock_range(struct cache *cache, struct cblock_range *range) 3659 { 3660 uint64_t b = from_cblock(range->begin); 3661 uint64_t e = from_cblock(range->end); 3662 uint64_t n = from_cblock(cache->cache_size); 3663 3664 if (b >= n) { 3665 DMERR("%s: begin cblock out of range: %llu >= %llu", 3666 cache_device_name(cache), b, n); 3667 return -EINVAL; 3668 } 3669 3670 if (e > n) { 3671 DMERR("%s: end cblock out of range: %llu > %llu", 3672 cache_device_name(cache), e, n); 3673 return -EINVAL; 3674 } 3675 3676 if (b >= e) { 3677 DMERR("%s: invalid cblock range: %llu >= %llu", 3678 cache_device_name(cache), b, e); 3679 return -EINVAL; 3680 } 3681 3682 return 0; 3683 } 3684 3685 static int request_invalidation(struct cache *cache, struct cblock_range *range) 3686 { 3687 struct invalidation_request req; 3688 3689 INIT_LIST_HEAD(&req.list); 3690 req.cblocks = range; 3691 atomic_set(&req.complete, 0); 3692 req.err = 0; 3693 init_waitqueue_head(&req.result_wait); 3694 3695 spin_lock(&cache->invalidation_lock); 3696 list_add(&req.list, &cache->invalidation_requests); 3697 spin_unlock(&cache->invalidation_lock); 3698 wake_worker(cache); 3699 3700 wait_event(req.result_wait, atomic_read(&req.complete)); 3701 return req.err; 3702 } 3703 3704 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, 3705 const char **cblock_ranges) 3706 { 3707 int r = 0; 3708 unsigned i; 3709 struct cblock_range range; 3710 3711 if (!passthrough_mode(&cache->features)) { 3712 
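		/*
		 * Invalidation drops blocks without writing them back, so it
		 * is only permitted while the cache is in passthrough mode.
		 */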
DMERR("%s: cache has to be in passthrough mode for invalidation", 3713 cache_device_name(cache)); 3714 return -EPERM; 3715 } 3716 3717 for (i = 0; i < count; i++) { 3718 r = parse_cblock_range(cache, cblock_ranges[i], &range); 3719 if (r) 3720 break; 3721 3722 r = validate_cblock_range(cache, &range); 3723 if (r) 3724 break; 3725 3726 /* 3727 * Pass begin and end origin blocks to the worker and wake it. 3728 */ 3729 r = request_invalidation(cache, &range); 3730 if (r) 3731 break; 3732 } 3733 3734 return r; 3735 } 3736 3737 /* 3738 * Supports 3739 * "<key> <value>" 3740 * and 3741 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* 3742 * 3743 * The key migration_threshold is supported by the cache target core. 3744 */ 3745 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 3746 { 3747 struct cache *cache = ti->private; 3748 3749 if (!argc) 3750 return -EINVAL; 3751 3752 if (get_cache_mode(cache) >= CM_READ_ONLY) { 3753 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode", 3754 cache_device_name(cache)); 3755 return -EOPNOTSUPP; 3756 } 3757 3758 if (!strcasecmp(argv[0], "invalidate_cblocks")) 3759 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); 3760 3761 if (argc != 2) 3762 return -EINVAL; 3763 3764 return set_config_value(cache, argv[0], argv[1]); 3765 } 3766 3767 static int cache_iterate_devices(struct dm_target *ti, 3768 iterate_devices_callout_fn fn, void *data) 3769 { 3770 int r = 0; 3771 struct cache *cache = ti->private; 3772 3773 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 3774 if (!r) 3775 r = fn(ti, cache->origin_dev, 0, ti->len, data); 3776 3777 return r; 3778 } 3779 3780 /* 3781 * We assume I/O is going to the origin (which is the volume 3782 * more likely to have restrictions e.g. by being striped). 3783 * (Looking up the exact location of the data would be expensive 3784 * and could always be out of date by the time the bio is submitted.) 3785 */ 3786 static int cache_bvec_merge(struct dm_target *ti, 3787 struct bvec_merge_data *bvm, 3788 struct bio_vec *biovec, int max_size) 3789 { 3790 struct cache *cache = ti->private; 3791 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); 3792 3793 if (!q->merge_bvec_fn) 3794 return max_size; 3795 3796 bvm->bi_bdev = cache->origin_dev->bdev; 3797 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 3798 } 3799 3800 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 3801 { 3802 /* 3803 * FIXME: these limits may be incompatible with the cache device 3804 */ 3805 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, 3806 cache->origin_sectors); 3807 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 3808 } 3809 3810 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 3811 { 3812 struct cache *cache = ti->private; 3813 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3814 3815 /* 3816 * If the system-determined stacked limits are compatible with the 3817 * cache's blocksize (io_opt is a factor) do not override them. 
3818 */ 3819 if (io_opt_sectors < cache->sectors_per_block || 3820 do_div(io_opt_sectors, cache->sectors_per_block)) { 3821 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); 3822 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 3823 } 3824 set_discard_limits(cache, limits); 3825 } 3826 3827 /*----------------------------------------------------------------*/ 3828 3829 static struct target_type cache_target = { 3830 .name = "cache", 3831 .version = {1, 8, 0}, 3832 .module = THIS_MODULE, 3833 .ctr = cache_ctr, 3834 .dtr = cache_dtr, 3835 .map = cache_map, 3836 .end_io = cache_end_io, 3837 .postsuspend = cache_postsuspend, 3838 .preresume = cache_preresume, 3839 .resume = cache_resume, 3840 .status = cache_status, 3841 .message = cache_message, 3842 .iterate_devices = cache_iterate_devices, 3843 .merge = cache_bvec_merge, 3844 .io_hints = cache_io_hints, 3845 }; 3846 3847 static int __init dm_cache_init(void) 3848 { 3849 int r; 3850 3851 r = dm_register_target(&cache_target); 3852 if (r) { 3853 DMERR("cache target registration failed: %d", r); 3854 return r; 3855 } 3856 3857 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 3858 if (!migration_cache) { 3859 dm_unregister_target(&cache_target); 3860 return -ENOMEM; 3861 } 3862 3863 return 0; 3864 } 3865 3866 static void __exit dm_cache_exit(void) 3867 { 3868 dm_unregister_target(&cache_target); 3869 kmem_cache_destroy(migration_cache); 3870 } 3871 3872 module_init(dm_cache_init); 3873 module_exit(dm_cache_exit); 3874 3875 MODULE_DESCRIPTION(DM_NAME " cache target"); 3876 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 3877 MODULE_LICENSE("GPL"); 3878