1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/init.h> 15 #include <linux/mempool.h> 16 #include <linux/module.h> 17 #include <linux/slab.h> 18 #include <linux/vmalloc.h> 19 20 #define DM_MSG_PREFIX "cache" 21 22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 23 "A percentage of time allocated for copying to and/or from cache"); 24 25 /*----------------------------------------------------------------*/ 26 27 /* 28 * Glossary: 29 * 30 * oblock: index of an origin block 31 * cblock: index of a cache block 32 * promotion: movement of a block from origin to cache 33 * demotion: movement of a block from cache to origin 34 * migration: movement of a block between the origin and cache device, 35 * either direction 36 */ 37 38 /*----------------------------------------------------------------*/ 39 40 static size_t bitset_size_in_bytes(unsigned nr_entries) 41 { 42 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); 43 } 44 45 static unsigned long *alloc_bitset(unsigned nr_entries) 46 { 47 size_t s = bitset_size_in_bytes(nr_entries); 48 return vzalloc(s); 49 } 50 51 static void clear_bitset(void *bitset, unsigned nr_entries) 52 { 53 size_t s = bitset_size_in_bytes(nr_entries); 54 memset(bitset, 0, s); 55 } 56 57 static void free_bitset(unsigned long *bits) 58 { 59 vfree(bits); 60 } 61 62 /*----------------------------------------------------------------*/ 63 64 #define PRISON_CELLS 1024 65 #define MIGRATION_POOL_SIZE 128 66 #define COMMIT_PERIOD HZ 67 #define MIGRATION_COUNT_WINDOW 10 68 69 /* 70 * The block size of the device holding cache data must be 71 * between 32KB and 1GB. 72 */ 73 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 74 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 75 76 /* 77 * FIXME: the cache is read/write for the time being. 78 */ 79 enum cache_mode { 80 CM_WRITE, /* metadata may be changed */ 81 CM_READ_ONLY, /* metadata may not be changed */ 82 }; 83 84 struct cache_features { 85 enum cache_mode mode; 86 bool write_through:1; 87 }; 88 89 struct cache_stats { 90 atomic_t read_hit; 91 atomic_t read_miss; 92 atomic_t write_hit; 93 atomic_t write_miss; 94 atomic_t demotion; 95 atomic_t promotion; 96 atomic_t copies_avoided; 97 atomic_t cache_cell_clash; 98 atomic_t commit_count; 99 atomic_t discard_count; 100 }; 101 102 struct cache { 103 struct dm_target *ti; 104 struct dm_target_callbacks callbacks; 105 106 struct dm_cache_metadata *cmd; 107 108 /* 109 * Metadata is written to this device. 110 */ 111 struct dm_dev *metadata_dev; 112 113 /* 114 * The slower of the two data devices. Typically a spindle. 115 */ 116 struct dm_dev *origin_dev; 117 118 /* 119 * The faster of the two data devices. Typically an SSD. 120 */ 121 struct dm_dev *cache_dev; 122 123 /* 124 * Size of the origin device in _complete_ blocks and native sectors. 125 */ 126 dm_oblock_t origin_blocks; 127 sector_t origin_sectors; 128 129 /* 130 * Size of the cache device in blocks. 131 */ 132 dm_cblock_t cache_size; 133 134 /* 135 * Fields for converting from sectors to blocks. 136 */ 137 uint32_t sectors_per_block; 138 int sectors_per_block_shift; 139 140 spinlock_t lock; 141 struct bio_list deferred_bios; 142 struct bio_list deferred_flush_bios; 143 struct bio_list deferred_writethrough_bios; 144 struct list_head quiesced_migrations; 145 struct list_head completed_migrations; 146 struct list_head need_commit_migrations; 147 sector_t migration_threshold; 148 wait_queue_head_t migration_wait; 149 atomic_t nr_migrations; 150 151 /* 152 * cache_size entries, dirty if set 153 */ 154 dm_cblock_t nr_dirty; 155 unsigned long *dirty_bitset; 156 157 /* 158 * origin_blocks entries, discarded if set. 159 */ 160 dm_dblock_t discard_nr_blocks; 161 unsigned long *discard_bitset; 162 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 163 164 /* 165 * Rather than reconstructing the table line for the status we just 166 * save it and regurgitate. 167 */ 168 unsigned nr_ctr_args; 169 const char **ctr_args; 170 171 struct dm_kcopyd_client *copier; 172 struct workqueue_struct *wq; 173 struct work_struct worker; 174 175 struct delayed_work waker; 176 unsigned long last_commit_jiffies; 177 178 struct dm_bio_prison *prison; 179 struct dm_deferred_set *all_io_ds; 180 181 mempool_t *migration_pool; 182 struct dm_cache_migration *next_migration; 183 184 struct dm_cache_policy *policy; 185 unsigned policy_nr_args; 186 187 bool need_tick_bio:1; 188 bool sized:1; 189 bool quiescing:1; 190 bool commit_requested:1; 191 bool loaded_mappings:1; 192 bool loaded_discards:1; 193 194 /* 195 * Cache features such as write-through. 196 */ 197 struct cache_features features; 198 199 struct cache_stats stats; 200 }; 201 202 struct per_bio_data { 203 bool tick:1; 204 unsigned req_nr:2; 205 struct dm_deferred_entry *all_io_entry; 206 207 /* 208 * writethrough fields. These MUST remain at the end of this 209 * structure and the 'cache' member must be the first as it 210 * is used to determine the offset of the writethrough fields. 211 */ 212 struct cache *cache; 213 dm_cblock_t cblock; 214 bio_end_io_t *saved_bi_end_io; 215 struct dm_bio_details bio_details; 216 }; 217 218 struct dm_cache_migration { 219 struct list_head list; 220 struct cache *cache; 221 222 unsigned long start_jiffies; 223 dm_oblock_t old_oblock; 224 dm_oblock_t new_oblock; 225 dm_cblock_t cblock; 226 227 bool err:1; 228 bool writeback:1; 229 bool demote:1; 230 bool promote:1; 231 232 struct dm_bio_prison_cell *old_ocell; 233 struct dm_bio_prison_cell *new_ocell; 234 }; 235 236 /* 237 * Processing a bio in the worker thread may require these memory 238 * allocations. We prealloc to avoid deadlocks (the same worker thread 239 * frees them back to the mempool). 240 */ 241 struct prealloc { 242 struct dm_cache_migration *mg; 243 struct dm_bio_prison_cell *cell1; 244 struct dm_bio_prison_cell *cell2; 245 }; 246 247 static void wake_worker(struct cache *cache) 248 { 249 queue_work(cache->wq, &cache->worker); 250 } 251 252 /*----------------------------------------------------------------*/ 253 254 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 255 { 256 /* FIXME: change to use a local slab. */ 257 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 258 } 259 260 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 261 { 262 dm_bio_prison_free_cell(cache->prison, cell); 263 } 264 265 static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 266 { 267 if (!p->mg) { 268 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 269 if (!p->mg) 270 return -ENOMEM; 271 } 272 273 if (!p->cell1) { 274 p->cell1 = alloc_prison_cell(cache); 275 if (!p->cell1) 276 return -ENOMEM; 277 } 278 279 if (!p->cell2) { 280 p->cell2 = alloc_prison_cell(cache); 281 if (!p->cell2) 282 return -ENOMEM; 283 } 284 285 return 0; 286 } 287 288 static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 289 { 290 if (p->cell2) 291 free_prison_cell(cache, p->cell2); 292 293 if (p->cell1) 294 free_prison_cell(cache, p->cell1); 295 296 if (p->mg) 297 mempool_free(p->mg, cache->migration_pool); 298 } 299 300 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 301 { 302 struct dm_cache_migration *mg = p->mg; 303 304 BUG_ON(!mg); 305 p->mg = NULL; 306 307 return mg; 308 } 309 310 /* 311 * You must have a cell within the prealloc struct to return. If not this 312 * function will BUG() rather than returning NULL. 313 */ 314 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 315 { 316 struct dm_bio_prison_cell *r = NULL; 317 318 if (p->cell1) { 319 r = p->cell1; 320 p->cell1 = NULL; 321 322 } else if (p->cell2) { 323 r = p->cell2; 324 p->cell2 = NULL; 325 } else 326 BUG(); 327 328 return r; 329 } 330 331 /* 332 * You can't have more than two cells in a prealloc struct. BUG() will be 333 * called if you try and overfill. 334 */ 335 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 336 { 337 if (!p->cell2) 338 p->cell2 = cell; 339 340 else if (!p->cell1) 341 p->cell1 = cell; 342 343 else 344 BUG(); 345 } 346 347 /*----------------------------------------------------------------*/ 348 349 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) 350 { 351 key->virtual = 0; 352 key->dev = 0; 353 key->block = from_oblock(oblock); 354 } 355 356 /* 357 * The caller hands in a preallocated cell, and a free function for it. 358 * The cell will be freed if there's an error, or if it wasn't used because 359 * a cell with that key already exists. 360 */ 361 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 362 363 static int bio_detain(struct cache *cache, dm_oblock_t oblock, 364 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 365 cell_free_fn free_fn, void *free_context, 366 struct dm_bio_prison_cell **cell_result) 367 { 368 int r; 369 struct dm_cell_key key; 370 371 build_key(oblock, &key); 372 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 373 if (r) 374 free_fn(free_context, cell_prealloc); 375 376 return r; 377 } 378 379 static int get_cell(struct cache *cache, 380 dm_oblock_t oblock, 381 struct prealloc *structs, 382 struct dm_bio_prison_cell **cell_result) 383 { 384 int r; 385 struct dm_cell_key key; 386 struct dm_bio_prison_cell *cell_prealloc; 387 388 cell_prealloc = prealloc_get_cell(structs); 389 390 build_key(oblock, &key); 391 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 392 if (r) 393 prealloc_put_cell(structs, cell_prealloc); 394 395 return r; 396 } 397 398 /*----------------------------------------------------------------*/ 399 400 static bool is_dirty(struct cache *cache, dm_cblock_t b) 401 { 402 return test_bit(from_cblock(b), cache->dirty_bitset); 403 } 404 405 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 406 { 407 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 408 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1); 409 policy_set_dirty(cache->policy, oblock); 410 } 411 } 412 413 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 414 { 415 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 416 policy_clear_dirty(cache->policy, oblock); 417 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1); 418 if (!from_cblock(cache->nr_dirty)) 419 dm_table_event(cache->ti->table); 420 } 421 } 422 423 /*----------------------------------------------------------------*/ 424 425 static bool block_size_is_power_of_two(struct cache *cache) 426 { 427 return cache->sectors_per_block_shift >= 0; 428 } 429 430 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 431 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 432 __always_inline 433 #endif 434 static dm_block_t block_div(dm_block_t b, uint32_t n) 435 { 436 do_div(b, n); 437 438 return b; 439 } 440 441 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 442 { 443 uint32_t discard_blocks = cache->discard_block_size; 444 dm_block_t b = from_oblock(oblock); 445 446 if (!block_size_is_power_of_two(cache)) 447 discard_blocks = discard_blocks / cache->sectors_per_block; 448 else 449 discard_blocks >>= cache->sectors_per_block_shift; 450 451 b = block_div(b, discard_blocks); 452 453 return to_dblock(b); 454 } 455 456 static void set_discard(struct cache *cache, dm_dblock_t b) 457 { 458 unsigned long flags; 459 460 atomic_inc(&cache->stats.discard_count); 461 462 spin_lock_irqsave(&cache->lock, flags); 463 set_bit(from_dblock(b), cache->discard_bitset); 464 spin_unlock_irqrestore(&cache->lock, flags); 465 } 466 467 static void clear_discard(struct cache *cache, dm_dblock_t b) 468 { 469 unsigned long flags; 470 471 spin_lock_irqsave(&cache->lock, flags); 472 clear_bit(from_dblock(b), cache->discard_bitset); 473 spin_unlock_irqrestore(&cache->lock, flags); 474 } 475 476 static bool is_discarded(struct cache *cache, dm_dblock_t b) 477 { 478 int r; 479 unsigned long flags; 480 481 spin_lock_irqsave(&cache->lock, flags); 482 r = test_bit(from_dblock(b), cache->discard_bitset); 483 spin_unlock_irqrestore(&cache->lock, flags); 484 485 return r; 486 } 487 488 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) 489 { 490 int r; 491 unsigned long flags; 492 493 spin_lock_irqsave(&cache->lock, flags); 494 r = test_bit(from_dblock(oblock_to_dblock(cache, b)), 495 cache->discard_bitset); 496 spin_unlock_irqrestore(&cache->lock, flags); 497 498 return r; 499 } 500 501 /*----------------------------------------------------------------*/ 502 503 static void load_stats(struct cache *cache) 504 { 505 struct dm_cache_statistics stats; 506 507 dm_cache_metadata_get_stats(cache->cmd, &stats); 508 atomic_set(&cache->stats.read_hit, stats.read_hits); 509 atomic_set(&cache->stats.read_miss, stats.read_misses); 510 atomic_set(&cache->stats.write_hit, stats.write_hits); 511 atomic_set(&cache->stats.write_miss, stats.write_misses); 512 } 513 514 static void save_stats(struct cache *cache) 515 { 516 struct dm_cache_statistics stats; 517 518 stats.read_hits = atomic_read(&cache->stats.read_hit); 519 stats.read_misses = atomic_read(&cache->stats.read_miss); 520 stats.write_hits = atomic_read(&cache->stats.write_hit); 521 stats.write_misses = atomic_read(&cache->stats.write_miss); 522 523 dm_cache_metadata_set_stats(cache->cmd, &stats); 524 } 525 526 /*---------------------------------------------------------------- 527 * Per bio data 528 *--------------------------------------------------------------*/ 529 530 /* 531 * If using writeback, leave out struct per_bio_data's writethrough fields. 532 */ 533 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) 534 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) 535 536 static size_t get_per_bio_data_size(struct cache *cache) 537 { 538 return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 539 } 540 541 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 542 { 543 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 544 BUG_ON(!pb); 545 return pb; 546 } 547 548 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 549 { 550 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 551 552 pb->tick = false; 553 pb->req_nr = dm_bio_get_target_bio_nr(bio); 554 pb->all_io_entry = NULL; 555 556 return pb; 557 } 558 559 /*---------------------------------------------------------------- 560 * Remapping 561 *--------------------------------------------------------------*/ 562 static void remap_to_origin(struct cache *cache, struct bio *bio) 563 { 564 bio->bi_bdev = cache->origin_dev->bdev; 565 } 566 567 static void remap_to_cache(struct cache *cache, struct bio *bio, 568 dm_cblock_t cblock) 569 { 570 sector_t bi_sector = bio->bi_sector; 571 572 bio->bi_bdev = cache->cache_dev->bdev; 573 if (!block_size_is_power_of_two(cache)) 574 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + 575 sector_div(bi_sector, cache->sectors_per_block); 576 else 577 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | 578 (bi_sector & (cache->sectors_per_block - 1)); 579 } 580 581 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 582 { 583 unsigned long flags; 584 size_t pb_data_size = get_per_bio_data_size(cache); 585 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 586 587 spin_lock_irqsave(&cache->lock, flags); 588 if (cache->need_tick_bio && 589 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) { 590 pb->tick = true; 591 cache->need_tick_bio = false; 592 } 593 spin_unlock_irqrestore(&cache->lock, flags); 594 } 595 596 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 597 dm_oblock_t oblock) 598 { 599 check_if_tick_bio_needed(cache, bio); 600 remap_to_origin(cache, bio); 601 if (bio_data_dir(bio) == WRITE) 602 clear_discard(cache, oblock_to_dblock(cache, oblock)); 603 } 604 605 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 606 dm_oblock_t oblock, dm_cblock_t cblock) 607 { 608 remap_to_cache(cache, bio, cblock); 609 if (bio_data_dir(bio) == WRITE) { 610 set_dirty(cache, oblock, cblock); 611 clear_discard(cache, oblock_to_dblock(cache, oblock)); 612 } 613 } 614 615 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 616 { 617 sector_t block_nr = bio->bi_sector; 618 619 if (!block_size_is_power_of_two(cache)) 620 (void) sector_div(block_nr, cache->sectors_per_block); 621 else 622 block_nr >>= cache->sectors_per_block_shift; 623 624 return to_oblock(block_nr); 625 } 626 627 static int bio_triggers_commit(struct cache *cache, struct bio *bio) 628 { 629 return bio->bi_rw & (REQ_FLUSH | REQ_FUA); 630 } 631 632 static void issue(struct cache *cache, struct bio *bio) 633 { 634 unsigned long flags; 635 636 if (!bio_triggers_commit(cache, bio)) { 637 generic_make_request(bio); 638 return; 639 } 640 641 /* 642 * Batch together any bios that trigger commits and then issue a 643 * single commit for them in do_worker(). 644 */ 645 spin_lock_irqsave(&cache->lock, flags); 646 cache->commit_requested = true; 647 bio_list_add(&cache->deferred_flush_bios, bio); 648 spin_unlock_irqrestore(&cache->lock, flags); 649 } 650 651 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 652 { 653 unsigned long flags; 654 655 spin_lock_irqsave(&cache->lock, flags); 656 bio_list_add(&cache->deferred_writethrough_bios, bio); 657 spin_unlock_irqrestore(&cache->lock, flags); 658 659 wake_worker(cache); 660 } 661 662 static void writethrough_endio(struct bio *bio, int err) 663 { 664 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 665 bio->bi_end_io = pb->saved_bi_end_io; 666 667 if (err) { 668 bio_endio(bio, err); 669 return; 670 } 671 672 dm_bio_restore(&pb->bio_details, bio); 673 remap_to_cache(pb->cache, bio, pb->cblock); 674 675 /* 676 * We can't issue this bio directly, since we're in interrupt 677 * context. So it gets put on a bio list for processing by the 678 * worker thread. 679 */ 680 defer_writethrough_bio(pb->cache, bio); 681 } 682 683 /* 684 * When running in writethrough mode we need to send writes to clean blocks 685 * to both the cache and origin devices. In future we'd like to clone the 686 * bio and send them in parallel, but for now we're doing them in 687 * series as this is easier. 688 */ 689 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 690 dm_oblock_t oblock, dm_cblock_t cblock) 691 { 692 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 693 694 pb->cache = cache; 695 pb->cblock = cblock; 696 pb->saved_bi_end_io = bio->bi_end_io; 697 dm_bio_record(&pb->bio_details, bio); 698 bio->bi_end_io = writethrough_endio; 699 700 remap_to_origin_clear_discard(pb->cache, bio, oblock); 701 } 702 703 /*---------------------------------------------------------------- 704 * Migration processing 705 * 706 * Migration covers moving data from the origin device to the cache, or 707 * vice versa. 708 *--------------------------------------------------------------*/ 709 static void free_migration(struct dm_cache_migration *mg) 710 { 711 mempool_free(mg, mg->cache->migration_pool); 712 } 713 714 static void inc_nr_migrations(struct cache *cache) 715 { 716 atomic_inc(&cache->nr_migrations); 717 } 718 719 static void dec_nr_migrations(struct cache *cache) 720 { 721 atomic_dec(&cache->nr_migrations); 722 723 /* 724 * Wake the worker in case we're suspending the target. 725 */ 726 wake_up(&cache->migration_wait); 727 } 728 729 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, 730 bool holder) 731 { 732 (holder ? dm_cell_release : dm_cell_release_no_holder) 733 (cache->prison, cell, &cache->deferred_bios); 734 free_prison_cell(cache, cell); 735 } 736 737 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, 738 bool holder) 739 { 740 unsigned long flags; 741 742 spin_lock_irqsave(&cache->lock, flags); 743 __cell_defer(cache, cell, holder); 744 spin_unlock_irqrestore(&cache->lock, flags); 745 746 wake_worker(cache); 747 } 748 749 static void cleanup_migration(struct dm_cache_migration *mg) 750 { 751 dec_nr_migrations(mg->cache); 752 free_migration(mg); 753 } 754 755 static void migration_failure(struct dm_cache_migration *mg) 756 { 757 struct cache *cache = mg->cache; 758 759 if (mg->writeback) { 760 DMWARN_LIMIT("writeback failed; couldn't copy block"); 761 set_dirty(cache, mg->old_oblock, mg->cblock); 762 cell_defer(cache, mg->old_ocell, false); 763 764 } else if (mg->demote) { 765 DMWARN_LIMIT("demotion failed; couldn't copy block"); 766 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 767 768 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); 769 if (mg->promote) 770 cell_defer(cache, mg->new_ocell, 1); 771 } else { 772 DMWARN_LIMIT("promotion failed; couldn't copy block"); 773 policy_remove_mapping(cache->policy, mg->new_oblock); 774 cell_defer(cache, mg->new_ocell, 1); 775 } 776 777 cleanup_migration(mg); 778 } 779 780 static void migration_success_pre_commit(struct dm_cache_migration *mg) 781 { 782 unsigned long flags; 783 struct cache *cache = mg->cache; 784 785 if (mg->writeback) { 786 cell_defer(cache, mg->old_ocell, false); 787 clear_dirty(cache, mg->old_oblock, mg->cblock); 788 cleanup_migration(mg); 789 return; 790 791 } else if (mg->demote) { 792 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) { 793 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata"); 794 policy_force_mapping(cache->policy, mg->new_oblock, 795 mg->old_oblock); 796 if (mg->promote) 797 cell_defer(cache, mg->new_ocell, true); 798 cleanup_migration(mg); 799 return; 800 } 801 } else { 802 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { 803 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); 804 policy_remove_mapping(cache->policy, mg->new_oblock); 805 cleanup_migration(mg); 806 return; 807 } 808 } 809 810 spin_lock_irqsave(&cache->lock, flags); 811 list_add_tail(&mg->list, &cache->need_commit_migrations); 812 cache->commit_requested = true; 813 spin_unlock_irqrestore(&cache->lock, flags); 814 } 815 816 static void migration_success_post_commit(struct dm_cache_migration *mg) 817 { 818 unsigned long flags; 819 struct cache *cache = mg->cache; 820 821 if (mg->writeback) { 822 DMWARN("writeback unexpectedly triggered commit"); 823 return; 824 825 } else if (mg->demote) { 826 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); 827 828 if (mg->promote) { 829 mg->demote = false; 830 831 spin_lock_irqsave(&cache->lock, flags); 832 list_add_tail(&mg->list, &cache->quiesced_migrations); 833 spin_unlock_irqrestore(&cache->lock, flags); 834 835 } else 836 cleanup_migration(mg); 837 838 } else { 839 cell_defer(cache, mg->new_ocell, true); 840 clear_dirty(cache, mg->new_oblock, mg->cblock); 841 cleanup_migration(mg); 842 } 843 } 844 845 static void copy_complete(int read_err, unsigned long write_err, void *context) 846 { 847 unsigned long flags; 848 struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 849 struct cache *cache = mg->cache; 850 851 if (read_err || write_err) 852 mg->err = true; 853 854 spin_lock_irqsave(&cache->lock, flags); 855 list_add_tail(&mg->list, &cache->completed_migrations); 856 spin_unlock_irqrestore(&cache->lock, flags); 857 858 wake_worker(cache); 859 } 860 861 static void issue_copy_real(struct dm_cache_migration *mg) 862 { 863 int r; 864 struct dm_io_region o_region, c_region; 865 struct cache *cache = mg->cache; 866 867 o_region.bdev = cache->origin_dev->bdev; 868 o_region.count = cache->sectors_per_block; 869 870 c_region.bdev = cache->cache_dev->bdev; 871 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; 872 c_region.count = cache->sectors_per_block; 873 874 if (mg->writeback || mg->demote) { 875 /* demote */ 876 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 877 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 878 } else { 879 /* promote */ 880 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 881 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 882 } 883 884 if (r < 0) 885 migration_failure(mg); 886 } 887 888 static void avoid_copy(struct dm_cache_migration *mg) 889 { 890 atomic_inc(&mg->cache->stats.copies_avoided); 891 migration_success_pre_commit(mg); 892 } 893 894 static void issue_copy(struct dm_cache_migration *mg) 895 { 896 bool avoid; 897 struct cache *cache = mg->cache; 898 899 if (mg->writeback || mg->demote) 900 avoid = !is_dirty(cache, mg->cblock) || 901 is_discarded_oblock(cache, mg->old_oblock); 902 else 903 avoid = is_discarded_oblock(cache, mg->new_oblock); 904 905 avoid ? avoid_copy(mg) : issue_copy_real(mg); 906 } 907 908 static void complete_migration(struct dm_cache_migration *mg) 909 { 910 if (mg->err) 911 migration_failure(mg); 912 else 913 migration_success_pre_commit(mg); 914 } 915 916 static void process_migrations(struct cache *cache, struct list_head *head, 917 void (*fn)(struct dm_cache_migration *)) 918 { 919 unsigned long flags; 920 struct list_head list; 921 struct dm_cache_migration *mg, *tmp; 922 923 INIT_LIST_HEAD(&list); 924 spin_lock_irqsave(&cache->lock, flags); 925 list_splice_init(head, &list); 926 spin_unlock_irqrestore(&cache->lock, flags); 927 928 list_for_each_entry_safe(mg, tmp, &list, list) 929 fn(mg); 930 } 931 932 static void __queue_quiesced_migration(struct dm_cache_migration *mg) 933 { 934 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 935 } 936 937 static void queue_quiesced_migration(struct dm_cache_migration *mg) 938 { 939 unsigned long flags; 940 struct cache *cache = mg->cache; 941 942 spin_lock_irqsave(&cache->lock, flags); 943 __queue_quiesced_migration(mg); 944 spin_unlock_irqrestore(&cache->lock, flags); 945 946 wake_worker(cache); 947 } 948 949 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 950 { 951 unsigned long flags; 952 struct dm_cache_migration *mg, *tmp; 953 954 spin_lock_irqsave(&cache->lock, flags); 955 list_for_each_entry_safe(mg, tmp, work, list) 956 __queue_quiesced_migration(mg); 957 spin_unlock_irqrestore(&cache->lock, flags); 958 959 wake_worker(cache); 960 } 961 962 static void check_for_quiesced_migrations(struct cache *cache, 963 struct per_bio_data *pb) 964 { 965 struct list_head work; 966 967 if (!pb->all_io_entry) 968 return; 969 970 INIT_LIST_HEAD(&work); 971 if (pb->all_io_entry) 972 dm_deferred_entry_dec(pb->all_io_entry, &work); 973 974 if (!list_empty(&work)) 975 queue_quiesced_migrations(cache, &work); 976 } 977 978 static void quiesce_migration(struct dm_cache_migration *mg) 979 { 980 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 981 queue_quiesced_migration(mg); 982 } 983 984 static void promote(struct cache *cache, struct prealloc *structs, 985 dm_oblock_t oblock, dm_cblock_t cblock, 986 struct dm_bio_prison_cell *cell) 987 { 988 struct dm_cache_migration *mg = prealloc_get_migration(structs); 989 990 mg->err = false; 991 mg->writeback = false; 992 mg->demote = false; 993 mg->promote = true; 994 mg->cache = cache; 995 mg->new_oblock = oblock; 996 mg->cblock = cblock; 997 mg->old_ocell = NULL; 998 mg->new_ocell = cell; 999 mg->start_jiffies = jiffies; 1000 1001 inc_nr_migrations(cache); 1002 quiesce_migration(mg); 1003 } 1004 1005 static void writeback(struct cache *cache, struct prealloc *structs, 1006 dm_oblock_t oblock, dm_cblock_t cblock, 1007 struct dm_bio_prison_cell *cell) 1008 { 1009 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1010 1011 mg->err = false; 1012 mg->writeback = true; 1013 mg->demote = false; 1014 mg->promote = false; 1015 mg->cache = cache; 1016 mg->old_oblock = oblock; 1017 mg->cblock = cblock; 1018 mg->old_ocell = cell; 1019 mg->new_ocell = NULL; 1020 mg->start_jiffies = jiffies; 1021 1022 inc_nr_migrations(cache); 1023 quiesce_migration(mg); 1024 } 1025 1026 static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1027 dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1028 dm_cblock_t cblock, 1029 struct dm_bio_prison_cell *old_ocell, 1030 struct dm_bio_prison_cell *new_ocell) 1031 { 1032 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1033 1034 mg->err = false; 1035 mg->writeback = false; 1036 mg->demote = true; 1037 mg->promote = true; 1038 mg->cache = cache; 1039 mg->old_oblock = old_oblock; 1040 mg->new_oblock = new_oblock; 1041 mg->cblock = cblock; 1042 mg->old_ocell = old_ocell; 1043 mg->new_ocell = new_ocell; 1044 mg->start_jiffies = jiffies; 1045 1046 inc_nr_migrations(cache); 1047 quiesce_migration(mg); 1048 } 1049 1050 /*---------------------------------------------------------------- 1051 * bio processing 1052 *--------------------------------------------------------------*/ 1053 static void defer_bio(struct cache *cache, struct bio *bio) 1054 { 1055 unsigned long flags; 1056 1057 spin_lock_irqsave(&cache->lock, flags); 1058 bio_list_add(&cache->deferred_bios, bio); 1059 spin_unlock_irqrestore(&cache->lock, flags); 1060 1061 wake_worker(cache); 1062 } 1063 1064 static void process_flush_bio(struct cache *cache, struct bio *bio) 1065 { 1066 size_t pb_data_size = get_per_bio_data_size(cache); 1067 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1068 1069 BUG_ON(bio->bi_size); 1070 if (!pb->req_nr) 1071 remap_to_origin(cache, bio); 1072 else 1073 remap_to_cache(cache, bio, 0); 1074 1075 issue(cache, bio); 1076 } 1077 1078 /* 1079 * People generally discard large parts of a device, eg, the whole device 1080 * when formatting. Splitting these large discards up into cache block 1081 * sized ios and then quiescing (always neccessary for discard) takes too 1082 * long. 1083 * 1084 * We keep it simple, and allow any size of discard to come in, and just 1085 * mark off blocks on the discard bitset. No passdown occurs! 1086 * 1087 * To implement passdown we need to change the bio_prison such that a cell 1088 * can have a key that spans many blocks. 1089 */ 1090 static void process_discard_bio(struct cache *cache, struct bio *bio) 1091 { 1092 dm_block_t start_block = dm_sector_div_up(bio->bi_sector, 1093 cache->discard_block_size); 1094 dm_block_t end_block = bio->bi_sector + bio_sectors(bio); 1095 dm_block_t b; 1096 1097 end_block = block_div(end_block, cache->discard_block_size); 1098 1099 for (b = start_block; b < end_block; b++) 1100 set_discard(cache, to_dblock(b)); 1101 1102 bio_endio(bio, 0); 1103 } 1104 1105 static bool spare_migration_bandwidth(struct cache *cache) 1106 { 1107 sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) * 1108 cache->sectors_per_block; 1109 return current_volume < cache->migration_threshold; 1110 } 1111 1112 static bool is_writethrough_io(struct cache *cache, struct bio *bio, 1113 dm_cblock_t cblock) 1114 { 1115 return bio_data_dir(bio) == WRITE && 1116 cache->features.write_through && !is_dirty(cache, cblock); 1117 } 1118 1119 static void inc_hit_counter(struct cache *cache, struct bio *bio) 1120 { 1121 atomic_inc(bio_data_dir(bio) == READ ? 1122 &cache->stats.read_hit : &cache->stats.write_hit); 1123 } 1124 1125 static void inc_miss_counter(struct cache *cache, struct bio *bio) 1126 { 1127 atomic_inc(bio_data_dir(bio) == READ ? 1128 &cache->stats.read_miss : &cache->stats.write_miss); 1129 } 1130 1131 static void process_bio(struct cache *cache, struct prealloc *structs, 1132 struct bio *bio) 1133 { 1134 int r; 1135 bool release_cell = true; 1136 dm_oblock_t block = get_bio_block(cache, bio); 1137 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; 1138 struct policy_result lookup_result; 1139 size_t pb_data_size = get_per_bio_data_size(cache); 1140 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1141 bool discarded_block = is_discarded_oblock(cache, block); 1142 bool can_migrate = discarded_block || spare_migration_bandwidth(cache); 1143 1144 /* 1145 * Check to see if that block is currently migrating. 1146 */ 1147 cell_prealloc = prealloc_get_cell(structs); 1148 r = bio_detain(cache, block, bio, cell_prealloc, 1149 (cell_free_fn) prealloc_put_cell, 1150 structs, &new_ocell); 1151 if (r > 0) 1152 return; 1153 1154 r = policy_map(cache->policy, block, true, can_migrate, discarded_block, 1155 bio, &lookup_result); 1156 1157 if (r == -EWOULDBLOCK) 1158 /* migration has been denied */ 1159 lookup_result.op = POLICY_MISS; 1160 1161 switch (lookup_result.op) { 1162 case POLICY_HIT: 1163 inc_hit_counter(cache, bio); 1164 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1165 1166 if (is_writethrough_io(cache, bio, lookup_result.cblock)) 1167 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1168 else 1169 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 1170 1171 issue(cache, bio); 1172 break; 1173 1174 case POLICY_MISS: 1175 inc_miss_counter(cache, bio); 1176 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1177 remap_to_origin_clear_discard(cache, bio, block); 1178 issue(cache, bio); 1179 break; 1180 1181 case POLICY_NEW: 1182 atomic_inc(&cache->stats.promotion); 1183 promote(cache, structs, block, lookup_result.cblock, new_ocell); 1184 release_cell = false; 1185 break; 1186 1187 case POLICY_REPLACE: 1188 cell_prealloc = prealloc_get_cell(structs); 1189 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc, 1190 (cell_free_fn) prealloc_put_cell, 1191 structs, &old_ocell); 1192 if (r > 0) { 1193 /* 1194 * We have to be careful to avoid lock inversion of 1195 * the cells. So we back off, and wait for the 1196 * old_ocell to become free. 1197 */ 1198 policy_force_mapping(cache->policy, block, 1199 lookup_result.old_oblock); 1200 atomic_inc(&cache->stats.cache_cell_clash); 1201 break; 1202 } 1203 atomic_inc(&cache->stats.demotion); 1204 atomic_inc(&cache->stats.promotion); 1205 1206 demote_then_promote(cache, structs, lookup_result.old_oblock, 1207 block, lookup_result.cblock, 1208 old_ocell, new_ocell); 1209 release_cell = false; 1210 break; 1211 1212 default: 1213 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__, 1214 (unsigned) lookup_result.op); 1215 bio_io_error(bio); 1216 } 1217 1218 if (release_cell) 1219 cell_defer(cache, new_ocell, false); 1220 } 1221 1222 static int need_commit_due_to_time(struct cache *cache) 1223 { 1224 return jiffies < cache->last_commit_jiffies || 1225 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1226 } 1227 1228 static int commit_if_needed(struct cache *cache) 1229 { 1230 if (dm_cache_changed_this_transaction(cache->cmd) && 1231 (cache->commit_requested || need_commit_due_to_time(cache))) { 1232 atomic_inc(&cache->stats.commit_count); 1233 cache->last_commit_jiffies = jiffies; 1234 cache->commit_requested = false; 1235 return dm_cache_commit(cache->cmd, false); 1236 } 1237 1238 return 0; 1239 } 1240 1241 static void process_deferred_bios(struct cache *cache) 1242 { 1243 unsigned long flags; 1244 struct bio_list bios; 1245 struct bio *bio; 1246 struct prealloc structs; 1247 1248 memset(&structs, 0, sizeof(structs)); 1249 bio_list_init(&bios); 1250 1251 spin_lock_irqsave(&cache->lock, flags); 1252 bio_list_merge(&bios, &cache->deferred_bios); 1253 bio_list_init(&cache->deferred_bios); 1254 spin_unlock_irqrestore(&cache->lock, flags); 1255 1256 while (!bio_list_empty(&bios)) { 1257 /* 1258 * If we've got no free migration structs, and processing 1259 * this bio might require one, we pause until there are some 1260 * prepared mappings to process. 1261 */ 1262 if (prealloc_data_structs(cache, &structs)) { 1263 spin_lock_irqsave(&cache->lock, flags); 1264 bio_list_merge(&cache->deferred_bios, &bios); 1265 spin_unlock_irqrestore(&cache->lock, flags); 1266 break; 1267 } 1268 1269 bio = bio_list_pop(&bios); 1270 1271 if (bio->bi_rw & REQ_FLUSH) 1272 process_flush_bio(cache, bio); 1273 else if (bio->bi_rw & REQ_DISCARD) 1274 process_discard_bio(cache, bio); 1275 else 1276 process_bio(cache, &structs, bio); 1277 } 1278 1279 prealloc_free_structs(cache, &structs); 1280 } 1281 1282 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 1283 { 1284 unsigned long flags; 1285 struct bio_list bios; 1286 struct bio *bio; 1287 1288 bio_list_init(&bios); 1289 1290 spin_lock_irqsave(&cache->lock, flags); 1291 bio_list_merge(&bios, &cache->deferred_flush_bios); 1292 bio_list_init(&cache->deferred_flush_bios); 1293 spin_unlock_irqrestore(&cache->lock, flags); 1294 1295 while ((bio = bio_list_pop(&bios))) 1296 submit_bios ? generic_make_request(bio) : bio_io_error(bio); 1297 } 1298 1299 static void process_deferred_writethrough_bios(struct cache *cache) 1300 { 1301 unsigned long flags; 1302 struct bio_list bios; 1303 struct bio *bio; 1304 1305 bio_list_init(&bios); 1306 1307 spin_lock_irqsave(&cache->lock, flags); 1308 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 1309 bio_list_init(&cache->deferred_writethrough_bios); 1310 spin_unlock_irqrestore(&cache->lock, flags); 1311 1312 while ((bio = bio_list_pop(&bios))) 1313 generic_make_request(bio); 1314 } 1315 1316 static void writeback_some_dirty_blocks(struct cache *cache) 1317 { 1318 int r = 0; 1319 dm_oblock_t oblock; 1320 dm_cblock_t cblock; 1321 struct prealloc structs; 1322 struct dm_bio_prison_cell *old_ocell; 1323 1324 memset(&structs, 0, sizeof(structs)); 1325 1326 while (spare_migration_bandwidth(cache)) { 1327 if (prealloc_data_structs(cache, &structs)) 1328 break; 1329 1330 r = policy_writeback_work(cache->policy, &oblock, &cblock); 1331 if (r) 1332 break; 1333 1334 r = get_cell(cache, oblock, &structs, &old_ocell); 1335 if (r) { 1336 policy_set_dirty(cache->policy, oblock); 1337 break; 1338 } 1339 1340 writeback(cache, &structs, oblock, cblock, old_ocell); 1341 } 1342 1343 prealloc_free_structs(cache, &structs); 1344 } 1345 1346 /*---------------------------------------------------------------- 1347 * Main worker loop 1348 *--------------------------------------------------------------*/ 1349 static void start_quiescing(struct cache *cache) 1350 { 1351 unsigned long flags; 1352 1353 spin_lock_irqsave(&cache->lock, flags); 1354 cache->quiescing = 1; 1355 spin_unlock_irqrestore(&cache->lock, flags); 1356 } 1357 1358 static void stop_quiescing(struct cache *cache) 1359 { 1360 unsigned long flags; 1361 1362 spin_lock_irqsave(&cache->lock, flags); 1363 cache->quiescing = 0; 1364 spin_unlock_irqrestore(&cache->lock, flags); 1365 } 1366 1367 static bool is_quiescing(struct cache *cache) 1368 { 1369 int r; 1370 unsigned long flags; 1371 1372 spin_lock_irqsave(&cache->lock, flags); 1373 r = cache->quiescing; 1374 spin_unlock_irqrestore(&cache->lock, flags); 1375 1376 return r; 1377 } 1378 1379 static void wait_for_migrations(struct cache *cache) 1380 { 1381 wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); 1382 } 1383 1384 static void stop_worker(struct cache *cache) 1385 { 1386 cancel_delayed_work(&cache->waker); 1387 flush_workqueue(cache->wq); 1388 } 1389 1390 static void requeue_deferred_io(struct cache *cache) 1391 { 1392 struct bio *bio; 1393 struct bio_list bios; 1394 1395 bio_list_init(&bios); 1396 bio_list_merge(&bios, &cache->deferred_bios); 1397 bio_list_init(&cache->deferred_bios); 1398 1399 while ((bio = bio_list_pop(&bios))) 1400 bio_endio(bio, DM_ENDIO_REQUEUE); 1401 } 1402 1403 static int more_work(struct cache *cache) 1404 { 1405 if (is_quiescing(cache)) 1406 return !list_empty(&cache->quiesced_migrations) || 1407 !list_empty(&cache->completed_migrations) || 1408 !list_empty(&cache->need_commit_migrations); 1409 else 1410 return !bio_list_empty(&cache->deferred_bios) || 1411 !bio_list_empty(&cache->deferred_flush_bios) || 1412 !bio_list_empty(&cache->deferred_writethrough_bios) || 1413 !list_empty(&cache->quiesced_migrations) || 1414 !list_empty(&cache->completed_migrations) || 1415 !list_empty(&cache->need_commit_migrations); 1416 } 1417 1418 static void do_worker(struct work_struct *ws) 1419 { 1420 struct cache *cache = container_of(ws, struct cache, worker); 1421 1422 do { 1423 if (!is_quiescing(cache)) 1424 process_deferred_bios(cache); 1425 1426 process_migrations(cache, &cache->quiesced_migrations, issue_copy); 1427 process_migrations(cache, &cache->completed_migrations, complete_migration); 1428 1429 writeback_some_dirty_blocks(cache); 1430 1431 process_deferred_writethrough_bios(cache); 1432 1433 if (commit_if_needed(cache)) { 1434 process_deferred_flush_bios(cache, false); 1435 1436 /* 1437 * FIXME: rollback metadata or just go into a 1438 * failure mode and error everything 1439 */ 1440 } else { 1441 process_deferred_flush_bios(cache, true); 1442 process_migrations(cache, &cache->need_commit_migrations, 1443 migration_success_post_commit); 1444 } 1445 } while (more_work(cache)); 1446 } 1447 1448 /* 1449 * We want to commit periodically so that not too much 1450 * unwritten metadata builds up. 1451 */ 1452 static void do_waker(struct work_struct *ws) 1453 { 1454 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); 1455 policy_tick(cache->policy); 1456 wake_worker(cache); 1457 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); 1458 } 1459 1460 /*----------------------------------------------------------------*/ 1461 1462 static int is_congested(struct dm_dev *dev, int bdi_bits) 1463 { 1464 struct request_queue *q = bdev_get_queue(dev->bdev); 1465 return bdi_congested(&q->backing_dev_info, bdi_bits); 1466 } 1467 1468 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1469 { 1470 struct cache *cache = container_of(cb, struct cache, callbacks); 1471 1472 return is_congested(cache->origin_dev, bdi_bits) || 1473 is_congested(cache->cache_dev, bdi_bits); 1474 } 1475 1476 /*---------------------------------------------------------------- 1477 * Target methods 1478 *--------------------------------------------------------------*/ 1479 1480 /* 1481 * This function gets called on the error paths of the constructor, so we 1482 * have to cope with a partially initialised struct. 1483 */ 1484 static void destroy(struct cache *cache) 1485 { 1486 unsigned i; 1487 1488 if (cache->next_migration) 1489 mempool_free(cache->next_migration, cache->migration_pool); 1490 1491 if (cache->migration_pool) 1492 mempool_destroy(cache->migration_pool); 1493 1494 if (cache->all_io_ds) 1495 dm_deferred_set_destroy(cache->all_io_ds); 1496 1497 if (cache->prison) 1498 dm_bio_prison_destroy(cache->prison); 1499 1500 if (cache->wq) 1501 destroy_workqueue(cache->wq); 1502 1503 if (cache->dirty_bitset) 1504 free_bitset(cache->dirty_bitset); 1505 1506 if (cache->discard_bitset) 1507 free_bitset(cache->discard_bitset); 1508 1509 if (cache->copier) 1510 dm_kcopyd_client_destroy(cache->copier); 1511 1512 if (cache->cmd) 1513 dm_cache_metadata_close(cache->cmd); 1514 1515 if (cache->metadata_dev) 1516 dm_put_device(cache->ti, cache->metadata_dev); 1517 1518 if (cache->origin_dev) 1519 dm_put_device(cache->ti, cache->origin_dev); 1520 1521 if (cache->cache_dev) 1522 dm_put_device(cache->ti, cache->cache_dev); 1523 1524 if (cache->policy) 1525 dm_cache_policy_destroy(cache->policy); 1526 1527 for (i = 0; i < cache->nr_ctr_args ; i++) 1528 kfree(cache->ctr_args[i]); 1529 kfree(cache->ctr_args); 1530 1531 kfree(cache); 1532 } 1533 1534 static void cache_dtr(struct dm_target *ti) 1535 { 1536 struct cache *cache = ti->private; 1537 1538 destroy(cache); 1539 } 1540 1541 static sector_t get_dev_size(struct dm_dev *dev) 1542 { 1543 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 1544 } 1545 1546 /*----------------------------------------------------------------*/ 1547 1548 /* 1549 * Construct a cache device mapping. 1550 * 1551 * cache <metadata dev> <cache dev> <origin dev> <block size> 1552 * <#feature args> [<feature arg>]* 1553 * <policy> <#policy args> [<policy arg>]* 1554 * 1555 * metadata dev : fast device holding the persistent metadata 1556 * cache dev : fast device holding cached data blocks 1557 * origin dev : slow device holding original data blocks 1558 * block size : cache unit size in sectors 1559 * 1560 * #feature args : number of feature arguments passed 1561 * feature args : writethrough. (The default is writeback.) 1562 * 1563 * policy : the replacement policy to use 1564 * #policy args : an even number of policy arguments corresponding 1565 * to key/value pairs passed to the policy 1566 * policy args : key/value pairs passed to the policy 1567 * E.g. 'sequential_threshold 1024' 1568 * See cache-policies.txt for details. 1569 * 1570 * Optional feature arguments are: 1571 * writethrough : write through caching that prohibits cache block 1572 * content from being different from origin block content. 1573 * Without this argument, the default behaviour is to write 1574 * back cache block contents later for performance reasons, 1575 * so they may differ from the corresponding origin blocks. 1576 */ 1577 struct cache_args { 1578 struct dm_target *ti; 1579 1580 struct dm_dev *metadata_dev; 1581 1582 struct dm_dev *cache_dev; 1583 sector_t cache_sectors; 1584 1585 struct dm_dev *origin_dev; 1586 sector_t origin_sectors; 1587 1588 uint32_t block_size; 1589 1590 const char *policy_name; 1591 int policy_argc; 1592 const char **policy_argv; 1593 1594 struct cache_features features; 1595 }; 1596 1597 static void destroy_cache_args(struct cache_args *ca) 1598 { 1599 if (ca->metadata_dev) 1600 dm_put_device(ca->ti, ca->metadata_dev); 1601 1602 if (ca->cache_dev) 1603 dm_put_device(ca->ti, ca->cache_dev); 1604 1605 if (ca->origin_dev) 1606 dm_put_device(ca->ti, ca->origin_dev); 1607 1608 kfree(ca); 1609 } 1610 1611 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 1612 { 1613 if (!as->argc) { 1614 *error = "Insufficient args"; 1615 return false; 1616 } 1617 1618 return true; 1619 } 1620 1621 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 1622 char **error) 1623 { 1624 int r; 1625 sector_t metadata_dev_size; 1626 char b[BDEVNAME_SIZE]; 1627 1628 if (!at_least_one_arg(as, error)) 1629 return -EINVAL; 1630 1631 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1632 &ca->metadata_dev); 1633 if (r) { 1634 *error = "Error opening metadata device"; 1635 return r; 1636 } 1637 1638 metadata_dev_size = get_dev_size(ca->metadata_dev); 1639 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 1640 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1641 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1642 1643 return 0; 1644 } 1645 1646 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 1647 char **error) 1648 { 1649 int r; 1650 1651 if (!at_least_one_arg(as, error)) 1652 return -EINVAL; 1653 1654 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1655 &ca->cache_dev); 1656 if (r) { 1657 *error = "Error opening cache device"; 1658 return r; 1659 } 1660 ca->cache_sectors = get_dev_size(ca->cache_dev); 1661 1662 return 0; 1663 } 1664 1665 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 1666 char **error) 1667 { 1668 int r; 1669 1670 if (!at_least_one_arg(as, error)) 1671 return -EINVAL; 1672 1673 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1674 &ca->origin_dev); 1675 if (r) { 1676 *error = "Error opening origin device"; 1677 return r; 1678 } 1679 1680 ca->origin_sectors = get_dev_size(ca->origin_dev); 1681 if (ca->ti->len > ca->origin_sectors) { 1682 *error = "Device size larger than cached device"; 1683 return -EINVAL; 1684 } 1685 1686 return 0; 1687 } 1688 1689 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 1690 char **error) 1691 { 1692 unsigned long block_size; 1693 1694 if (!at_least_one_arg(as, error)) 1695 return -EINVAL; 1696 1697 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || 1698 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1699 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 1700 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 1701 *error = "Invalid data block size"; 1702 return -EINVAL; 1703 } 1704 1705 if (block_size > ca->cache_sectors) { 1706 *error = "Data block size is larger than the cache device"; 1707 return -EINVAL; 1708 } 1709 1710 ca->block_size = block_size; 1711 1712 return 0; 1713 } 1714 1715 static void init_features(struct cache_features *cf) 1716 { 1717 cf->mode = CM_WRITE; 1718 cf->write_through = false; 1719 } 1720 1721 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 1722 char **error) 1723 { 1724 static struct dm_arg _args[] = { 1725 {0, 1, "Invalid number of cache feature arguments"}, 1726 }; 1727 1728 int r; 1729 unsigned argc; 1730 const char *arg; 1731 struct cache_features *cf = &ca->features; 1732 1733 init_features(cf); 1734 1735 r = dm_read_arg_group(_args, as, &argc, error); 1736 if (r) 1737 return -EINVAL; 1738 1739 while (argc--) { 1740 arg = dm_shift_arg(as); 1741 1742 if (!strcasecmp(arg, "writeback")) 1743 cf->write_through = false; 1744 1745 else if (!strcasecmp(arg, "writethrough")) 1746 cf->write_through = true; 1747 1748 else { 1749 *error = "Unrecognised cache feature requested"; 1750 return -EINVAL; 1751 } 1752 } 1753 1754 return 0; 1755 } 1756 1757 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 1758 char **error) 1759 { 1760 static struct dm_arg _args[] = { 1761 {0, 1024, "Invalid number of policy arguments"}, 1762 }; 1763 1764 int r; 1765 1766 if (!at_least_one_arg(as, error)) 1767 return -EINVAL; 1768 1769 ca->policy_name = dm_shift_arg(as); 1770 1771 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 1772 if (r) 1773 return -EINVAL; 1774 1775 ca->policy_argv = (const char **)as->argv; 1776 dm_consume_args(as, ca->policy_argc); 1777 1778 return 0; 1779 } 1780 1781 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 1782 char **error) 1783 { 1784 int r; 1785 struct dm_arg_set as; 1786 1787 as.argc = argc; 1788 as.argv = argv; 1789 1790 r = parse_metadata_dev(ca, &as, error); 1791 if (r) 1792 return r; 1793 1794 r = parse_cache_dev(ca, &as, error); 1795 if (r) 1796 return r; 1797 1798 r = parse_origin_dev(ca, &as, error); 1799 if (r) 1800 return r; 1801 1802 r = parse_block_size(ca, &as, error); 1803 if (r) 1804 return r; 1805 1806 r = parse_features(ca, &as, error); 1807 if (r) 1808 return r; 1809 1810 r = parse_policy(ca, &as, error); 1811 if (r) 1812 return r; 1813 1814 return 0; 1815 } 1816 1817 /*----------------------------------------------------------------*/ 1818 1819 static struct kmem_cache *migration_cache; 1820 1821 #define NOT_CORE_OPTION 1 1822 1823 static int process_config_option(struct cache *cache, const char *key, const char *value) 1824 { 1825 unsigned long tmp; 1826 1827 if (!strcasecmp(key, "migration_threshold")) { 1828 if (kstrtoul(value, 10, &tmp)) 1829 return -EINVAL; 1830 1831 cache->migration_threshold = tmp; 1832 return 0; 1833 } 1834 1835 return NOT_CORE_OPTION; 1836 } 1837 1838 static int set_config_value(struct cache *cache, const char *key, const char *value) 1839 { 1840 int r = process_config_option(cache, key, value); 1841 1842 if (r == NOT_CORE_OPTION) 1843 r = policy_set_config_value(cache->policy, key, value); 1844 1845 if (r) 1846 DMWARN("bad config value for %s: %s", key, value); 1847 1848 return r; 1849 } 1850 1851 static int set_config_values(struct cache *cache, int argc, const char **argv) 1852 { 1853 int r = 0; 1854 1855 if (argc & 1) { 1856 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 1857 return -EINVAL; 1858 } 1859 1860 while (argc) { 1861 r = set_config_value(cache, argv[0], argv[1]); 1862 if (r) 1863 break; 1864 1865 argc -= 2; 1866 argv += 2; 1867 } 1868 1869 return r; 1870 } 1871 1872 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 1873 char **error) 1874 { 1875 cache->policy = dm_cache_policy_create(ca->policy_name, 1876 cache->cache_size, 1877 cache->origin_sectors, 1878 cache->sectors_per_block); 1879 if (!cache->policy) { 1880 *error = "Error creating cache's policy"; 1881 return -ENOMEM; 1882 } 1883 1884 return 0; 1885 } 1886 1887 /* 1888 * We want the discard block size to be a power of two, at least the size 1889 * of the cache block size, and have no more than 2^14 discard blocks 1890 * across the origin. 1891 */ 1892 #define MAX_DISCARD_BLOCKS (1 << 14) 1893 1894 static bool too_many_discard_blocks(sector_t discard_block_size, 1895 sector_t origin_size) 1896 { 1897 (void) sector_div(origin_size, discard_block_size); 1898 1899 return origin_size > MAX_DISCARD_BLOCKS; 1900 } 1901 1902 static sector_t calculate_discard_block_size(sector_t cache_block_size, 1903 sector_t origin_size) 1904 { 1905 sector_t discard_block_size; 1906 1907 discard_block_size = roundup_pow_of_two(cache_block_size); 1908 1909 if (origin_size) 1910 while (too_many_discard_blocks(discard_block_size, origin_size)) 1911 discard_block_size *= 2; 1912 1913 return discard_block_size; 1914 } 1915 1916 #define DEFAULT_MIGRATION_THRESHOLD 2048 1917 1918 static int cache_create(struct cache_args *ca, struct cache **result) 1919 { 1920 int r = 0; 1921 char **error = &ca->ti->error; 1922 struct cache *cache; 1923 struct dm_target *ti = ca->ti; 1924 dm_block_t origin_blocks; 1925 struct dm_cache_metadata *cmd; 1926 bool may_format = ca->features.mode == CM_WRITE; 1927 1928 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 1929 if (!cache) 1930 return -ENOMEM; 1931 1932 cache->ti = ca->ti; 1933 ti->private = cache; 1934 ti->num_flush_bios = 2; 1935 ti->flush_supported = true; 1936 1937 ti->num_discard_bios = 1; 1938 ti->discards_supported = true; 1939 ti->discard_zeroes_data_unsupported = true; 1940 1941 cache->features = ca->features; 1942 ti->per_bio_data_size = get_per_bio_data_size(cache); 1943 1944 cache->callbacks.congested_fn = cache_is_congested; 1945 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 1946 1947 cache->metadata_dev = ca->metadata_dev; 1948 cache->origin_dev = ca->origin_dev; 1949 cache->cache_dev = ca->cache_dev; 1950 1951 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 1952 1953 /* FIXME: factor out this whole section */ 1954 origin_blocks = cache->origin_sectors = ca->origin_sectors; 1955 origin_blocks = block_div(origin_blocks, ca->block_size); 1956 cache->origin_blocks = to_oblock(origin_blocks); 1957 1958 cache->sectors_per_block = ca->block_size; 1959 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 1960 r = -EINVAL; 1961 goto bad; 1962 } 1963 1964 if (ca->block_size & (ca->block_size - 1)) { 1965 dm_block_t cache_size = ca->cache_sectors; 1966 1967 cache->sectors_per_block_shift = -1; 1968 cache_size = block_div(cache_size, ca->block_size); 1969 cache->cache_size = to_cblock(cache_size); 1970 } else { 1971 cache->sectors_per_block_shift = __ffs(ca->block_size); 1972 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); 1973 } 1974 1975 r = create_cache_policy(cache, ca, error); 1976 if (r) 1977 goto bad; 1978 1979 cache->policy_nr_args = ca->policy_argc; 1980 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 1981 1982 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 1983 if (r) { 1984 *error = "Error setting cache policy's config values"; 1985 goto bad; 1986 } 1987 1988 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 1989 ca->block_size, may_format, 1990 dm_cache_policy_get_hint_size(cache->policy)); 1991 if (IS_ERR(cmd)) { 1992 *error = "Error creating metadata object"; 1993 r = PTR_ERR(cmd); 1994 goto bad; 1995 } 1996 cache->cmd = cmd; 1997 1998 spin_lock_init(&cache->lock); 1999 bio_list_init(&cache->deferred_bios); 2000 bio_list_init(&cache->deferred_flush_bios); 2001 bio_list_init(&cache->deferred_writethrough_bios); 2002 INIT_LIST_HEAD(&cache->quiesced_migrations); 2003 INIT_LIST_HEAD(&cache->completed_migrations); 2004 INIT_LIST_HEAD(&cache->need_commit_migrations); 2005 atomic_set(&cache->nr_migrations, 0); 2006 init_waitqueue_head(&cache->migration_wait); 2007 2008 r = -ENOMEM; 2009 cache->nr_dirty = 0; 2010 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2011 if (!cache->dirty_bitset) { 2012 *error = "could not allocate dirty bitset"; 2013 goto bad; 2014 } 2015 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2016 2017 cache->discard_block_size = 2018 calculate_discard_block_size(cache->sectors_per_block, 2019 cache->origin_sectors); 2020 cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); 2021 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2022 if (!cache->discard_bitset) { 2023 *error = "could not allocate discard bitset"; 2024 goto bad; 2025 } 2026 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2027 2028 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2029 if (IS_ERR(cache->copier)) { 2030 *error = "could not create kcopyd client"; 2031 r = PTR_ERR(cache->copier); 2032 goto bad; 2033 } 2034 2035 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2036 if (!cache->wq) { 2037 *error = "could not create workqueue for metadata object"; 2038 goto bad; 2039 } 2040 INIT_WORK(&cache->worker, do_worker); 2041 INIT_DELAYED_WORK(&cache->waker, do_waker); 2042 cache->last_commit_jiffies = jiffies; 2043 2044 cache->prison = dm_bio_prison_create(PRISON_CELLS); 2045 if (!cache->prison) { 2046 *error = "could not create bio prison"; 2047 goto bad; 2048 } 2049 2050 cache->all_io_ds = dm_deferred_set_create(); 2051 if (!cache->all_io_ds) { 2052 *error = "could not create all_io deferred set"; 2053 goto bad; 2054 } 2055 2056 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2057 migration_cache); 2058 if (!cache->migration_pool) { 2059 *error = "Error creating cache's migration mempool"; 2060 goto bad; 2061 } 2062 2063 cache->next_migration = NULL; 2064 2065 cache->need_tick_bio = true; 2066 cache->sized = false; 2067 cache->quiescing = false; 2068 cache->commit_requested = false; 2069 cache->loaded_mappings = false; 2070 cache->loaded_discards = false; 2071 2072 load_stats(cache); 2073 2074 atomic_set(&cache->stats.demotion, 0); 2075 atomic_set(&cache->stats.promotion, 0); 2076 atomic_set(&cache->stats.copies_avoided, 0); 2077 atomic_set(&cache->stats.cache_cell_clash, 0); 2078 atomic_set(&cache->stats.commit_count, 0); 2079 atomic_set(&cache->stats.discard_count, 0); 2080 2081 *result = cache; 2082 return 0; 2083 2084 bad: 2085 destroy(cache); 2086 return r; 2087 } 2088 2089 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2090 { 2091 unsigned i; 2092 const char **copy; 2093 2094 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2095 if (!copy) 2096 return -ENOMEM; 2097 for (i = 0; i < argc; i++) { 2098 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2099 if (!copy[i]) { 2100 while (i--) 2101 kfree(copy[i]); 2102 kfree(copy); 2103 return -ENOMEM; 2104 } 2105 } 2106 2107 cache->nr_ctr_args = argc; 2108 cache->ctr_args = copy; 2109 2110 return 0; 2111 } 2112 2113 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2114 { 2115 int r = -EINVAL; 2116 struct cache_args *ca; 2117 struct cache *cache = NULL; 2118 2119 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 2120 if (!ca) { 2121 ti->error = "Error allocating memory for cache"; 2122 return -ENOMEM; 2123 } 2124 ca->ti = ti; 2125 2126 r = parse_cache_args(ca, argc, argv, &ti->error); 2127 if (r) 2128 goto out; 2129 2130 r = cache_create(ca, &cache); 2131 if (r) 2132 goto out; 2133 2134 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); 2135 if (r) { 2136 destroy(cache); 2137 goto out; 2138 } 2139 2140 ti->private = cache; 2141 2142 out: 2143 destroy_cache_args(ca); 2144 return r; 2145 } 2146 2147 static int cache_map(struct dm_target *ti, struct bio *bio) 2148 { 2149 struct cache *cache = ti->private; 2150 2151 int r; 2152 dm_oblock_t block = get_bio_block(cache, bio); 2153 size_t pb_data_size = get_per_bio_data_size(cache); 2154 bool can_migrate = false; 2155 bool discarded_block; 2156 struct dm_bio_prison_cell *cell; 2157 struct policy_result lookup_result; 2158 struct per_bio_data *pb; 2159 2160 if (from_oblock(block) > from_oblock(cache->origin_blocks)) { 2161 /* 2162 * This can only occur if the io goes to a partial block at 2163 * the end of the origin device. We don't cache these. 2164 * Just remap to the origin and carry on. 2165 */ 2166 remap_to_origin_clear_discard(cache, bio, block); 2167 return DM_MAPIO_REMAPPED; 2168 } 2169 2170 pb = init_per_bio_data(bio, pb_data_size); 2171 2172 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { 2173 defer_bio(cache, bio); 2174 return DM_MAPIO_SUBMITTED; 2175 } 2176 2177 /* 2178 * Check to see if that block is currently migrating. 2179 */ 2180 cell = alloc_prison_cell(cache); 2181 if (!cell) { 2182 defer_bio(cache, bio); 2183 return DM_MAPIO_SUBMITTED; 2184 } 2185 2186 r = bio_detain(cache, block, bio, cell, 2187 (cell_free_fn) free_prison_cell, 2188 cache, &cell); 2189 if (r) { 2190 if (r < 0) 2191 defer_bio(cache, bio); 2192 2193 return DM_MAPIO_SUBMITTED; 2194 } 2195 2196 discarded_block = is_discarded_oblock(cache, block); 2197 2198 r = policy_map(cache->policy, block, false, can_migrate, discarded_block, 2199 bio, &lookup_result); 2200 if (r == -EWOULDBLOCK) { 2201 cell_defer(cache, cell, true); 2202 return DM_MAPIO_SUBMITTED; 2203 2204 } else if (r) { 2205 DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); 2206 bio_io_error(bio); 2207 return DM_MAPIO_SUBMITTED; 2208 } 2209 2210 switch (lookup_result.op) { 2211 case POLICY_HIT: 2212 inc_hit_counter(cache, bio); 2213 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 2214 2215 if (is_writethrough_io(cache, bio, lookup_result.cblock)) 2216 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 2217 else 2218 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 2219 2220 cell_defer(cache, cell, false); 2221 break; 2222 2223 case POLICY_MISS: 2224 inc_miss_counter(cache, bio); 2225 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 2226 2227 if (pb->req_nr != 0) { 2228 /* 2229 * This is a duplicate writethrough io that is no 2230 * longer needed because the block has been demoted. 2231 */ 2232 bio_endio(bio, 0); 2233 cell_defer(cache, cell, false); 2234 return DM_MAPIO_SUBMITTED; 2235 } else { 2236 remap_to_origin_clear_discard(cache, bio, block); 2237 cell_defer(cache, cell, false); 2238 } 2239 break; 2240 2241 default: 2242 DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, 2243 (unsigned) lookup_result.op); 2244 bio_io_error(bio); 2245 return DM_MAPIO_SUBMITTED; 2246 } 2247 2248 return DM_MAPIO_REMAPPED; 2249 } 2250 2251 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) 2252 { 2253 struct cache *cache = ti->private; 2254 unsigned long flags; 2255 size_t pb_data_size = get_per_bio_data_size(cache); 2256 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 2257 2258 if (pb->tick) { 2259 policy_tick(cache->policy); 2260 2261 spin_lock_irqsave(&cache->lock, flags); 2262 cache->need_tick_bio = true; 2263 spin_unlock_irqrestore(&cache->lock, flags); 2264 } 2265 2266 check_for_quiesced_migrations(cache, pb); 2267 2268 return 0; 2269 } 2270 2271 static int write_dirty_bitset(struct cache *cache) 2272 { 2273 unsigned i, r; 2274 2275 for (i = 0; i < from_cblock(cache->cache_size); i++) { 2276 r = dm_cache_set_dirty(cache->cmd, to_cblock(i), 2277 is_dirty(cache, to_cblock(i))); 2278 if (r) 2279 return r; 2280 } 2281 2282 return 0; 2283 } 2284 2285 static int write_discard_bitset(struct cache *cache) 2286 { 2287 unsigned i, r; 2288 2289 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, 2290 cache->discard_nr_blocks); 2291 if (r) { 2292 DMERR("could not resize on-disk discard bitset"); 2293 return r; 2294 } 2295 2296 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { 2297 r = dm_cache_set_discard(cache->cmd, to_dblock(i), 2298 is_discarded(cache, to_dblock(i))); 2299 if (r) 2300 return r; 2301 } 2302 2303 return 0; 2304 } 2305 2306 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, 2307 uint32_t hint) 2308 { 2309 struct cache *cache = context; 2310 return dm_cache_save_hint(cache->cmd, cblock, hint); 2311 } 2312 2313 static int write_hints(struct cache *cache) 2314 { 2315 int r; 2316 2317 r = dm_cache_begin_hints(cache->cmd, cache->policy); 2318 if (r) { 2319 DMERR("dm_cache_begin_hints failed"); 2320 return r; 2321 } 2322 2323 r = policy_walk_mappings(cache->policy, save_hint, cache); 2324 if (r) 2325 DMERR("policy_walk_mappings failed"); 2326 2327 return r; 2328 } 2329 2330 /* 2331 * returns true on success 2332 */ 2333 static bool sync_metadata(struct cache *cache) 2334 { 2335 int r1, r2, r3, r4; 2336 2337 r1 = write_dirty_bitset(cache); 2338 if (r1) 2339 DMERR("could not write dirty bitset"); 2340 2341 r2 = write_discard_bitset(cache); 2342 if (r2) 2343 DMERR("could not write discard bitset"); 2344 2345 save_stats(cache); 2346 2347 r3 = write_hints(cache); 2348 if (r3) 2349 DMERR("could not write hints"); 2350 2351 /* 2352 * If writing the above metadata failed, we still commit, but don't 2353 * set the clean shutdown flag. This will effectively force every 2354 * dirty bit to be set on reload. 2355 */ 2356 r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3); 2357 if (r4) 2358 DMERR("could not write cache metadata. Data loss may occur."); 2359 2360 return !r1 && !r2 && !r3 && !r4; 2361 } 2362 2363 static void cache_postsuspend(struct dm_target *ti) 2364 { 2365 struct cache *cache = ti->private; 2366 2367 start_quiescing(cache); 2368 wait_for_migrations(cache); 2369 stop_worker(cache); 2370 requeue_deferred_io(cache); 2371 stop_quiescing(cache); 2372 2373 (void) sync_metadata(cache); 2374 } 2375 2376 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, 2377 bool dirty, uint32_t hint, bool hint_valid) 2378 { 2379 int r; 2380 struct cache *cache = context; 2381 2382 r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); 2383 if (r) 2384 return r; 2385 2386 if (dirty) 2387 set_dirty(cache, oblock, cblock); 2388 else 2389 clear_dirty(cache, oblock, cblock); 2390 2391 return 0; 2392 } 2393 2394 static int load_discard(void *context, sector_t discard_block_size, 2395 dm_dblock_t dblock, bool discard) 2396 { 2397 struct cache *cache = context; 2398 2399 /* FIXME: handle mis-matched block size */ 2400 2401 if (discard) 2402 set_discard(cache, dblock); 2403 else 2404 clear_discard(cache, dblock); 2405 2406 return 0; 2407 } 2408 2409 static int cache_preresume(struct dm_target *ti) 2410 { 2411 int r = 0; 2412 struct cache *cache = ti->private; 2413 sector_t actual_cache_size = get_dev_size(cache->cache_dev); 2414 (void) sector_div(actual_cache_size, cache->sectors_per_block); 2415 2416 /* 2417 * Check to see if the cache has resized. 2418 */ 2419 if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { 2420 cache->cache_size = to_cblock(actual_cache_size); 2421 2422 r = dm_cache_resize(cache->cmd, cache->cache_size); 2423 if (r) { 2424 DMERR("could not resize cache metadata"); 2425 return r; 2426 } 2427 2428 cache->sized = true; 2429 } 2430 2431 if (!cache->loaded_mappings) { 2432 r = dm_cache_load_mappings(cache->cmd, cache->policy, 2433 load_mapping, cache); 2434 if (r) { 2435 DMERR("could not load cache mappings"); 2436 return r; 2437 } 2438 2439 cache->loaded_mappings = true; 2440 } 2441 2442 if (!cache->loaded_discards) { 2443 r = dm_cache_load_discards(cache->cmd, load_discard, cache); 2444 if (r) { 2445 DMERR("could not load origin discards"); 2446 return r; 2447 } 2448 2449 cache->loaded_discards = true; 2450 } 2451 2452 return r; 2453 } 2454 2455 static void cache_resume(struct dm_target *ti) 2456 { 2457 struct cache *cache = ti->private; 2458 2459 cache->need_tick_bio = true; 2460 do_waker(&cache->waker.work); 2461 } 2462 2463 /* 2464 * Status format: 2465 * 2466 * <#used metadata blocks>/<#total metadata blocks> 2467 * <#read hits> <#read misses> <#write hits> <#write misses> 2468 * <#demotions> <#promotions> <#blocks in cache> <#dirty> 2469 * <#features> <features>* 2470 * <#core args> <core args> 2471 * <#policy args> <policy args>* 2472 */ 2473 static void cache_status(struct dm_target *ti, status_type_t type, 2474 unsigned status_flags, char *result, unsigned maxlen) 2475 { 2476 int r = 0; 2477 unsigned i; 2478 ssize_t sz = 0; 2479 dm_block_t nr_free_blocks_metadata = 0; 2480 dm_block_t nr_blocks_metadata = 0; 2481 char buf[BDEVNAME_SIZE]; 2482 struct cache *cache = ti->private; 2483 dm_cblock_t residency; 2484 2485 switch (type) { 2486 case STATUSTYPE_INFO: 2487 /* Commit to ensure statistics aren't out-of-date */ 2488 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) { 2489 r = dm_cache_commit(cache->cmd, false); 2490 if (r) 2491 DMERR("could not commit metadata for accurate status"); 2492 } 2493 2494 r = dm_cache_get_free_metadata_block_count(cache->cmd, 2495 &nr_free_blocks_metadata); 2496 if (r) { 2497 DMERR("could not get metadata free block count"); 2498 goto err; 2499 } 2500 2501 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); 2502 if (r) { 2503 DMERR("could not get metadata device size"); 2504 goto err; 2505 } 2506 2507 residency = policy_residency(cache->policy); 2508 2509 DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ", 2510 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2511 (unsigned long long)nr_blocks_metadata, 2512 (unsigned) atomic_read(&cache->stats.read_hit), 2513 (unsigned) atomic_read(&cache->stats.read_miss), 2514 (unsigned) atomic_read(&cache->stats.write_hit), 2515 (unsigned) atomic_read(&cache->stats.write_miss), 2516 (unsigned) atomic_read(&cache->stats.demotion), 2517 (unsigned) atomic_read(&cache->stats.promotion), 2518 (unsigned long long) from_cblock(residency), 2519 cache->nr_dirty); 2520 2521 if (cache->features.write_through) 2522 DMEMIT("1 writethrough "); 2523 else 2524 DMEMIT("0 "); 2525 2526 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); 2527 if (sz < maxlen) { 2528 r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); 2529 if (r) 2530 DMERR("policy_emit_config_values returned %d", r); 2531 } 2532 2533 break; 2534 2535 case STATUSTYPE_TABLE: 2536 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); 2537 DMEMIT("%s ", buf); 2538 format_dev_t(buf, cache->cache_dev->bdev->bd_dev); 2539 DMEMIT("%s ", buf); 2540 format_dev_t(buf, cache->origin_dev->bdev->bd_dev); 2541 DMEMIT("%s", buf); 2542 2543 for (i = 0; i < cache->nr_ctr_args - 1; i++) 2544 DMEMIT(" %s", cache->ctr_args[i]); 2545 if (cache->nr_ctr_args) 2546 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); 2547 } 2548 2549 return; 2550 2551 err: 2552 DMEMIT("Error"); 2553 } 2554 2555 /* 2556 * Supports <key> <value>. 2557 * 2558 * The key migration_threshold is supported by the cache target core. 2559 */ 2560 static int cache_message(struct dm_target *ti, unsigned argc, char **argv) 2561 { 2562 struct cache *cache = ti->private; 2563 2564 if (argc != 2) 2565 return -EINVAL; 2566 2567 return set_config_value(cache, argv[0], argv[1]); 2568 } 2569 2570 static int cache_iterate_devices(struct dm_target *ti, 2571 iterate_devices_callout_fn fn, void *data) 2572 { 2573 int r = 0; 2574 struct cache *cache = ti->private; 2575 2576 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); 2577 if (!r) 2578 r = fn(ti, cache->origin_dev, 0, ti->len, data); 2579 2580 return r; 2581 } 2582 2583 /* 2584 * We assume I/O is going to the origin (which is the volume 2585 * more likely to have restrictions e.g. by being striped). 2586 * (Looking up the exact location of the data would be expensive 2587 * and could always be out of date by the time the bio is submitted.) 2588 */ 2589 static int cache_bvec_merge(struct dm_target *ti, 2590 struct bvec_merge_data *bvm, 2591 struct bio_vec *biovec, int max_size) 2592 { 2593 struct cache *cache = ti->private; 2594 struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); 2595 2596 if (!q->merge_bvec_fn) 2597 return max_size; 2598 2599 bvm->bi_bdev = cache->origin_dev->bdev; 2600 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2601 } 2602 2603 static void set_discard_limits(struct cache *cache, struct queue_limits *limits) 2604 { 2605 /* 2606 * FIXME: these limits may be incompatible with the cache device 2607 */ 2608 limits->max_discard_sectors = cache->discard_block_size * 1024; 2609 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; 2610 } 2611 2612 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) 2613 { 2614 struct cache *cache = ti->private; 2615 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 2616 2617 /* 2618 * If the system-determined stacked limits are compatible with the 2619 * cache's blocksize (io_opt is a factor) do not override them. 2620 */ 2621 if (io_opt_sectors < cache->sectors_per_block || 2622 do_div(io_opt_sectors, cache->sectors_per_block)) { 2623 blk_limits_io_min(limits, 0); 2624 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); 2625 } 2626 set_discard_limits(cache, limits); 2627 } 2628 2629 /*----------------------------------------------------------------*/ 2630 2631 static struct target_type cache_target = { 2632 .name = "cache", 2633 .version = {1, 1, 1}, 2634 .module = THIS_MODULE, 2635 .ctr = cache_ctr, 2636 .dtr = cache_dtr, 2637 .map = cache_map, 2638 .end_io = cache_end_io, 2639 .postsuspend = cache_postsuspend, 2640 .preresume = cache_preresume, 2641 .resume = cache_resume, 2642 .status = cache_status, 2643 .message = cache_message, 2644 .iterate_devices = cache_iterate_devices, 2645 .merge = cache_bvec_merge, 2646 .io_hints = cache_io_hints, 2647 }; 2648 2649 static int __init dm_cache_init(void) 2650 { 2651 int r; 2652 2653 r = dm_register_target(&cache_target); 2654 if (r) { 2655 DMERR("cache target registration failed: %d", r); 2656 return r; 2657 } 2658 2659 migration_cache = KMEM_CACHE(dm_cache_migration, 0); 2660 if (!migration_cache) { 2661 dm_unregister_target(&cache_target); 2662 return -ENOMEM; 2663 } 2664 2665 return 0; 2666 } 2667 2668 static void __exit dm_cache_exit(void) 2669 { 2670 dm_unregister_target(&cache_target); 2671 kmem_cache_destroy(migration_cache); 2672 } 2673 2674 module_init(dm_cache_init); 2675 module_exit(dm_cache_exit); 2676 2677 MODULE_DESCRIPTION(DM_NAME " cache target"); 2678 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); 2679 MODULE_LICENSE("GPL"); 2680