1 /* 2 * Copyright (C) 2012 Red Hat. All rights reserved. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm.h" 8 #include "dm-bio-prison.h" 9 #include "dm-bio-record.h" 10 #include "dm-cache-metadata.h" 11 12 #include <linux/dm-io.h> 13 #include <linux/dm-kcopyd.h> 14 #include <linux/init.h> 15 #include <linux/mempool.h> 16 #include <linux/module.h> 17 #include <linux/slab.h> 18 #include <linux/vmalloc.h> 19 20 #define DM_MSG_PREFIX "cache" 21 22 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, 23 "A percentage of time allocated for copying to and/or from cache"); 24 25 /*----------------------------------------------------------------*/ 26 27 /* 28 * Glossary: 29 * 30 * oblock: index of an origin block 31 * cblock: index of a cache block 32 * promotion: movement of a block from origin to cache 33 * demotion: movement of a block from cache to origin 34 * migration: movement of a block between the origin and cache device, 35 * either direction 36 */ 37 38 /*----------------------------------------------------------------*/ 39 40 static size_t bitset_size_in_bytes(unsigned nr_entries) 41 { 42 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); 43 } 44 45 static unsigned long *alloc_bitset(unsigned nr_entries) 46 { 47 size_t s = bitset_size_in_bytes(nr_entries); 48 return vzalloc(s); 49 } 50 51 static void clear_bitset(void *bitset, unsigned nr_entries) 52 { 53 size_t s = bitset_size_in_bytes(nr_entries); 54 memset(bitset, 0, s); 55 } 56 57 static void free_bitset(unsigned long *bits) 58 { 59 vfree(bits); 60 } 61 62 /*----------------------------------------------------------------*/ 63 64 #define PRISON_CELLS 1024 65 #define MIGRATION_POOL_SIZE 128 66 #define COMMIT_PERIOD HZ 67 #define MIGRATION_COUNT_WINDOW 10 68 69 /* 70 * The block size of the device holding cache data must be >= 32KB 71 */ 72 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) 73 74 /* 75 * FIXME: the cache is read/write for the time being. 76 */ 77 enum cache_mode { 78 CM_WRITE, /* metadata may be changed */ 79 CM_READ_ONLY, /* metadata may not be changed */ 80 }; 81 82 struct cache_features { 83 enum cache_mode mode; 84 bool write_through:1; 85 }; 86 87 struct cache_stats { 88 atomic_t read_hit; 89 atomic_t read_miss; 90 atomic_t write_hit; 91 atomic_t write_miss; 92 atomic_t demotion; 93 atomic_t promotion; 94 atomic_t copies_avoided; 95 atomic_t cache_cell_clash; 96 atomic_t commit_count; 97 atomic_t discard_count; 98 }; 99 100 struct cache { 101 struct dm_target *ti; 102 struct dm_target_callbacks callbacks; 103 104 /* 105 * Metadata is written to this device. 106 */ 107 struct dm_dev *metadata_dev; 108 109 /* 110 * The slower of the two data devices. Typically a spindle. 111 */ 112 struct dm_dev *origin_dev; 113 114 /* 115 * The faster of the two data devices. Typically an SSD. 116 */ 117 struct dm_dev *cache_dev; 118 119 /* 120 * Cache features such as write-through. 121 */ 122 struct cache_features features; 123 124 /* 125 * Size of the origin device in _complete_ blocks and native sectors. 126 */ 127 dm_oblock_t origin_blocks; 128 sector_t origin_sectors; 129 130 /* 131 * Size of the cache device in blocks. 132 */ 133 dm_cblock_t cache_size; 134 135 /* 136 * Fields for converting from sectors to blocks. 
137 */ 138 uint32_t sectors_per_block; 139 int sectors_per_block_shift; 140 141 struct dm_cache_metadata *cmd; 142 143 spinlock_t lock; 144 struct bio_list deferred_bios; 145 struct bio_list deferred_flush_bios; 146 struct bio_list deferred_writethrough_bios; 147 struct list_head quiesced_migrations; 148 struct list_head completed_migrations; 149 struct list_head need_commit_migrations; 150 sector_t migration_threshold; 151 atomic_t nr_migrations; 152 wait_queue_head_t migration_wait; 153 154 /* 155 * cache_size entries, dirty if set 156 */ 157 dm_cblock_t nr_dirty; 158 unsigned long *dirty_bitset; 159 160 /* 161 * origin_blocks entries, discarded if set. 162 */ 163 uint32_t discard_block_size; /* a power of 2 times sectors per block */ 164 dm_dblock_t discard_nr_blocks; 165 unsigned long *discard_bitset; 166 167 struct dm_kcopyd_client *copier; 168 struct workqueue_struct *wq; 169 struct work_struct worker; 170 171 struct delayed_work waker; 172 unsigned long last_commit_jiffies; 173 174 struct dm_bio_prison *prison; 175 struct dm_deferred_set *all_io_ds; 176 177 mempool_t *migration_pool; 178 struct dm_cache_migration *next_migration; 179 180 struct dm_cache_policy *policy; 181 unsigned policy_nr_args; 182 183 bool need_tick_bio:1; 184 bool sized:1; 185 bool quiescing:1; 186 bool commit_requested:1; 187 bool loaded_mappings:1; 188 bool loaded_discards:1; 189 190 struct cache_stats stats; 191 192 /* 193 * Rather than reconstructing the table line for the status we just 194 * save it and regurgitate. 195 */ 196 unsigned nr_ctr_args; 197 const char **ctr_args; 198 }; 199 200 struct per_bio_data { 201 bool tick:1; 202 unsigned req_nr:2; 203 struct dm_deferred_entry *all_io_entry; 204 205 /* 206 * writethrough fields. These MUST remain at the end of this 207 * structure and the 'cache' member must be the first as it 208 * is used to determine the offset of the writethrough fields. 209 */ 210 struct cache *cache; 211 dm_cblock_t cblock; 212 bio_end_io_t *saved_bi_end_io; 213 struct dm_bio_details bio_details; 214 }; 215 216 struct dm_cache_migration { 217 struct list_head list; 218 struct cache *cache; 219 220 unsigned long start_jiffies; 221 dm_oblock_t old_oblock; 222 dm_oblock_t new_oblock; 223 dm_cblock_t cblock; 224 225 bool err:1; 226 bool writeback:1; 227 bool demote:1; 228 bool promote:1; 229 230 struct dm_bio_prison_cell *old_ocell; 231 struct dm_bio_prison_cell *new_ocell; 232 }; 233 234 /* 235 * Processing a bio in the worker thread may require these memory 236 * allocations. We prealloc to avoid deadlocks (the same worker thread 237 * frees them back to the mempool). 238 */ 239 struct prealloc { 240 struct dm_cache_migration *mg; 241 struct dm_bio_prison_cell *cell1; 242 struct dm_bio_prison_cell *cell2; 243 }; 244 245 static void wake_worker(struct cache *cache) 246 { 247 queue_work(cache->wq, &cache->worker); 248 } 249 250 /*----------------------------------------------------------------*/ 251 252 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) 253 { 254 /* FIXME: change to use a local slab. 
*/ 255 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); 256 } 257 258 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) 259 { 260 dm_bio_prison_free_cell(cache->prison, cell); 261 } 262 263 static int prealloc_data_structs(struct cache *cache, struct prealloc *p) 264 { 265 if (!p->mg) { 266 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); 267 if (!p->mg) 268 return -ENOMEM; 269 } 270 271 if (!p->cell1) { 272 p->cell1 = alloc_prison_cell(cache); 273 if (!p->cell1) 274 return -ENOMEM; 275 } 276 277 if (!p->cell2) { 278 p->cell2 = alloc_prison_cell(cache); 279 if (!p->cell2) 280 return -ENOMEM; 281 } 282 283 return 0; 284 } 285 286 static void prealloc_free_structs(struct cache *cache, struct prealloc *p) 287 { 288 if (p->cell2) 289 free_prison_cell(cache, p->cell2); 290 291 if (p->cell1) 292 free_prison_cell(cache, p->cell1); 293 294 if (p->mg) 295 mempool_free(p->mg, cache->migration_pool); 296 } 297 298 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) 299 { 300 struct dm_cache_migration *mg = p->mg; 301 302 BUG_ON(!mg); 303 p->mg = NULL; 304 305 return mg; 306 } 307 308 /* 309 * You must have a cell within the prealloc struct to return. If not this 310 * function will BUG() rather than returning NULL. 311 */ 312 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) 313 { 314 struct dm_bio_prison_cell *r = NULL; 315 316 if (p->cell1) { 317 r = p->cell1; 318 p->cell1 = NULL; 319 320 } else if (p->cell2) { 321 r = p->cell2; 322 p->cell2 = NULL; 323 } else 324 BUG(); 325 326 return r; 327 } 328 329 /* 330 * You can't have more than two cells in a prealloc struct. BUG() will be 331 * called if you try and overfill. 332 */ 333 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) 334 { 335 if (!p->cell2) 336 p->cell2 = cell; 337 338 else if (!p->cell1) 339 p->cell1 = cell; 340 341 else 342 BUG(); 343 } 344 345 /*----------------------------------------------------------------*/ 346 347 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) 348 { 349 key->virtual = 0; 350 key->dev = 0; 351 key->block = from_oblock(oblock); 352 } 353 354 /* 355 * The caller hands in a preallocated cell, and a free function for it. 356 * The cell will be freed if there's an error, or if it wasn't used because 357 * a cell with that key already exists. 
358 */ 359 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); 360 361 static int bio_detain(struct cache *cache, dm_oblock_t oblock, 362 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, 363 cell_free_fn free_fn, void *free_context, 364 struct dm_bio_prison_cell **cell_result) 365 { 366 int r; 367 struct dm_cell_key key; 368 369 build_key(oblock, &key); 370 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); 371 if (r) 372 free_fn(free_context, cell_prealloc); 373 374 return r; 375 } 376 377 static int get_cell(struct cache *cache, 378 dm_oblock_t oblock, 379 struct prealloc *structs, 380 struct dm_bio_prison_cell **cell_result) 381 { 382 int r; 383 struct dm_cell_key key; 384 struct dm_bio_prison_cell *cell_prealloc; 385 386 cell_prealloc = prealloc_get_cell(structs); 387 388 build_key(oblock, &key); 389 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); 390 if (r) 391 prealloc_put_cell(structs, cell_prealloc); 392 393 return r; 394 } 395 396 /*----------------------------------------------------------------*/ 397 398 static bool is_dirty(struct cache *cache, dm_cblock_t b) 399 { 400 return test_bit(from_cblock(b), cache->dirty_bitset); 401 } 402 403 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 404 { 405 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { 406 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1); 407 policy_set_dirty(cache->policy, oblock); 408 } 409 } 410 411 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) 412 { 413 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { 414 policy_clear_dirty(cache->policy, oblock); 415 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1); 416 if (!from_cblock(cache->nr_dirty)) 417 dm_table_event(cache->ti->table); 418 } 419 } 420 421 /*----------------------------------------------------------------*/ 422 423 static bool block_size_is_power_of_two(struct cache *cache) 424 { 425 return cache->sectors_per_block_shift >= 0; 426 } 427 428 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ 429 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 430 __always_inline 431 #endif 432 static dm_block_t block_div(dm_block_t b, uint32_t n) 433 { 434 do_div(b, n); 435 436 return b; 437 } 438 439 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) 440 { 441 uint32_t discard_blocks = cache->discard_block_size; 442 dm_block_t b = from_oblock(oblock); 443 444 if (!block_size_is_power_of_two(cache)) 445 discard_blocks = discard_blocks / cache->sectors_per_block; 446 else 447 discard_blocks >>= cache->sectors_per_block_shift; 448 449 b = block_div(b, discard_blocks); 450 451 return to_dblock(b); 452 } 453 454 static void set_discard(struct cache *cache, dm_dblock_t b) 455 { 456 unsigned long flags; 457 458 atomic_inc(&cache->stats.discard_count); 459 460 spin_lock_irqsave(&cache->lock, flags); 461 set_bit(from_dblock(b), cache->discard_bitset); 462 spin_unlock_irqrestore(&cache->lock, flags); 463 } 464 465 static void clear_discard(struct cache *cache, dm_dblock_t b) 466 { 467 unsigned long flags; 468 469 spin_lock_irqsave(&cache->lock, flags); 470 clear_bit(from_dblock(b), cache->discard_bitset); 471 spin_unlock_irqrestore(&cache->lock, flags); 472 } 473 474 static bool is_discarded(struct cache *cache, dm_dblock_t b) 475 { 476 int r; 477 unsigned long flags; 478 479 spin_lock_irqsave(&cache->lock, flags); 
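	/*
	 * cache->lock protects the discard bitset, so this read stays
	 * consistent with set_discard() and clear_discard() above.
	 */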
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static size_t get_per_bio_data_size(struct cache *cache)
{
	return cache->features.write_through ?
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; 537 } 538 539 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) 540 { 541 struct per_bio_data *pb = dm_per_bio_data(bio, data_size); 542 BUG_ON(!pb); 543 return pb; 544 } 545 546 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) 547 { 548 struct per_bio_data *pb = get_per_bio_data(bio, data_size); 549 550 pb->tick = false; 551 pb->req_nr = dm_bio_get_target_bio_nr(bio); 552 pb->all_io_entry = NULL; 553 554 return pb; 555 } 556 557 /*---------------------------------------------------------------- 558 * Remapping 559 *--------------------------------------------------------------*/ 560 static void remap_to_origin(struct cache *cache, struct bio *bio) 561 { 562 bio->bi_bdev = cache->origin_dev->bdev; 563 } 564 565 static void remap_to_cache(struct cache *cache, struct bio *bio, 566 dm_cblock_t cblock) 567 { 568 sector_t bi_sector = bio->bi_sector; 569 570 bio->bi_bdev = cache->cache_dev->bdev; 571 if (!block_size_is_power_of_two(cache)) 572 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) + 573 sector_div(bi_sector, cache->sectors_per_block); 574 else 575 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) | 576 (bi_sector & (cache->sectors_per_block - 1)); 577 } 578 579 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) 580 { 581 unsigned long flags; 582 size_t pb_data_size = get_per_bio_data_size(cache); 583 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 584 585 spin_lock_irqsave(&cache->lock, flags); 586 if (cache->need_tick_bio && 587 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) { 588 pb->tick = true; 589 cache->need_tick_bio = false; 590 } 591 spin_unlock_irqrestore(&cache->lock, flags); 592 } 593 594 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, 595 dm_oblock_t oblock) 596 { 597 check_if_tick_bio_needed(cache, bio); 598 remap_to_origin(cache, bio); 599 if (bio_data_dir(bio) == WRITE) 600 clear_discard(cache, oblock_to_dblock(cache, oblock)); 601 } 602 603 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, 604 dm_oblock_t oblock, dm_cblock_t cblock) 605 { 606 remap_to_cache(cache, bio, cblock); 607 if (bio_data_dir(bio) == WRITE) { 608 set_dirty(cache, oblock, cblock); 609 clear_discard(cache, oblock_to_dblock(cache, oblock)); 610 } 611 } 612 613 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) 614 { 615 sector_t block_nr = bio->bi_sector; 616 617 if (!block_size_is_power_of_two(cache)) 618 (void) sector_div(block_nr, cache->sectors_per_block); 619 else 620 block_nr >>= cache->sectors_per_block_shift; 621 622 return to_oblock(block_nr); 623 } 624 625 static int bio_triggers_commit(struct cache *cache, struct bio *bio) 626 { 627 return bio->bi_rw & (REQ_FLUSH | REQ_FUA); 628 } 629 630 static void issue(struct cache *cache, struct bio *bio) 631 { 632 unsigned long flags; 633 634 if (!bio_triggers_commit(cache, bio)) { 635 generic_make_request(bio); 636 return; 637 } 638 639 /* 640 * Batch together any bios that trigger commits and then issue a 641 * single commit for them in do_worker(). 
642 */ 643 spin_lock_irqsave(&cache->lock, flags); 644 cache->commit_requested = true; 645 bio_list_add(&cache->deferred_flush_bios, bio); 646 spin_unlock_irqrestore(&cache->lock, flags); 647 } 648 649 static void defer_writethrough_bio(struct cache *cache, struct bio *bio) 650 { 651 unsigned long flags; 652 653 spin_lock_irqsave(&cache->lock, flags); 654 bio_list_add(&cache->deferred_writethrough_bios, bio); 655 spin_unlock_irqrestore(&cache->lock, flags); 656 657 wake_worker(cache); 658 } 659 660 static void writethrough_endio(struct bio *bio, int err) 661 { 662 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 663 bio->bi_end_io = pb->saved_bi_end_io; 664 665 if (err) { 666 bio_endio(bio, err); 667 return; 668 } 669 670 dm_bio_restore(&pb->bio_details, bio); 671 remap_to_cache(pb->cache, bio, pb->cblock); 672 673 /* 674 * We can't issue this bio directly, since we're in interrupt 675 * context. So it gets put on a bio list for processing by the 676 * worker thread. 677 */ 678 defer_writethrough_bio(pb->cache, bio); 679 } 680 681 /* 682 * When running in writethrough mode we need to send writes to clean blocks 683 * to both the cache and origin devices. In future we'd like to clone the 684 * bio and send them in parallel, but for now we're doing them in 685 * series as this is easier. 686 */ 687 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, 688 dm_oblock_t oblock, dm_cblock_t cblock) 689 { 690 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); 691 692 pb->cache = cache; 693 pb->cblock = cblock; 694 pb->saved_bi_end_io = bio->bi_end_io; 695 dm_bio_record(&pb->bio_details, bio); 696 bio->bi_end_io = writethrough_endio; 697 698 remap_to_origin_clear_discard(pb->cache, bio, oblock); 699 } 700 701 /*---------------------------------------------------------------- 702 * Migration processing 703 * 704 * Migration covers moving data from the origin device to the cache, or 705 * vice versa. 706 *--------------------------------------------------------------*/ 707 static void free_migration(struct dm_cache_migration *mg) 708 { 709 mempool_free(mg, mg->cache->migration_pool); 710 } 711 712 static void inc_nr_migrations(struct cache *cache) 713 { 714 atomic_inc(&cache->nr_migrations); 715 } 716 717 static void dec_nr_migrations(struct cache *cache) 718 { 719 atomic_dec(&cache->nr_migrations); 720 721 /* 722 * Wake the worker in case we're suspending the target. 723 */ 724 wake_up(&cache->migration_wait); 725 } 726 727 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, 728 bool holder) 729 { 730 (holder ? 
dm_cell_release : dm_cell_release_no_holder) 731 (cache->prison, cell, &cache->deferred_bios); 732 free_prison_cell(cache, cell); 733 } 734 735 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, 736 bool holder) 737 { 738 unsigned long flags; 739 740 spin_lock_irqsave(&cache->lock, flags); 741 __cell_defer(cache, cell, holder); 742 spin_unlock_irqrestore(&cache->lock, flags); 743 744 wake_worker(cache); 745 } 746 747 static void cleanup_migration(struct dm_cache_migration *mg) 748 { 749 dec_nr_migrations(mg->cache); 750 free_migration(mg); 751 } 752 753 static void migration_failure(struct dm_cache_migration *mg) 754 { 755 struct cache *cache = mg->cache; 756 757 if (mg->writeback) { 758 DMWARN_LIMIT("writeback failed; couldn't copy block"); 759 set_dirty(cache, mg->old_oblock, mg->cblock); 760 cell_defer(cache, mg->old_ocell, false); 761 762 } else if (mg->demote) { 763 DMWARN_LIMIT("demotion failed; couldn't copy block"); 764 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); 765 766 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); 767 if (mg->promote) 768 cell_defer(cache, mg->new_ocell, 1); 769 } else { 770 DMWARN_LIMIT("promotion failed; couldn't copy block"); 771 policy_remove_mapping(cache->policy, mg->new_oblock); 772 cell_defer(cache, mg->new_ocell, 1); 773 } 774 775 cleanup_migration(mg); 776 } 777 778 static void migration_success_pre_commit(struct dm_cache_migration *mg) 779 { 780 unsigned long flags; 781 struct cache *cache = mg->cache; 782 783 if (mg->writeback) { 784 cell_defer(cache, mg->old_ocell, false); 785 clear_dirty(cache, mg->old_oblock, mg->cblock); 786 cleanup_migration(mg); 787 return; 788 789 } else if (mg->demote) { 790 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) { 791 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata"); 792 policy_force_mapping(cache->policy, mg->new_oblock, 793 mg->old_oblock); 794 if (mg->promote) 795 cell_defer(cache, mg->new_ocell, true); 796 cleanup_migration(mg); 797 return; 798 } 799 } else { 800 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { 801 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); 802 policy_remove_mapping(cache->policy, mg->new_oblock); 803 cleanup_migration(mg); 804 return; 805 } 806 } 807 808 spin_lock_irqsave(&cache->lock, flags); 809 list_add_tail(&mg->list, &cache->need_commit_migrations); 810 cache->commit_requested = true; 811 spin_unlock_irqrestore(&cache->lock, flags); 812 } 813 814 static void migration_success_post_commit(struct dm_cache_migration *mg) 815 { 816 unsigned long flags; 817 struct cache *cache = mg->cache; 818 819 if (mg->writeback) { 820 DMWARN("writeback unexpectedly triggered commit"); 821 return; 822 823 } else if (mg->demote) { 824 cell_defer(cache, mg->old_ocell, mg->promote ? 
0 : 1); 825 826 if (mg->promote) { 827 mg->demote = false; 828 829 spin_lock_irqsave(&cache->lock, flags); 830 list_add_tail(&mg->list, &cache->quiesced_migrations); 831 spin_unlock_irqrestore(&cache->lock, flags); 832 833 } else 834 cleanup_migration(mg); 835 836 } else { 837 cell_defer(cache, mg->new_ocell, true); 838 clear_dirty(cache, mg->new_oblock, mg->cblock); 839 cleanup_migration(mg); 840 } 841 } 842 843 static void copy_complete(int read_err, unsigned long write_err, void *context) 844 { 845 unsigned long flags; 846 struct dm_cache_migration *mg = (struct dm_cache_migration *) context; 847 struct cache *cache = mg->cache; 848 849 if (read_err || write_err) 850 mg->err = true; 851 852 spin_lock_irqsave(&cache->lock, flags); 853 list_add_tail(&mg->list, &cache->completed_migrations); 854 spin_unlock_irqrestore(&cache->lock, flags); 855 856 wake_worker(cache); 857 } 858 859 static void issue_copy_real(struct dm_cache_migration *mg) 860 { 861 int r; 862 struct dm_io_region o_region, c_region; 863 struct cache *cache = mg->cache; 864 865 o_region.bdev = cache->origin_dev->bdev; 866 o_region.count = cache->sectors_per_block; 867 868 c_region.bdev = cache->cache_dev->bdev; 869 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block; 870 c_region.count = cache->sectors_per_block; 871 872 if (mg->writeback || mg->demote) { 873 /* demote */ 874 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; 875 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); 876 } else { 877 /* promote */ 878 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; 879 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); 880 } 881 882 if (r < 0) 883 migration_failure(mg); 884 } 885 886 static void avoid_copy(struct dm_cache_migration *mg) 887 { 888 atomic_inc(&mg->cache->stats.copies_avoided); 889 migration_success_pre_commit(mg); 890 } 891 892 static void issue_copy(struct dm_cache_migration *mg) 893 { 894 bool avoid; 895 struct cache *cache = mg->cache; 896 897 if (mg->writeback || mg->demote) 898 avoid = !is_dirty(cache, mg->cblock) || 899 is_discarded_oblock(cache, mg->old_oblock); 900 else 901 avoid = is_discarded_oblock(cache, mg->new_oblock); 902 903 avoid ? 
avoid_copy(mg) : issue_copy_real(mg); 904 } 905 906 static void complete_migration(struct dm_cache_migration *mg) 907 { 908 if (mg->err) 909 migration_failure(mg); 910 else 911 migration_success_pre_commit(mg); 912 } 913 914 static void process_migrations(struct cache *cache, struct list_head *head, 915 void (*fn)(struct dm_cache_migration *)) 916 { 917 unsigned long flags; 918 struct list_head list; 919 struct dm_cache_migration *mg, *tmp; 920 921 INIT_LIST_HEAD(&list); 922 spin_lock_irqsave(&cache->lock, flags); 923 list_splice_init(head, &list); 924 spin_unlock_irqrestore(&cache->lock, flags); 925 926 list_for_each_entry_safe(mg, tmp, &list, list) 927 fn(mg); 928 } 929 930 static void __queue_quiesced_migration(struct dm_cache_migration *mg) 931 { 932 list_add_tail(&mg->list, &mg->cache->quiesced_migrations); 933 } 934 935 static void queue_quiesced_migration(struct dm_cache_migration *mg) 936 { 937 unsigned long flags; 938 struct cache *cache = mg->cache; 939 940 spin_lock_irqsave(&cache->lock, flags); 941 __queue_quiesced_migration(mg); 942 spin_unlock_irqrestore(&cache->lock, flags); 943 944 wake_worker(cache); 945 } 946 947 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) 948 { 949 unsigned long flags; 950 struct dm_cache_migration *mg, *tmp; 951 952 spin_lock_irqsave(&cache->lock, flags); 953 list_for_each_entry_safe(mg, tmp, work, list) 954 __queue_quiesced_migration(mg); 955 spin_unlock_irqrestore(&cache->lock, flags); 956 957 wake_worker(cache); 958 } 959 960 static void check_for_quiesced_migrations(struct cache *cache, 961 struct per_bio_data *pb) 962 { 963 struct list_head work; 964 965 if (!pb->all_io_entry) 966 return; 967 968 INIT_LIST_HEAD(&work); 969 if (pb->all_io_entry) 970 dm_deferred_entry_dec(pb->all_io_entry, &work); 971 972 if (!list_empty(&work)) 973 queue_quiesced_migrations(cache, &work); 974 } 975 976 static void quiesce_migration(struct dm_cache_migration *mg) 977 { 978 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) 979 queue_quiesced_migration(mg); 980 } 981 982 static void promote(struct cache *cache, struct prealloc *structs, 983 dm_oblock_t oblock, dm_cblock_t cblock, 984 struct dm_bio_prison_cell *cell) 985 { 986 struct dm_cache_migration *mg = prealloc_get_migration(structs); 987 988 mg->err = false; 989 mg->writeback = false; 990 mg->demote = false; 991 mg->promote = true; 992 mg->cache = cache; 993 mg->new_oblock = oblock; 994 mg->cblock = cblock; 995 mg->old_ocell = NULL; 996 mg->new_ocell = cell; 997 mg->start_jiffies = jiffies; 998 999 inc_nr_migrations(cache); 1000 quiesce_migration(mg); 1001 } 1002 1003 static void writeback(struct cache *cache, struct prealloc *structs, 1004 dm_oblock_t oblock, dm_cblock_t cblock, 1005 struct dm_bio_prison_cell *cell) 1006 { 1007 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1008 1009 mg->err = false; 1010 mg->writeback = true; 1011 mg->demote = false; 1012 mg->promote = false; 1013 mg->cache = cache; 1014 mg->old_oblock = oblock; 1015 mg->cblock = cblock; 1016 mg->old_ocell = cell; 1017 mg->new_ocell = NULL; 1018 mg->start_jiffies = jiffies; 1019 1020 inc_nr_migrations(cache); 1021 quiesce_migration(mg); 1022 } 1023 1024 static void demote_then_promote(struct cache *cache, struct prealloc *structs, 1025 dm_oblock_t old_oblock, dm_oblock_t new_oblock, 1026 dm_cblock_t cblock, 1027 struct dm_bio_prison_cell *old_ocell, 1028 struct dm_bio_prison_cell *new_ocell) 1029 { 1030 struct dm_cache_migration *mg = prealloc_get_migration(structs); 1031 
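	/*
	 * A single migration structure describes both halves of the
	 * operation: old_oblock is first demoted (copied back to the
	 * origin), and once that demotion has been committed the same
	 * cblock is promoted to hold new_oblock.
	 */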
	mg->err = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	/*
	 * The target requests two flush bios (ti->num_flush_bios = 2):
	 * the first (req_nr 0) is sent to the origin device, the second
	 * to the cache device.
	 */
	BUG_ON(bio->bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue(cache, bio);
}

/*
 * People generally discard large parts of a device, e.g. the whole device
 * when formatting.  Splitting these large discards up into cache block
 * sized ios and then quiescing (always necessary for discard) takes too
 * long.
 *
 * We keep it simple, and allow any size of discard to come in, and just
 * mark off blocks on the discard bitset.  No passdown occurs!
 *
 * To implement passdown we need to change the bio_prison such that a cell
 * can have a key that spans many blocks.
 */
static void process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
						  cache->discard_block_size);
	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
	dm_block_t b;

	end_block = block_div(end_block, cache->discard_block_size);

	for (b = start_block; b < end_block; b++)
		set_discard(cache, to_dblock(b));

	bio_endio(bio, 0);
}

static bool spare_migration_bandwidth(struct cache *cache)
{
	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
		cache->sectors_per_block;
	return current_volume < cache->migration_threshold;
}

static bool is_writethrough_io(struct cache *cache, struct bio *bio,
			       dm_cblock_t cblock)
{
	return bio_data_dir(bio) == WRITE &&
		cache->features.write_through && !is_dirty(cache, cblock);
}

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
1126 &cache->stats.read_miss : &cache->stats.write_miss); 1127 } 1128 1129 static void process_bio(struct cache *cache, struct prealloc *structs, 1130 struct bio *bio) 1131 { 1132 int r; 1133 bool release_cell = true; 1134 dm_oblock_t block = get_bio_block(cache, bio); 1135 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; 1136 struct policy_result lookup_result; 1137 size_t pb_data_size = get_per_bio_data_size(cache); 1138 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); 1139 bool discarded_block = is_discarded_oblock(cache, block); 1140 bool can_migrate = discarded_block || spare_migration_bandwidth(cache); 1141 1142 /* 1143 * Check to see if that block is currently migrating. 1144 */ 1145 cell_prealloc = prealloc_get_cell(structs); 1146 r = bio_detain(cache, block, bio, cell_prealloc, 1147 (cell_free_fn) prealloc_put_cell, 1148 structs, &new_ocell); 1149 if (r > 0) 1150 return; 1151 1152 r = policy_map(cache->policy, block, true, can_migrate, discarded_block, 1153 bio, &lookup_result); 1154 1155 if (r == -EWOULDBLOCK) 1156 /* migration has been denied */ 1157 lookup_result.op = POLICY_MISS; 1158 1159 switch (lookup_result.op) { 1160 case POLICY_HIT: 1161 inc_hit_counter(cache, bio); 1162 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1163 1164 if (is_writethrough_io(cache, bio, lookup_result.cblock)) 1165 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); 1166 else 1167 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); 1168 1169 issue(cache, bio); 1170 break; 1171 1172 case POLICY_MISS: 1173 inc_miss_counter(cache, bio); 1174 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); 1175 remap_to_origin_clear_discard(cache, bio, block); 1176 issue(cache, bio); 1177 break; 1178 1179 case POLICY_NEW: 1180 atomic_inc(&cache->stats.promotion); 1181 promote(cache, structs, block, lookup_result.cblock, new_ocell); 1182 release_cell = false; 1183 break; 1184 1185 case POLICY_REPLACE: 1186 cell_prealloc = prealloc_get_cell(structs); 1187 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc, 1188 (cell_free_fn) prealloc_put_cell, 1189 structs, &old_ocell); 1190 if (r > 0) { 1191 /* 1192 * We have to be careful to avoid lock inversion of 1193 * the cells. So we back off, and wait for the 1194 * old_ocell to become free. 
1195 */ 1196 policy_force_mapping(cache->policy, block, 1197 lookup_result.old_oblock); 1198 atomic_inc(&cache->stats.cache_cell_clash); 1199 break; 1200 } 1201 atomic_inc(&cache->stats.demotion); 1202 atomic_inc(&cache->stats.promotion); 1203 1204 demote_then_promote(cache, structs, lookup_result.old_oblock, 1205 block, lookup_result.cblock, 1206 old_ocell, new_ocell); 1207 release_cell = false; 1208 break; 1209 1210 default: 1211 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__, 1212 (unsigned) lookup_result.op); 1213 bio_io_error(bio); 1214 } 1215 1216 if (release_cell) 1217 cell_defer(cache, new_ocell, false); 1218 } 1219 1220 static int need_commit_due_to_time(struct cache *cache) 1221 { 1222 return jiffies < cache->last_commit_jiffies || 1223 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; 1224 } 1225 1226 static int commit_if_needed(struct cache *cache) 1227 { 1228 if (dm_cache_changed_this_transaction(cache->cmd) && 1229 (cache->commit_requested || need_commit_due_to_time(cache))) { 1230 atomic_inc(&cache->stats.commit_count); 1231 cache->last_commit_jiffies = jiffies; 1232 cache->commit_requested = false; 1233 return dm_cache_commit(cache->cmd, false); 1234 } 1235 1236 return 0; 1237 } 1238 1239 static void process_deferred_bios(struct cache *cache) 1240 { 1241 unsigned long flags; 1242 struct bio_list bios; 1243 struct bio *bio; 1244 struct prealloc structs; 1245 1246 memset(&structs, 0, sizeof(structs)); 1247 bio_list_init(&bios); 1248 1249 spin_lock_irqsave(&cache->lock, flags); 1250 bio_list_merge(&bios, &cache->deferred_bios); 1251 bio_list_init(&cache->deferred_bios); 1252 spin_unlock_irqrestore(&cache->lock, flags); 1253 1254 while (!bio_list_empty(&bios)) { 1255 /* 1256 * If we've got no free migration structs, and processing 1257 * this bio might require one, we pause until there are some 1258 * prepared mappings to process. 1259 */ 1260 if (prealloc_data_structs(cache, &structs)) { 1261 spin_lock_irqsave(&cache->lock, flags); 1262 bio_list_merge(&cache->deferred_bios, &bios); 1263 spin_unlock_irqrestore(&cache->lock, flags); 1264 break; 1265 } 1266 1267 bio = bio_list_pop(&bios); 1268 1269 if (bio->bi_rw & REQ_FLUSH) 1270 process_flush_bio(cache, bio); 1271 else if (bio->bi_rw & REQ_DISCARD) 1272 process_discard_bio(cache, bio); 1273 else 1274 process_bio(cache, &structs, bio); 1275 } 1276 1277 prealloc_free_structs(cache, &structs); 1278 } 1279 1280 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) 1281 { 1282 unsigned long flags; 1283 struct bio_list bios; 1284 struct bio *bio; 1285 1286 bio_list_init(&bios); 1287 1288 spin_lock_irqsave(&cache->lock, flags); 1289 bio_list_merge(&bios, &cache->deferred_flush_bios); 1290 bio_list_init(&cache->deferred_flush_bios); 1291 spin_unlock_irqrestore(&cache->lock, flags); 1292 1293 while ((bio = bio_list_pop(&bios))) 1294 submit_bios ? 
generic_make_request(bio) : bio_io_error(bio); 1295 } 1296 1297 static void process_deferred_writethrough_bios(struct cache *cache) 1298 { 1299 unsigned long flags; 1300 struct bio_list bios; 1301 struct bio *bio; 1302 1303 bio_list_init(&bios); 1304 1305 spin_lock_irqsave(&cache->lock, flags); 1306 bio_list_merge(&bios, &cache->deferred_writethrough_bios); 1307 bio_list_init(&cache->deferred_writethrough_bios); 1308 spin_unlock_irqrestore(&cache->lock, flags); 1309 1310 while ((bio = bio_list_pop(&bios))) 1311 generic_make_request(bio); 1312 } 1313 1314 static void writeback_some_dirty_blocks(struct cache *cache) 1315 { 1316 int r = 0; 1317 dm_oblock_t oblock; 1318 dm_cblock_t cblock; 1319 struct prealloc structs; 1320 struct dm_bio_prison_cell *old_ocell; 1321 1322 memset(&structs, 0, sizeof(structs)); 1323 1324 while (spare_migration_bandwidth(cache)) { 1325 if (prealloc_data_structs(cache, &structs)) 1326 break; 1327 1328 r = policy_writeback_work(cache->policy, &oblock, &cblock); 1329 if (r) 1330 break; 1331 1332 r = get_cell(cache, oblock, &structs, &old_ocell); 1333 if (r) { 1334 policy_set_dirty(cache->policy, oblock); 1335 break; 1336 } 1337 1338 writeback(cache, &structs, oblock, cblock, old_ocell); 1339 } 1340 1341 prealloc_free_structs(cache, &structs); 1342 } 1343 1344 /*---------------------------------------------------------------- 1345 * Main worker loop 1346 *--------------------------------------------------------------*/ 1347 static void start_quiescing(struct cache *cache) 1348 { 1349 unsigned long flags; 1350 1351 spin_lock_irqsave(&cache->lock, flags); 1352 cache->quiescing = 1; 1353 spin_unlock_irqrestore(&cache->lock, flags); 1354 } 1355 1356 static void stop_quiescing(struct cache *cache) 1357 { 1358 unsigned long flags; 1359 1360 spin_lock_irqsave(&cache->lock, flags); 1361 cache->quiescing = 0; 1362 spin_unlock_irqrestore(&cache->lock, flags); 1363 } 1364 1365 static bool is_quiescing(struct cache *cache) 1366 { 1367 int r; 1368 unsigned long flags; 1369 1370 spin_lock_irqsave(&cache->lock, flags); 1371 r = cache->quiescing; 1372 spin_unlock_irqrestore(&cache->lock, flags); 1373 1374 return r; 1375 } 1376 1377 static void wait_for_migrations(struct cache *cache) 1378 { 1379 wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); 1380 } 1381 1382 static void stop_worker(struct cache *cache) 1383 { 1384 cancel_delayed_work(&cache->waker); 1385 flush_workqueue(cache->wq); 1386 } 1387 1388 static void requeue_deferred_io(struct cache *cache) 1389 { 1390 struct bio *bio; 1391 struct bio_list bios; 1392 1393 bio_list_init(&bios); 1394 bio_list_merge(&bios, &cache->deferred_bios); 1395 bio_list_init(&cache->deferred_bios); 1396 1397 while ((bio = bio_list_pop(&bios))) 1398 bio_endio(bio, DM_ENDIO_REQUEUE); 1399 } 1400 1401 static int more_work(struct cache *cache) 1402 { 1403 if (is_quiescing(cache)) 1404 return !list_empty(&cache->quiesced_migrations) || 1405 !list_empty(&cache->completed_migrations) || 1406 !list_empty(&cache->need_commit_migrations); 1407 else 1408 return !bio_list_empty(&cache->deferred_bios) || 1409 !bio_list_empty(&cache->deferred_flush_bios) || 1410 !bio_list_empty(&cache->deferred_writethrough_bios) || 1411 !list_empty(&cache->quiesced_migrations) || 1412 !list_empty(&cache->completed_migrations) || 1413 !list_empty(&cache->need_commit_migrations); 1414 } 1415 1416 static void do_worker(struct work_struct *ws) 1417 { 1418 struct cache *cache = container_of(ws, struct cache, worker); 1419 1420 do { 1421 if (!is_quiescing(cache)) 
			process_deferred_bios(cache);

		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
		process_migrations(cache, &cache->completed_migrations, complete_migration);

		writeback_some_dirty_blocks(cache);

		process_deferred_writethrough_bios(cache);

		if (commit_if_needed(cache)) {
			process_deferred_flush_bios(cache, false);

			/*
			 * FIXME: rollback metadata or just go into a
			 * failure mode and error everything
			 */
		} else {
			process_deferred_flush_bios(cache, true);
			process_migrations(cache, &cache->need_commit_migrations,
					   migration_success_post_commit);
		}
	} while (more_work(cache));
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
	policy_tick(cache->policy);
	wake_worker(cache);
	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static int is_congested(struct dm_dev *dev, int bdi_bits)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);
	return bdi_congested(&q->backing_dev_info, bdi_bits);
}

static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct cache *cache = container_of(cb, struct cache, callbacks);

	return is_congested(cache->origin_dev, bdi_bits) ||
		is_congested(cache->cache_dev, bdi_bits);
}

/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/

/*
 * This function gets called on the error paths of the constructor, so we
 * have to cope with a partially initialised struct.
1481 */ 1482 static void destroy(struct cache *cache) 1483 { 1484 unsigned i; 1485 1486 if (cache->next_migration) 1487 mempool_free(cache->next_migration, cache->migration_pool); 1488 1489 if (cache->migration_pool) 1490 mempool_destroy(cache->migration_pool); 1491 1492 if (cache->all_io_ds) 1493 dm_deferred_set_destroy(cache->all_io_ds); 1494 1495 if (cache->prison) 1496 dm_bio_prison_destroy(cache->prison); 1497 1498 if (cache->wq) 1499 destroy_workqueue(cache->wq); 1500 1501 if (cache->dirty_bitset) 1502 free_bitset(cache->dirty_bitset); 1503 1504 if (cache->discard_bitset) 1505 free_bitset(cache->discard_bitset); 1506 1507 if (cache->copier) 1508 dm_kcopyd_client_destroy(cache->copier); 1509 1510 if (cache->cmd) 1511 dm_cache_metadata_close(cache->cmd); 1512 1513 if (cache->metadata_dev) 1514 dm_put_device(cache->ti, cache->metadata_dev); 1515 1516 if (cache->origin_dev) 1517 dm_put_device(cache->ti, cache->origin_dev); 1518 1519 if (cache->cache_dev) 1520 dm_put_device(cache->ti, cache->cache_dev); 1521 1522 if (cache->policy) 1523 dm_cache_policy_destroy(cache->policy); 1524 1525 for (i = 0; i < cache->nr_ctr_args ; i++) 1526 kfree(cache->ctr_args[i]); 1527 kfree(cache->ctr_args); 1528 1529 kfree(cache); 1530 } 1531 1532 static void cache_dtr(struct dm_target *ti) 1533 { 1534 struct cache *cache = ti->private; 1535 1536 destroy(cache); 1537 } 1538 1539 static sector_t get_dev_size(struct dm_dev *dev) 1540 { 1541 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; 1542 } 1543 1544 /*----------------------------------------------------------------*/ 1545 1546 /* 1547 * Construct a cache device mapping. 1548 * 1549 * cache <metadata dev> <cache dev> <origin dev> <block size> 1550 * <#feature args> [<feature arg>]* 1551 * <policy> <#policy args> [<policy arg>]* 1552 * 1553 * metadata dev : fast device holding the persistent metadata 1554 * cache dev : fast device holding cached data blocks 1555 * origin dev : slow device holding original data blocks 1556 * block size : cache unit size in sectors 1557 * 1558 * #feature args : number of feature arguments passed 1559 * feature args : writethrough. (The default is writeback.) 1560 * 1561 * policy : the replacement policy to use 1562 * #policy args : an even number of policy arguments corresponding 1563 * to key/value pairs passed to the policy 1564 * policy args : key/value pairs passed to the policy 1565 * E.g. 'sequential_threshold 1024' 1566 * See cache-policies.txt for details. 1567 * 1568 * Optional feature arguments are: 1569 * writethrough : write through caching that prohibits cache block 1570 * content from being different from origin block content. 1571 * Without this argument, the default behaviour is to write 1572 * back cache block contents later for performance reasons, 1573 * so they may differ from the corresponding origin blocks. 
1574 */ 1575 struct cache_args { 1576 struct dm_target *ti; 1577 1578 struct dm_dev *metadata_dev; 1579 1580 struct dm_dev *cache_dev; 1581 sector_t cache_sectors; 1582 1583 struct dm_dev *origin_dev; 1584 sector_t origin_sectors; 1585 1586 uint32_t block_size; 1587 1588 const char *policy_name; 1589 int policy_argc; 1590 const char **policy_argv; 1591 1592 struct cache_features features; 1593 }; 1594 1595 static void destroy_cache_args(struct cache_args *ca) 1596 { 1597 if (ca->metadata_dev) 1598 dm_put_device(ca->ti, ca->metadata_dev); 1599 1600 if (ca->cache_dev) 1601 dm_put_device(ca->ti, ca->cache_dev); 1602 1603 if (ca->origin_dev) 1604 dm_put_device(ca->ti, ca->origin_dev); 1605 1606 kfree(ca); 1607 } 1608 1609 static bool at_least_one_arg(struct dm_arg_set *as, char **error) 1610 { 1611 if (!as->argc) { 1612 *error = "Insufficient args"; 1613 return false; 1614 } 1615 1616 return true; 1617 } 1618 1619 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, 1620 char **error) 1621 { 1622 int r; 1623 sector_t metadata_dev_size; 1624 char b[BDEVNAME_SIZE]; 1625 1626 if (!at_least_one_arg(as, error)) 1627 return -EINVAL; 1628 1629 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1630 &ca->metadata_dev); 1631 if (r) { 1632 *error = "Error opening metadata device"; 1633 return r; 1634 } 1635 1636 metadata_dev_size = get_dev_size(ca->metadata_dev); 1637 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) 1638 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1639 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1640 1641 return 0; 1642 } 1643 1644 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, 1645 char **error) 1646 { 1647 int r; 1648 1649 if (!at_least_one_arg(as, error)) 1650 return -EINVAL; 1651 1652 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1653 &ca->cache_dev); 1654 if (r) { 1655 *error = "Error opening cache device"; 1656 return r; 1657 } 1658 ca->cache_sectors = get_dev_size(ca->cache_dev); 1659 1660 return 0; 1661 } 1662 1663 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, 1664 char **error) 1665 { 1666 int r; 1667 1668 if (!at_least_one_arg(as, error)) 1669 return -EINVAL; 1670 1671 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, 1672 &ca->origin_dev); 1673 if (r) { 1674 *error = "Error opening origin device"; 1675 return r; 1676 } 1677 1678 ca->origin_sectors = get_dev_size(ca->origin_dev); 1679 if (ca->ti->len > ca->origin_sectors) { 1680 *error = "Device size larger than cached device"; 1681 return -EINVAL; 1682 } 1683 1684 return 0; 1685 } 1686 1687 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, 1688 char **error) 1689 { 1690 unsigned long tmp; 1691 1692 if (!at_least_one_arg(as, error)) 1693 return -EINVAL; 1694 1695 if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp || 1696 tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1697 tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 1698 *error = "Invalid data block size"; 1699 return -EINVAL; 1700 } 1701 1702 if (tmp > ca->cache_sectors) { 1703 *error = "Data block size is larger than the cache device"; 1704 return -EINVAL; 1705 } 1706 1707 ca->block_size = tmp; 1708 1709 return 0; 1710 } 1711 1712 static void init_features(struct cache_features *cf) 1713 { 1714 cf->mode = CM_WRITE; 1715 cf->write_through = false; 1716 } 1717 1718 static int parse_features(struct cache_args *ca, struct dm_arg_set *as, 1719 char 
**error) 1720 { 1721 static struct dm_arg _args[] = { 1722 {0, 1, "Invalid number of cache feature arguments"}, 1723 }; 1724 1725 int r; 1726 unsigned argc; 1727 const char *arg; 1728 struct cache_features *cf = &ca->features; 1729 1730 init_features(cf); 1731 1732 r = dm_read_arg_group(_args, as, &argc, error); 1733 if (r) 1734 return -EINVAL; 1735 1736 while (argc--) { 1737 arg = dm_shift_arg(as); 1738 1739 if (!strcasecmp(arg, "writeback")) 1740 cf->write_through = false; 1741 1742 else if (!strcasecmp(arg, "writethrough")) 1743 cf->write_through = true; 1744 1745 else { 1746 *error = "Unrecognised cache feature requested"; 1747 return -EINVAL; 1748 } 1749 } 1750 1751 return 0; 1752 } 1753 1754 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, 1755 char **error) 1756 { 1757 static struct dm_arg _args[] = { 1758 {0, 1024, "Invalid number of policy arguments"}, 1759 }; 1760 1761 int r; 1762 1763 if (!at_least_one_arg(as, error)) 1764 return -EINVAL; 1765 1766 ca->policy_name = dm_shift_arg(as); 1767 1768 r = dm_read_arg_group(_args, as, &ca->policy_argc, error); 1769 if (r) 1770 return -EINVAL; 1771 1772 ca->policy_argv = (const char **)as->argv; 1773 dm_consume_args(as, ca->policy_argc); 1774 1775 return 0; 1776 } 1777 1778 static int parse_cache_args(struct cache_args *ca, int argc, char **argv, 1779 char **error) 1780 { 1781 int r; 1782 struct dm_arg_set as; 1783 1784 as.argc = argc; 1785 as.argv = argv; 1786 1787 r = parse_metadata_dev(ca, &as, error); 1788 if (r) 1789 return r; 1790 1791 r = parse_cache_dev(ca, &as, error); 1792 if (r) 1793 return r; 1794 1795 r = parse_origin_dev(ca, &as, error); 1796 if (r) 1797 return r; 1798 1799 r = parse_block_size(ca, &as, error); 1800 if (r) 1801 return r; 1802 1803 r = parse_features(ca, &as, error); 1804 if (r) 1805 return r; 1806 1807 r = parse_policy(ca, &as, error); 1808 if (r) 1809 return r; 1810 1811 return 0; 1812 } 1813 1814 /*----------------------------------------------------------------*/ 1815 1816 static struct kmem_cache *migration_cache; 1817 1818 #define NOT_CORE_OPTION 1 1819 1820 static int process_config_option(struct cache *cache, const char *key, const char *value) 1821 { 1822 unsigned long tmp; 1823 1824 if (!strcasecmp(key, "migration_threshold")) { 1825 if (kstrtoul(value, 10, &tmp)) 1826 return -EINVAL; 1827 1828 cache->migration_threshold = tmp; 1829 return 0; 1830 } 1831 1832 return NOT_CORE_OPTION; 1833 } 1834 1835 static int set_config_value(struct cache *cache, const char *key, const char *value) 1836 { 1837 int r = process_config_option(cache, key, value); 1838 1839 if (r == NOT_CORE_OPTION) 1840 r = policy_set_config_value(cache->policy, key, value); 1841 1842 if (r) 1843 DMWARN("bad config value for %s: %s", key, value); 1844 1845 return r; 1846 } 1847 1848 static int set_config_values(struct cache *cache, int argc, const char **argv) 1849 { 1850 int r = 0; 1851 1852 if (argc & 1) { 1853 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); 1854 return -EINVAL; 1855 } 1856 1857 while (argc) { 1858 r = set_config_value(cache, argv[0], argv[1]); 1859 if (r) 1860 break; 1861 1862 argc -= 2; 1863 argv += 2; 1864 } 1865 1866 return r; 1867 } 1868 1869 static int create_cache_policy(struct cache *cache, struct cache_args *ca, 1870 char **error) 1871 { 1872 cache->policy = dm_cache_policy_create(ca->policy_name, 1873 cache->cache_size, 1874 cache->origin_sectors, 1875 cache->sectors_per_block); 1876 if (!cache->policy) { 1877 *error = "Error creating cache's 
policy"; 1878 return -ENOMEM; 1879 } 1880 1881 return 0; 1882 } 1883 1884 /* 1885 * We want the discard block size to be a power of two, at least the size 1886 * of the cache block size, and have no more than 2^14 discard blocks 1887 * across the origin. 1888 */ 1889 #define MAX_DISCARD_BLOCKS (1 << 14) 1890 1891 static bool too_many_discard_blocks(sector_t discard_block_size, 1892 sector_t origin_size) 1893 { 1894 (void) sector_div(origin_size, discard_block_size); 1895 1896 return origin_size > MAX_DISCARD_BLOCKS; 1897 } 1898 1899 static sector_t calculate_discard_block_size(sector_t cache_block_size, 1900 sector_t origin_size) 1901 { 1902 sector_t discard_block_size; 1903 1904 discard_block_size = roundup_pow_of_two(cache_block_size); 1905 1906 if (origin_size) 1907 while (too_many_discard_blocks(discard_block_size, origin_size)) 1908 discard_block_size *= 2; 1909 1910 return discard_block_size; 1911 } 1912 1913 #define DEFAULT_MIGRATION_THRESHOLD 2048 1914 1915 static int cache_create(struct cache_args *ca, struct cache **result) 1916 { 1917 int r = 0; 1918 char **error = &ca->ti->error; 1919 struct cache *cache; 1920 struct dm_target *ti = ca->ti; 1921 dm_block_t origin_blocks; 1922 struct dm_cache_metadata *cmd; 1923 bool may_format = ca->features.mode == CM_WRITE; 1924 1925 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 1926 if (!cache) 1927 return -ENOMEM; 1928 1929 cache->ti = ca->ti; 1930 ti->private = cache; 1931 ti->num_flush_bios = 2; 1932 ti->flush_supported = true; 1933 1934 ti->num_discard_bios = 1; 1935 ti->discards_supported = true; 1936 ti->discard_zeroes_data_unsupported = true; 1937 1938 cache->features = ca->features; 1939 ti->per_bio_data_size = get_per_bio_data_size(cache); 1940 1941 cache->callbacks.congested_fn = cache_is_congested; 1942 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 1943 1944 cache->metadata_dev = ca->metadata_dev; 1945 cache->origin_dev = ca->origin_dev; 1946 cache->cache_dev = ca->cache_dev; 1947 1948 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 1949 1950 /* FIXME: factor out this whole section */ 1951 origin_blocks = cache->origin_sectors = ca->origin_sectors; 1952 origin_blocks = block_div(origin_blocks, ca->block_size); 1953 cache->origin_blocks = to_oblock(origin_blocks); 1954 1955 cache->sectors_per_block = ca->block_size; 1956 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 1957 r = -EINVAL; 1958 goto bad; 1959 } 1960 1961 if (ca->block_size & (ca->block_size - 1)) { 1962 dm_block_t cache_size = ca->cache_sectors; 1963 1964 cache->sectors_per_block_shift = -1; 1965 cache_size = block_div(cache_size, ca->block_size); 1966 cache->cache_size = to_cblock(cache_size); 1967 } else { 1968 cache->sectors_per_block_shift = __ffs(ca->block_size); 1969 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); 1970 } 1971 1972 r = create_cache_policy(cache, ca, error); 1973 if (r) 1974 goto bad; 1975 1976 cache->policy_nr_args = ca->policy_argc; 1977 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 1978 1979 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 1980 if (r) { 1981 *error = "Error setting cache policy's config values"; 1982 goto bad; 1983 } 1984 1985 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 1986 ca->block_size, may_format, 1987 dm_cache_policy_get_hint_size(cache->policy)); 1988 if (IS_ERR(cmd)) { 1989 *error = "Error creating metadata object"; 1990 r = PTR_ERR(cmd); 1991 goto bad; 1992 } 1993 cache->cmd = cmd; 1994 1995 
spin_lock_init(&cache->lock); 1996 bio_list_init(&cache->deferred_bios); 1997 bio_list_init(&cache->deferred_flush_bios); 1998 bio_list_init(&cache->deferred_writethrough_bios); 1999 INIT_LIST_HEAD(&cache->quiesced_migrations); 2000 INIT_LIST_HEAD(&cache->completed_migrations); 2001 INIT_LIST_HEAD(&cache->need_commit_migrations); 2002 atomic_set(&cache->nr_migrations, 0); 2003 init_waitqueue_head(&cache->migration_wait); 2004 2005 r = -ENOMEM; 2006 cache->nr_dirty = 0; 2007 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); 2008 if (!cache->dirty_bitset) { 2009 *error = "could not allocate dirty bitset"; 2010 goto bad; 2011 } 2012 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); 2013 2014 cache->discard_block_size = 2015 calculate_discard_block_size(cache->sectors_per_block, 2016 cache->origin_sectors); 2017 cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks); 2018 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); 2019 if (!cache->discard_bitset) { 2020 *error = "could not allocate discard bitset"; 2021 goto bad; 2022 } 2023 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); 2024 2025 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2026 if (IS_ERR(cache->copier)) { 2027 *error = "could not create kcopyd client"; 2028 r = PTR_ERR(cache->copier); 2029 goto bad; 2030 } 2031 2032 cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2033 if (!cache->wq) { 2034 *error = "could not create workqueue for metadata object"; 2035 goto bad; 2036 } 2037 INIT_WORK(&cache->worker, do_worker); 2038 INIT_DELAYED_WORK(&cache->waker, do_waker); 2039 cache->last_commit_jiffies = jiffies; 2040 2041 cache->prison = dm_bio_prison_create(PRISON_CELLS); 2042 if (!cache->prison) { 2043 *error = "could not create bio prison"; 2044 goto bad; 2045 } 2046 2047 cache->all_io_ds = dm_deferred_set_create(); 2048 if (!cache->all_io_ds) { 2049 *error = "could not create all_io deferred set"; 2050 goto bad; 2051 } 2052 2053 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, 2054 migration_cache); 2055 if (!cache->migration_pool) { 2056 *error = "Error creating cache's migration mempool"; 2057 goto bad; 2058 } 2059 2060 cache->next_migration = NULL; 2061 2062 cache->need_tick_bio = true; 2063 cache->sized = false; 2064 cache->quiescing = false; 2065 cache->commit_requested = false; 2066 cache->loaded_mappings = false; 2067 cache->loaded_discards = false; 2068 2069 load_stats(cache); 2070 2071 atomic_set(&cache->stats.demotion, 0); 2072 atomic_set(&cache->stats.promotion, 0); 2073 atomic_set(&cache->stats.copies_avoided, 0); 2074 atomic_set(&cache->stats.cache_cell_clash, 0); 2075 atomic_set(&cache->stats.commit_count, 0); 2076 atomic_set(&cache->stats.discard_count, 0); 2077 2078 *result = cache; 2079 return 0; 2080 2081 bad: 2082 destroy(cache); 2083 return r; 2084 } 2085 2086 static int copy_ctr_args(struct cache *cache, int argc, const char **argv) 2087 { 2088 unsigned i; 2089 const char **copy; 2090 2091 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); 2092 if (!copy) 2093 return -ENOMEM; 2094 for (i = 0; i < argc; i++) { 2095 copy[i] = kstrdup(argv[i], GFP_KERNEL); 2096 if (!copy[i]) { 2097 while (i--) 2098 kfree(copy[i]); 2099 kfree(copy); 2100 return -ENOMEM; 2101 } 2102 } 2103 2104 cache->nr_ctr_args = argc; 2105 cache->ctr_args = copy; 2106 2107 return 0; 2108 } 2109 2110 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) 2111 { 2112 int r 
static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct cache_args *ca;
	struct cache *cache = NULL;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca) {
		ti->error = "Error allocating memory for cache";
		return -ENOMEM;
	}
	ca->ti = ti;

	r = parse_cache_args(ca, argc, argv, &ti->error);
	if (r)
		goto out;

	r = cache_create(ca, &cache);
	if (r)
		goto out;

	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
	if (r) {
		destroy(cache);
		goto out;
	}

	ti->private = cache;

out:
	destroy_cache_args(ca);
	return r;
}
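
/*
 * The map function either remaps the bio itself and returns
 * DM_MAPIO_REMAPPED (the device-mapper core then submits it), or takes
 * ownership of the bio (typically by deferring it to the worker thread or
 * completing it directly) and returns DM_MAPIO_SUBMITTED.
 */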
static int cache_map(struct dm_target *ti, struct bio *bio)
{
	struct cache *cache = ti->private;

	int r;
	dm_oblock_t block = get_bio_block(cache, bio);
	size_t pb_data_size = get_per_bio_data_size(cache);
	bool can_migrate = false;
	bool discarded_block;
	struct dm_bio_prison_cell *cell;
	struct policy_result lookup_result;
	struct per_bio_data *pb;

	if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
		/*
		 * This can only occur if the io goes to a partial block at
		 * the end of the origin device.  We don't cache these.
		 * Just remap to the origin and carry on.
		 */
		remap_to_origin_clear_discard(cache, bio, block);
		return DM_MAPIO_REMAPPED;
	}

	pb = init_per_bio_data(bio, pb_data_size);

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell = alloc_prison_cell(cache);
	if (!cell) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = bio_detain(cache, block, bio, cell,
		       (cell_free_fn) free_prison_cell,
		       cache, &cell);
	if (r) {
		if (r < 0)
			defer_bio(cache, bio);

		return DM_MAPIO_SUBMITTED;
	}

	discarded_block = is_discarded_oblock(cache, block);

	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
		       bio, &lookup_result);
	if (r == -EWOULDBLOCK) {
		cell_defer(cache, cell, true);
		return DM_MAPIO_SUBMITTED;

	} else if (r) {
		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	switch (lookup_result.op) {
	case POLICY_HIT:
		inc_hit_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (is_writethrough_io(cache, bio, lookup_result.cblock))
			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
		else
			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);

		cell_defer(cache, cell, false);
		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (pb->req_nr != 0) {
			/*
			 * This is a duplicate writethrough io that is no
			 * longer needed because the block has been demoted.
			 */
			bio_endio(bio, 0);
			cell_defer(cache, cell, false);
			return DM_MAPIO_SUBMITTED;
		} else {
			remap_to_origin_clear_discard(cache, bio, block);
			cell_defer(cache, cell, false);
		}
		break;

	default:
		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	return DM_MAPIO_REMAPPED;
}

static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
	struct cache *cache = ti->private;
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (pb->tick) {
		policy_tick(cache->policy);

		spin_lock_irqsave(&cache->lock, flags);
		cache->need_tick_bio = true;
		spin_unlock_irqrestore(&cache->lock, flags);
	}

	check_for_quiesced_migrations(cache, pb);

	return 0;
}
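
/*
 * The helpers below flush the in-core dirty bits, discard bits and policy
 * hints out to the metadata device.  sync_metadata() drives them from the
 * postsuspend path before committing.
 */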
static int write_dirty_bitset(struct cache *cache)
{
	unsigned i;
	int r;

	for (i = 0; i < from_cblock(cache->cache_size); i++) {
		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				       is_dirty(cache, to_cblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int write_discard_bitset(struct cache *cache)
{
	unsigned i;
	int r;

	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
					   cache->discard_nr_blocks);
	if (r) {
		DMERR("could not resize on-disk discard bitset");
		return r;
	}

	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
					 is_discarded(cache, to_dblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
		     uint32_t hint)
{
	struct cache *cache = context;
	return dm_cache_save_hint(cache->cmd, cblock, hint);
}

static int write_hints(struct cache *cache)
{
	int r;

	r = dm_cache_begin_hints(cache->cmd, cache->policy);
	if (r) {
		DMERR("dm_cache_begin_hints failed");
		return r;
	}

	r = policy_walk_mappings(cache->policy, save_hint, cache);
	if (r)
		DMERR("policy_walk_mappings failed");

	return r;
}

/*
 * returns true on success
 */
static bool sync_metadata(struct cache *cache)
{
	int r1, r2, r3, r4;

	r1 = write_dirty_bitset(cache);
	if (r1)
		DMERR("could not write dirty bitset");

	r2 = write_discard_bitset(cache);
	if (r2)
		DMERR("could not write discard bitset");

	save_stats(cache);

	r3 = write_hints(cache);
	if (r3)
		DMERR("could not write hints");

	/*
	 * If writing the above metadata failed, we still commit, but don't
	 * set the clean shutdown flag.  This will effectively force every
	 * dirty bit to be set on reload.
	 */
	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
	if (r4)
		DMERR("could not write cache metadata.  Data loss may occur.");

	return !r1 && !r2 && !r3 && !r4;
}

static void cache_postsuspend(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	start_quiescing(cache);
	wait_for_migrations(cache);
	stop_worker(cache);
	requeue_deferred_io(cache);
	stop_quiescing(cache);

	(void) sync_metadata(cache);
}

static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
			bool dirty, uint32_t hint, bool hint_valid)
{
	int r;
	struct cache *cache = context;

	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
	if (r)
		return r;

	if (dirty)
		set_dirty(cache, oblock, cblock);
	else
		clear_dirty(cache, oblock, cblock);

	return 0;
}

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct cache *cache = context;

	/* FIXME: handle mis-matched block size */

	if (discard)
		set_discard(cache, dblock);
	else
		clear_discard(cache, dblock);

	return 0;
}

static int cache_preresume(struct dm_target *ti)
{
	int r = 0;
	struct cache *cache = ti->private;
	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
	(void) sector_div(actual_cache_size, cache->sectors_per_block);

	/*
	 * Check to see if the cache has resized.
	 */
	if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
		cache->cache_size = to_cblock(actual_cache_size);

		r = dm_cache_resize(cache->cmd, cache->cache_size);
		if (r) {
			DMERR("could not resize cache metadata");
			return r;
		}

		cache->sized = true;
	}

	if (!cache->loaded_mappings) {
		r = dm_cache_load_mappings(cache->cmd, cache->policy,
					   load_mapping, cache);
		if (r) {
			DMERR("could not load cache mappings");
			return r;
		}

		cache->loaded_mappings = true;
	}

	if (!cache->loaded_discards) {
		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
		if (r) {
			DMERR("could not load origin discards");
			return r;
		}

		cache->loaded_discards = true;
	}

	return r;
}

static void cache_resume(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	cache->need_tick_bio = true;
	do_waker(&cache->waker.work);
}

/*
 * Status format:
 *
 * <#used metadata blocks>/<#total metadata blocks>
 * <#read hits> <#read misses> <#write hits> <#write misses>
 * <#demotions> <#promotions> <#blocks in cache> <#dirty>
 * <#features> <features>*
 * <#core args> <core args>
 * <#policy args> <policy args>*
 */
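
/*
 * For illustration only, an INFO status line in the format described above
 * might look like this (all numbers are made up; the fields after the core
 * args are the policy's own key/value pairs):
 *
 *   104/4096 542 122 389 61 7 8 4096 12 1 writethrough 2 migration_threshold 2048 ...
 */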
static void cache_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	int r = 0;
	unsigned i;
	ssize_t sz = 0;
	dm_block_t nr_free_blocks_metadata = 0;
	dm_block_t nr_blocks_metadata = 0;
	char buf[BDEVNAME_SIZE];
	struct cache *cache = ti->private;
	dm_cblock_t residency;

	switch (type) {
	case STATUSTYPE_INFO:
		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
			r = dm_cache_commit(cache->cmd, false);
			if (r)
				DMERR("could not commit metadata for accurate status");
		}

		r = dm_cache_get_free_metadata_block_count(cache->cmd,
							   &nr_free_blocks_metadata);
		if (r) {
			DMERR("could not get metadata free block count");
			goto err;
		}

		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
		if (r) {
			DMERR("could not get metadata device size");
			goto err;
		}

		residency = policy_residency(cache->policy);

		DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned) atomic_read(&cache->stats.read_hit),
		       (unsigned) atomic_read(&cache->stats.read_miss),
		       (unsigned) atomic_read(&cache->stats.write_hit),
		       (unsigned) atomic_read(&cache->stats.write_miss),
		       (unsigned) atomic_read(&cache->stats.demotion),
		       (unsigned) atomic_read(&cache->stats.promotion),
		       (unsigned long long) from_cblock(residency),
		       cache->nr_dirty);

		if (cache->features.write_through)
			DMEMIT("1 writethrough ");
		else
			DMEMIT("0 ");

		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
		if (sz < maxlen) {
			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
			if (r)
				DMERR("policy_emit_config_values returned %d", r);
		}

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < cache->nr_ctr_args - 1; i++)
			DMEMIT(" %s", cache->ctr_args[i]);
		if (cache->nr_ctr_args)
			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
	}

	return;

err:
	DMEMIT("Error");
}
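
/*
 * The message interface is driven from userspace via dmsetup.  For
 * illustration only (device name and value are examples):
 *
 *   dmsetup message <mapped device> 0 migration_threshold 204800
 */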
/*
 * Supports <key> <value>.
 *
 * The key migration_threshold is supported by the cache target core.
 */
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct cache *cache = ti->private;

	if (argc != 2)
		return -EINVAL;

	return set_config_value(cache, argv[0], argv[1]);
}

static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

/*
 * We assume I/O is going to the origin (which is the volume
 * more likely to have restrictions e.g. by being striped).
 * (Looking up the exact location of the data would be expensive
 * and could always be out of date by the time the bio is submitted.)
 */
static int cache_bvec_merge(struct dm_target *ti,
			    struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct cache *cache = ti->private;
	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = cache->origin_dev->bdev;
	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = cache->discard_block_size * 1024;
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
	set_discard_limits(cache, limits);
}

/*----------------------------------------------------------------*/

static struct target_type cache_target = {
	.name = "cache",
	.version = {1, 1, 1},
	.module = THIS_MODULE,
	.ctr = cache_ctr,
	.dtr = cache_dtr,
	.map = cache_map,
	.end_io = cache_end_io,
	.postsuspend = cache_postsuspend,
	.preresume = cache_preresume,
	.resume = cache_resume,
	.status = cache_status,
	.message = cache_message,
	.iterate_devices = cache_iterate_devices,
	.merge = cache_bvec_merge,
	.io_hints = cache_io_hints,
};

static int __init dm_cache_init(void)
{
	int r;

	r = dm_register_target(&cache_target);
	if (r) {
		DMERR("cache target registration failed: %d", r);
		return r;
	}

	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
	if (!migration_cache) {
		dm_unregister_target(&cache_target);
		return -ENOMEM;
	}

	return 0;
}

static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}

module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");