/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

static size_t bitset_size_in_bytes(unsigned nr_entries)
{
	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
}

static unsigned long *alloc_bitset(unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	return vzalloc(s);
}

static void clear_bitset(void *bitset, unsigned nr_entries)
{
	size_t s = bitset_size_in_bytes(nr_entries);
	memset(bitset, 0, s);
}

static void free_bitset(unsigned long *bits)
{
	vfree(bits);
}

/*----------------------------------------------------------------*/
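
/*
 * Worked example (illustrative only, assuming 64-bit longs): a cache with
 * 10000 cache blocks needs dm_div_up(10000, 64) = 157 longs for its dirty
 * bitset, i.e. bitset_size_in_bytes() returns 157 * 8 = 1256 bytes.
 */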

#define PRISON_CELLS 1024
#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be >= 32KB
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)

/*
 * FIXME: the cache is read/write for the time being.
 */
enum cache_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
};

struct cache_features {
	enum cache_mode mode;
	bool write_through:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct dm_cache_metadata *cmd;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_writethrough_bios;
	struct list_head quiesced_migrations;
	struct list_head completed_migrations;
	struct list_head need_commit_migrations;
	sector_t migration_threshold;
	atomic_t nr_migrations;
	wait_queue_head_t migration_wait;

	/*
	 * cache_size entries, dirty if set
	 */
	dm_cblock_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	uint32_t discard_block_size; /* a power of 2 times sectors per block */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct worker;

	struct delayed_work waker;
	unsigned long last_commit_jiffies;

	struct dm_bio_prison *prison;
	struct dm_deferred_set *all_io_ds;

	mempool_t *migration_pool;
	struct dm_cache_migration *next_migration;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool quiescing:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	struct cache_stats stats;

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_deferred_entry *all_io_entry;

	/*
	 * writethrough fields.  These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offset of the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	bio_end_io_t *saved_bi_end_io;
	struct dm_bio_details bio_details;
};

struct dm_cache_migration {
	struct list_head list;
	struct cache *cache;

	unsigned long start_jiffies;
	dm_oblock_t old_oblock;
	dm_oblock_t new_oblock;
	dm_cblock_t cblock;

	bool err:1;
	bool writeback:1;
	bool demote:1;
	bool promote:1;

	struct dm_bio_prison_cell *old_ocell;
	struct dm_bio_prison_cell *new_ocell;
};

/*
 * Processing a bio in the worker thread may require these memory
 * allocations.  We prealloc to avoid deadlocks (the same worker thread
 * frees them back to the mempool).
 */
struct prealloc {
	struct dm_cache_migration *mg;
	struct dm_bio_prison_cell *cell1;
	struct dm_bio_prison_cell *cell2;
};
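
/*
 * Illustrative usage pattern (see process_deferred_bios() and
 * writeback_some_dirty_blocks() below): the worker keeps a zeroed
 * struct prealloc on its stack, tops it up with prealloc_data_structs()
 * before handling each bio so a migration and two cells are guaranteed
 * to be available, and hands anything left over back with
 * prealloc_free_structs() when it is finished.
 */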

static void wake_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
{
	/* FIXME: change to use a local slab. */
	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
{
	dm_bio_prison_free_cell(cache->prison, cell);
}

static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
{
	if (!p->mg) {
		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
		if (!p->mg)
			return -ENOMEM;
	}

	if (!p->cell1) {
		p->cell1 = alloc_prison_cell(cache);
		if (!p->cell1)
			return -ENOMEM;
	}

	if (!p->cell2) {
		p->cell2 = alloc_prison_cell(cache);
		if (!p->cell2)
			return -ENOMEM;
	}

	return 0;
}

static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
{
	if (p->cell2)
		free_prison_cell(cache, p->cell2);

	if (p->cell1)
		free_prison_cell(cache, p->cell1);

	if (p->mg)
		mempool_free(p->mg, cache->migration_pool);
}

static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
{
	struct dm_cache_migration *mg = p->mg;

	BUG_ON(!mg);
	p->mg = NULL;

	return mg;
}

/*
 * You must have a cell within the prealloc struct to return.  If not this
 * function will BUG() rather than returning NULL.
 */
static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
{
	struct dm_bio_prison_cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;

	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else
		BUG();

	return r;
}

/*
 * You can't have more than two cells in a prealloc struct.  BUG() will be
 * called if you try and overfill.
 */
static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
{
	if (!p->cell2)
		p->cell2 = cell;

	else if (!p->cell1)
		p->cell1 = cell;

	else
		BUG();
}

/*----------------------------------------------------------------*/

static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block = from_oblock(oblock);
}

/*
 * The caller hands in a preallocated cell, and a free function for it.
 * The cell will be freed if there's an error, or if it wasn't used because
 * a cell with that key already exists.
 */
typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);

static int bio_detain(struct cache *cache, dm_oblock_t oblock,
		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
		      cell_free_fn free_fn, void *free_context,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;

	build_key(oblock, &key);
	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
	if (r)
		free_fn(free_context, cell_prealloc);

	return r;
}

static int get_cell(struct cache *cache,
		    dm_oblock_t oblock,
		    struct prealloc *structs,
		    struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_cell_key key;
	struct dm_bio_prison_cell *cell_prealloc;

	cell_prealloc = prealloc_get_cell(structs);

	build_key(oblock, &key);
	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
	if (r)
		prealloc_put_cell(structs, cell_prealloc);

	return r;
}

/*----------------------------------------------------------------*/

static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
	return test_bit(from_cblock(b), cache->dirty_bitset);
}

static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
		policy_set_dirty(cache->policy, oblock);
	}
}

static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
		policy_clear_dirty(cache->policy, oblock);
		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
		if (!from_cblock(cache->nr_dirty))
			dm_table_event(cache->ti->table);
	}
}

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct cache *cache)
{
	return cache->sectors_per_block_shift >= 0;
}

static dm_block_t block_div(dm_block_t b, uint32_t n)
{
	do_div(b, n);

	return b;
}

static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
	uint32_t discard_blocks = cache->discard_block_size;
	dm_block_t b = from_oblock(oblock);

	if (!block_size_is_power_of_two(cache))
		discard_blocks = discard_blocks / cache->sectors_per_block;
	else
		discard_blocks >>= cache->sectors_per_block_shift;

	b = block_div(b, discard_blocks);

	return to_dblock(b);
}

static void set_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	atomic_inc(&cache->stats.discard_count);

	spin_lock_irqsave(&cache->lock, flags);
	set_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void clear_discard(struct cache *cache, dm_dblock_t b)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	clear_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

/*
 * If using writeback, leave out struct per_bio_data's writethrough fields.
 */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static size_t get_per_bio_data_size(struct cache *cache)
{
	return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}

static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
	struct per_bio_data *pb = get_per_bio_data(bio, data_size);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->all_io_entry = NULL;

	return pb;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio->bi_bdev = cache->origin_dev->bdev;
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_sector;

	bio->bi_bdev = cache->cache_dev->bdev;
	if (!block_size_is_power_of_two(cache))
		bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
				sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
				(bi_sector & (cache->sectors_per_block - 1));
}
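
/*
 * Worked example (illustrative): with sectors_per_block = 512 (a power of
 * two, so sectors_per_block_shift = 9), a bio at sector 1234 remapped to
 * cblock 3 lands at (3 << 9) | (1234 & 511) = 1536 + 210 = 1746 on the
 * cache device.
 */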

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	spin_lock_irqsave(&cache->lock, flags);
	if (cache->need_tick_bio &&
	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, oblock, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}

static int bio_triggers_commit(struct cache *cache, struct bio *bio)
{
	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
}

static void issue(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	if (!bio_triggers_commit(cache, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in do_worker().
	 */
	spin_lock_irqsave(&cache->lock, flags);
	cache->commit_requested = true;
	bio_list_add(&cache->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_writethrough_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void writethrough_endio(struct bio *bio, int err)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
	bio->bi_end_io = pb->saved_bi_end_io;

	if (err) {
		bio_endio(bio, err);
		return;
	}

	dm_bio_restore(&pb->bio_details, bio);
	remap_to_cache(pb->cache, bio, pb->cblock);

	/*
	 * We can't issue this bio directly, since we're in interrupt
	 * context.  So it gets put on a bio list for processing by the
	 * worker thread.
	 */
	defer_writethrough_bio(pb->cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices.  In future we'd like to clone the
 * bio and send them in parallel, but for now we're doing them in
 * series as this is easier.
 */
static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
				       dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);

	pb->cache = cache;
	pb->cblock = cblock;
	pb->saved_bi_end_io = bio->bi_end_io;
	dm_bio_record(&pb->bio_details, bio);
	bio->bi_end_io = writethrough_endio;

	remap_to_origin_clear_discard(pb->cache, bio, oblock);
}

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/
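
/*
 * Rough lifecycle of a migration (a sketch of the code below): the policy
 * triggers promote(), writeback() or demote_then_promote(); the migration
 * waits in quiesce_migration() until in-flight io to the block has drained;
 * issue_copy() then runs the kcopyd copy and copy_complete() moves it onto
 * completed_migrations, after which migration_success_pre_commit() updates
 * the on-disk mapping and migration_success_post_commit() releases the
 * cells once the commit has gone through.
 */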

static void free_migration(struct dm_cache_migration *mg)
{
	mempool_free(mg, mg->cache->migration_pool);
}

static void inc_nr_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_migrations);
}

static void dec_nr_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_migrations);

	/*
	 * Wake the worker in case we're suspending the target.
	 */
	wake_up(&cache->migration_wait);
}

static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
			 bool holder)
{
	(holder ? dm_cell_release : dm_cell_release_no_holder)
		(cache->prison, cell, &cache->deferred_bios);
	free_prison_cell(cache, cell);
}

static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
		       bool holder)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	__cell_defer(cache, cell, holder);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void cleanup_migration(struct dm_cache_migration *mg)
{
	dec_nr_migrations(mg->cache);
	free_migration(mg);
}

static void migration_failure(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN_LIMIT("writeback failed; couldn't copy block");
		set_dirty(cache, mg->old_oblock, mg->cblock);
		cell_defer(cache, mg->old_ocell, false);

	} else if (mg->demote) {
		DMWARN_LIMIT("demotion failed; couldn't copy block");
		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);

		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
		if (mg->promote)
			cell_defer(cache, mg->new_ocell, 1);
	} else {
		DMWARN_LIMIT("promotion failed; couldn't copy block");
		policy_remove_mapping(cache->policy, mg->new_oblock);
		cell_defer(cache, mg->new_ocell, 1);
	}

	cleanup_migration(mg);
}

static void migration_success_pre_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		cell_defer(cache, mg->old_ocell, false);
		clear_dirty(cache, mg->old_oblock, mg->cblock);
		cleanup_migration(mg);
		return;

	} else if (mg->demote) {
		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
			policy_force_mapping(cache->policy, mg->new_oblock,
					     mg->old_oblock);
			if (mg->promote)
				cell_defer(cache, mg->new_ocell, true);
			cleanup_migration(mg);
			return;
		}
	} else {
		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
			policy_remove_mapping(cache->policy, mg->new_oblock);
			cleanup_migration(mg);
			return;
		}
	}

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->need_commit_migrations);
	cache->commit_requested = true;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void migration_success_post_commit(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	if (mg->writeback) {
		DMWARN("writeback unexpectedly triggered commit");
		return;

	} else if (mg->demote) {
		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);

		if (mg->promote) {
			mg->demote = false;

			spin_lock_irqsave(&cache->lock, flags);
			list_add_tail(&mg->list, &cache->quiesced_migrations);
			spin_unlock_irqrestore(&cache->lock, flags);

		} else
			cleanup_migration(mg);

	} else {
		cell_defer(cache, mg->new_ocell, true);
		clear_dirty(cache, mg->new_oblock, mg->cblock);
		cleanup_migration(mg);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
	struct cache *cache = mg->cache;

	if (read_err || write_err)
		mg->err = true;

	spin_lock_irqsave(&cache->lock, flags);
	list_add_tail(&mg->list, &cache->completed_migrations);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void issue_copy_real(struct dm_cache_migration *mg)
{
	int r;
	struct dm_io_region o_region, c_region;
	struct cache *cache = mg->cache;

	o_region.bdev = cache->origin_dev->bdev;
	o_region.count = cache->sectors_per_block;

	c_region.bdev = cache->cache_dev->bdev;
	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
	c_region.count = cache->sectors_per_block;

	if (mg->writeback || mg->demote) {
		/* demote */
		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
	} else {
		/* promote */
		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
	}

	if (r < 0)
		migration_failure(mg);
}

static void avoid_copy(struct dm_cache_migration *mg)
{
	atomic_inc(&mg->cache->stats.copies_avoided);
	migration_success_pre_commit(mg);
}

static void issue_copy(struct dm_cache_migration *mg)
{
	bool avoid;
	struct cache *cache = mg->cache;

	if (mg->writeback || mg->demote)
		avoid = !is_dirty(cache, mg->cblock) ||
			is_discarded_oblock(cache, mg->old_oblock);
	else
		avoid = is_discarded_oblock(cache, mg->new_oblock);

	avoid ? avoid_copy(mg) : issue_copy_real(mg);
}

static void complete_migration(struct dm_cache_migration *mg)
{
	if (mg->err)
		migration_failure(mg);
	else
		migration_success_pre_commit(mg);
}

static void process_migrations(struct cache *cache, struct list_head *head,
			       void (*fn)(struct dm_cache_migration *))
{
	unsigned long flags;
	struct list_head list;
	struct dm_cache_migration *mg, *tmp;

	INIT_LIST_HEAD(&list);
	spin_lock_irqsave(&cache->lock, flags);
	list_splice_init(head, &list);
	spin_unlock_irqrestore(&cache->lock, flags);

	list_for_each_entry_safe(mg, tmp, &list, list)
		fn(mg);
}

static void __queue_quiesced_migration(struct dm_cache_migration *mg)
{
	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
}

static void queue_quiesced_migration(struct dm_cache_migration *mg)
{
	unsigned long flags;
	struct cache *cache = mg->cache;

	spin_lock_irqsave(&cache->lock, flags);
	__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
{
	unsigned long flags;
	struct dm_cache_migration *mg, *tmp;

	spin_lock_irqsave(&cache->lock, flags);
	list_for_each_entry_safe(mg, tmp, work, list)
		__queue_quiesced_migration(mg);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void check_for_quiesced_migrations(struct cache *cache,
					  struct per_bio_data *pb)
{
	struct list_head work;

	if (!pb->all_io_entry)
		return;

	INIT_LIST_HEAD(&work);
	if (pb->all_io_entry)
		dm_deferred_entry_dec(pb->all_io_entry, &work);

	if (!list_empty(&work))
		queue_quiesced_migrations(cache, &work);
}

static void quiesce_migration(struct dm_cache_migration *mg)
{
	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
		queue_quiesced_migration(mg);
}

static void promote(struct cache *cache, struct prealloc *structs,
		    dm_oblock_t oblock, dm_cblock_t cblock,
		    struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->writeback = false;
	mg->demote = false;
	mg->promote = true;
	mg->cache = cache;
	mg->new_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = NULL;
	mg->new_ocell = cell;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

static void writeback(struct cache *cache, struct prealloc *structs,
		      dm_oblock_t oblock, dm_cblock_t cblock,
		      struct dm_bio_prison_cell *cell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->writeback = true;
	mg->demote = false;
	mg->promote = false;
	mg->cache = cache;
	mg->old_oblock = oblock;
	mg->cblock = cblock;
	mg->old_ocell = cell;
	mg->new_ocell = NULL;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

static void demote_then_promote(struct cache *cache, struct prealloc *structs,
				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
				dm_cblock_t cblock,
				struct dm_bio_prison_cell *old_ocell,
				struct dm_bio_prison_cell *new_ocell)
{
	struct dm_cache_migration *mg = prealloc_get_migration(structs);

	mg->err = false;
	mg->writeback = false;
	mg->demote = true;
	mg->promote = true;
	mg->cache = cache;
	mg->old_oblock = old_oblock;
	mg->new_oblock = new_oblock;
	mg->cblock = cblock;
	mg->old_ocell = old_ocell;
	mg->new_ocell = new_ocell;
	mg->start_jiffies = jiffies;

	inc_nr_migrations(cache);
	quiesce_migration(mg);
}

/*----------------------------------------------------------------
 * bio processing
 *--------------------------------------------------------------*/
static void defer_bio(struct cache *cache, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irqrestore(&cache->lock, flags);

	wake_worker(cache);
}

static void process_flush_bio(struct cache *cache, struct bio *bio)
{
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	BUG_ON(bio->bi_size);
	if (!pb->req_nr)
		remap_to_origin(cache, bio);
	else
		remap_to_cache(cache, bio, 0);

	issue(cache, bio);
}

/*
 * People generally discard large parts of a device, e.g. the whole device
 * when formatting.  Splitting these large discards up into cache block
 * sized ios and then quiescing (always necessary for discard) takes too
 * long.
 *
 * We keep it simple, and allow any size of discard to come in, and just
 * mark off blocks on the discard bitset.  No passdown occurs!
 *
 * To implement passdown we need to change the bio_prison such that a cell
 * can have a key that spans many blocks.
 */
static void process_discard_bio(struct cache *cache, struct bio *bio)
{
	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
						  cache->discard_block_size);
	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
	dm_block_t b;

	end_block = block_div(end_block, cache->discard_block_size);

	for (b = start_block; b < end_block; b++)
		set_discard(cache, to_dblock(b));

	bio_endio(bio, 0);
}

static bool spare_migration_bandwidth(struct cache *cache)
{
	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
		cache->sectors_per_block;
	return current_volume < cache->migration_threshold;
}
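
/*
 * Illustrative numbers: with the default migration_threshold of 2048
 * sectors and 512-sector cache blocks, (nr_migrations + 1) * 512 reaches
 * 2048 once three migrations are in flight, so a fourth background
 * migration would be held back until one of them completes.
 */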

static bool is_writethrough_io(struct cache *cache, struct bio *bio,
			       dm_cblock_t cblock)
{
	return bio_data_dir(bio) == WRITE &&
		cache->features.write_through && !is_dirty(cache, cblock);
}

static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_hit : &cache->stats.write_hit);
}

static void inc_miss_counter(struct cache *cache, struct bio *bio)
{
	atomic_inc(bio_data_dir(bio) == READ ?
		   &cache->stats.read_miss : &cache->stats.write_miss);
}

static void process_bio(struct cache *cache, struct prealloc *structs,
			struct bio *bio)
{
	int r;
	bool release_cell = true;
	dm_oblock_t block = get_bio_block(cache, bio);
	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
	struct policy_result lookup_result;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
	bool discarded_block = is_discarded_oblock(cache, block);
	bool can_migrate = discarded_block || spare_migration_bandwidth(cache);

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell_prealloc = prealloc_get_cell(structs);
	r = bio_detain(cache, block, bio, cell_prealloc,
		       (cell_free_fn) prealloc_put_cell,
		       structs, &new_ocell);
	if (r > 0)
		return;

	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
		       bio, &lookup_result);

	if (r == -EWOULDBLOCK)
		/* migration has been denied */
		lookup_result.op = POLICY_MISS;

	switch (lookup_result.op) {
	case POLICY_HIT:
		inc_hit_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (is_writethrough_io(cache, bio, lookup_result.cblock))
			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
		else
			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);

		issue(cache, bio);
		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
		remap_to_origin_clear_discard(cache, bio, block);
		issue(cache, bio);
		break;

	case POLICY_NEW:
		atomic_inc(&cache->stats.promotion);
		promote(cache, structs, block, lookup_result.cblock, new_ocell);
		release_cell = false;
		break;

	case POLICY_REPLACE:
		cell_prealloc = prealloc_get_cell(structs);
		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
			       (cell_free_fn) prealloc_put_cell,
			       structs, &old_ocell);
		if (r > 0) {
			/*
			 * We have to be careful to avoid lock inversion of
			 * the cells.  So we back off, and wait for the
			 * old_ocell to become free.
			 */
			policy_force_mapping(cache->policy, block,
					     lookup_result.old_oblock);
			atomic_inc(&cache->stats.cache_cell_clash);
			break;
		}
		atomic_inc(&cache->stats.demotion);
		atomic_inc(&cache->stats.promotion);

		demote_then_promote(cache, structs, lookup_result.old_oblock,
				    block, lookup_result.cblock,
				    old_ocell, new_ocell);
		release_cell = false;
		break;

	default:
		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
	}

	if (release_cell)
		cell_defer(cache, new_ocell, false);
}
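
/*
 * If the metadata has changed, a commit is forced at least once per
 * COMMIT_PERIOD.  The first clause of need_commit_due_to_time() below also
 * catches jiffies wrapping past last_commit_jiffies, which would otherwise
 * postpone the periodic commit indefinitely.
 */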

static int need_commit_due_to_time(struct cache *cache)
{
	return jiffies < cache->last_commit_jiffies ||
	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
}

static int commit_if_needed(struct cache *cache)
{
	if (dm_cache_changed_this_transaction(cache->cmd) &&
	    (cache->commit_requested || need_commit_due_to_time(cache))) {
		atomic_inc(&cache->stats.commit_count);
		cache->last_commit_jiffies = jiffies;
		cache->commit_requested = false;
		return dm_cache_commit(cache->cmd, false);
	}

	return 0;
}

static void process_deferred_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;
	struct prealloc structs;

	memset(&structs, 0, sizeof(structs));
	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while (!bio_list_empty(&bios)) {
		/*
		 * If we've got no free migration structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (prealloc_data_structs(cache, &structs)) {
			spin_lock_irqsave(&cache->lock, flags);
			bio_list_merge(&cache->deferred_bios, &bios);
			spin_unlock_irqrestore(&cache->lock, flags);
			break;
		}

		bio = bio_list_pop(&bios);

		if (bio->bi_rw & REQ_FLUSH)
			process_flush_bio(cache, bio);
		else if (bio->bi_rw & REQ_DISCARD)
			process_discard_bio(cache, bio);
		else
			process_bio(cache, &structs, bio);
	}

	prealloc_free_structs(cache, &structs);
}

static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_flush_bios);
	bio_list_init(&cache->deferred_flush_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}

static void process_deferred_writethrough_bios(struct cache *cache)
{
	unsigned long flags;
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	spin_lock_irqsave(&cache->lock, flags);
	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
	bio_list_init(&cache->deferred_writethrough_bios);
	spin_unlock_irqrestore(&cache->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void writeback_some_dirty_blocks(struct cache *cache)
{
	int r = 0;
	dm_oblock_t oblock;
	dm_cblock_t cblock;
	struct prealloc structs;
	struct dm_bio_prison_cell *old_ocell;

	memset(&structs, 0, sizeof(structs));

	while (spare_migration_bandwidth(cache)) {
		if (prealloc_data_structs(cache, &structs))
			break;

		r = policy_writeback_work(cache->policy, &oblock, &cblock);
		if (r)
			break;

		r = get_cell(cache, oblock, &structs, &old_ocell);
		if (r) {
			policy_set_dirty(cache->policy, oblock);
			break;
		}

		writeback(cache, &structs, oblock, cblock, old_ocell);
	}

	prealloc_free_structs(cache, &structs);
}

/*----------------------------------------------------------------
 * Main worker loop
 *--------------------------------------------------------------*/
static void start_quiescing(struct cache *cache)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	cache->quiescing = 1;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static void stop_quiescing(struct cache *cache)
{
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	cache->quiescing = 0;
	spin_unlock_irqrestore(&cache->lock, flags);
}

static bool is_quiescing(struct cache *cache)
{
	int r;
	unsigned long flags;

	spin_lock_irqsave(&cache->lock, flags);
	r = cache->quiescing;
	spin_unlock_irqrestore(&cache->lock, flags);

	return r;
}

static void wait_for_migrations(struct cache *cache)
{
	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
}

static void stop_worker(struct cache *cache)
{
	cancel_delayed_work(&cache->waker);
	flush_workqueue(cache->wq);
}

static void requeue_deferred_io(struct cache *cache)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, &cache->deferred_bios);
	bio_list_init(&cache->deferred_bios);

	while ((bio = bio_list_pop(&bios)))
		bio_endio(bio, DM_ENDIO_REQUEUE);
}

static int more_work(struct cache *cache)
{
	if (is_quiescing(cache))
		return !list_empty(&cache->quiesced_migrations) ||
			!list_empty(&cache->completed_migrations) ||
			!list_empty(&cache->need_commit_migrations);
	else
		return !bio_list_empty(&cache->deferred_bios) ||
			!bio_list_empty(&cache->deferred_flush_bios) ||
			!bio_list_empty(&cache->deferred_writethrough_bios) ||
			!list_empty(&cache->quiesced_migrations) ||
			!list_empty(&cache->completed_migrations) ||
			!list_empty(&cache->need_commit_migrations);
}
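
/*
 * One pass of the worker (a sketch of do_worker() below): drain the
 * deferred bios, start quiesced migrations and finish completed ones, kick
 * off some background writebacks, then either commit and release the
 * batched flush bios and committed migrations, or error the flush bios if
 * the commit failed.
 */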

static void do_worker(struct work_struct *ws)
{
	struct cache *cache = container_of(ws, struct cache, worker);

	do {
		if (!is_quiescing(cache))
			process_deferred_bios(cache);

		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
		process_migrations(cache, &cache->completed_migrations, complete_migration);

		writeback_some_dirty_blocks(cache);

		process_deferred_writethrough_bios(cache);

		if (commit_if_needed(cache)) {
			process_deferred_flush_bios(cache, false);

			/*
			 * FIXME: rollback metadata or just go into a
			 * failure mode and error everything
			 */
		} else {
			process_deferred_flush_bios(cache, true);
			process_migrations(cache, &cache->need_commit_migrations,
					   migration_success_post_commit);
		}
	} while (more_work(cache));
}

/*
 * We want to commit periodically so that not too much
 * unwritten metadata builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
	policy_tick(cache->policy);
	wake_worker(cache);
	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static int is_congested(struct dm_dev *dev, int bdi_bits)
{
	struct request_queue *q = bdev_get_queue(dev->bdev);
	return bdi_congested(&q->backing_dev_info, bdi_bits);
}

static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	struct cache *cache = container_of(cb, struct cache, callbacks);

	return is_congested(cache->origin_dev, bdi_bits) ||
		is_congested(cache->cache_dev, bdi_bits);
}

/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/

/*
 * This function gets called on the error paths of the constructor, so we
 * have to cope with a partially initialised struct.
 */
static void destroy(struct cache *cache)
{
	unsigned i;

	if (cache->next_migration)
		mempool_free(cache->next_migration, cache->migration_pool);

	if (cache->migration_pool)
		mempool_destroy(cache->migration_pool);

	if (cache->all_io_ds)
		dm_deferred_set_destroy(cache->all_io_ds);

	if (cache->prison)
		dm_bio_prison_destroy(cache->prison);

	if (cache->wq)
		destroy_workqueue(cache->wq);

	if (cache->dirty_bitset)
		free_bitset(cache->dirty_bitset);

	if (cache->discard_bitset)
		free_bitset(cache->discard_bitset);

	if (cache->copier)
		dm_kcopyd_client_destroy(cache->copier);

	if (cache->cmd)
		dm_cache_metadata_close(cache->cmd);

	if (cache->metadata_dev)
		dm_put_device(cache->ti, cache->metadata_dev);

	if (cache->origin_dev)
		dm_put_device(cache->ti, cache->origin_dev);

	if (cache->cache_dev)
		dm_put_device(cache->ti, cache->cache_dev);

	if (cache->policy)
		dm_cache_policy_destroy(cache->policy);

	for (i = 0; i < cache->nr_ctr_args; i++)
		kfree(cache->ctr_args[i]);
	kfree(cache->ctr_args);

	kfree(cache);
}

static void cache_dtr(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	destroy(cache);
}

static sector_t get_dev_size(struct dm_dev *dev)
{
	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
}

/*----------------------------------------------------------------*/

/*
 * Construct a cache device mapping.
 *
 * cache <metadata dev> <cache dev> <origin dev> <block size>
 *       <#feature args> [<feature arg>]*
 *       <policy> <#policy args> [<policy arg>]*
 *
 * metadata dev    : fast device holding the persistent metadata
 * cache dev       : fast device holding cached data blocks
 * origin dev      : slow device holding original data blocks
 * block size      : cache unit size in sectors
 *
 * #feature args   : number of feature arguments passed
 * feature args    : writethrough.  (The default is writeback.)
 *
 * policy          : the replacement policy to use
 * #policy args    : an even number of policy arguments corresponding
 *                   to key/value pairs passed to the policy
 * policy args     : key/value pairs passed to the policy
 *                   E.g. 'sequential_threshold 1024'
 *                   See cache-policies.txt for details.
 *
 * Optional feature arguments are:
 *   writethrough  : write through caching that prohibits cache block
 *                   content from being different from origin block content.
 *                   Without this argument, the default behaviour is to write
 *                   back cache block contents later for performance reasons,
 *                   so they may differ from the corresponding origin blocks.
 */
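
/*
 * For illustration only (device names and the policy choice are
 * hypothetical, depending on which policy modules are loaded), a table
 * line following the format above might be:
 *
 *   cache /dev/mapper/ssd-meta /dev/mapper/ssd-blocks /dev/sdb 512 1 writethrough default 0
 *
 * i.e. 512-sector (256KB) cache blocks, the writethrough feature enabled,
 * and the default replacement policy with no policy arguments.
 */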

struct cache_args {
	struct dm_target *ti;

	struct dm_dev *metadata_dev;

	struct dm_dev *cache_dev;
	sector_t cache_sectors;

	struct dm_dev *origin_dev;
	sector_t origin_sectors;

	uint32_t block_size;

	const char *policy_name;
	int policy_argc;
	const char **policy_argv;

	struct cache_features features;
};

static void destroy_cache_args(struct cache_args *ca)
{
	if (ca->metadata_dev)
		dm_put_device(ca->ti, ca->metadata_dev);

	if (ca->cache_dev)
		dm_put_device(ca->ti, ca->cache_dev);

	if (ca->origin_dev)
		dm_put_device(ca->ti, ca->origin_dev);

	kfree(ca);
}

static bool at_least_one_arg(struct dm_arg_set *as, char **error)
{
	if (!as->argc) {
		*error = "Insufficient args";
		return false;
	}

	return true;
}

static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
			      char **error)
{
	int r;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->metadata_dev);
	if (r) {
		*error = "Error opening metadata device";
		return r;
	}

	metadata_dev_size = get_dev_size(ca->metadata_dev);
	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);

	return 0;
}

static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
			   char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->cache_dev);
	if (r) {
		*error = "Error opening cache device";
		return r;
	}
	ca->cache_sectors = get_dev_size(ca->cache_dev);

	return 0;
}

static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
			  &ca->origin_dev);
	if (r) {
		*error = "Error opening origin device";
		return r;
	}

	ca->origin_sectors = get_dev_size(ca->origin_dev);
	if (ca->ti->len > ca->origin_sectors) {
		*error = "Device size larger than cached device";
		return -EINVAL;
	}

	return 0;
}

static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
			    char **error)
{
	unsigned long tmp;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
	    tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		*error = "Invalid data block size";
		return -EINVAL;
	}

	if (tmp > ca->cache_sectors) {
		*error = "Data block size is larger than the cache device";
		return -EINVAL;
	}

	ca->block_size = tmp;

	return 0;
}
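
/*
 * For example: block sizes of 64, 192 or 1024 sectors pass the checks above
 * (any multiple of DATA_DEV_BLOCK_SIZE_MIN_SECTORS, i.e. 32KB, that fits on
 * the cache device), whereas 96 sectors would fail the alignment test.
 */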

static void init_features(struct cache_features *cf)
{
	cf->mode = CM_WRITE;
	cf->write_through = false;
}

static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
			  char **error)
{
	static struct dm_arg _args[] = {
		{0, 1, "Invalid number of cache feature arguments"},
	};

	int r;
	unsigned argc;
	const char *arg;
	struct cache_features *cf = &ca->features;

	init_features(cf);

	r = dm_read_arg_group(_args, as, &argc, error);
	if (r)
		return -EINVAL;

	while (argc--) {
		arg = dm_shift_arg(as);

		if (!strcasecmp(arg, "writeback"))
			cf->write_through = false;

		else if (!strcasecmp(arg, "writethrough"))
			cf->write_through = true;

		else {
			*error = "Unrecognised cache feature requested";
			return -EINVAL;
		}
	}

	return 0;
}

static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
			char **error)
{
	static struct dm_arg _args[] = {
		{0, 1024, "Invalid number of policy arguments"},
	};

	int r;

	if (!at_least_one_arg(as, error))
		return -EINVAL;

	ca->policy_name = dm_shift_arg(as);

	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
	if (r)
		return -EINVAL;

	ca->policy_argv = (const char **)as->argv;
	dm_consume_args(as, ca->policy_argc);

	return 0;
}

static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
			    char **error)
{
	int r;
	struct dm_arg_set as;

	as.argc = argc;
	as.argv = argv;

	r = parse_metadata_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_cache_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_origin_dev(ca, &as, error);
	if (r)
		return r;

	r = parse_block_size(ca, &as, error);
	if (r)
		return r;

	r = parse_features(ca, &as, error);
	if (r)
		return r;

	r = parse_policy(ca, &as, error);
	if (r)
		return r;

	return 0;
}

/*----------------------------------------------------------------*/

static struct kmem_cache *migration_cache;

#define NOT_CORE_OPTION 1

static int process_config_option(struct cache *cache, const char *key, const char *value)
{
	unsigned long tmp;

	if (!strcasecmp(key, "migration_threshold")) {
		if (kstrtoul(value, 10, &tmp))
			return -EINVAL;

		cache->migration_threshold = tmp;
		return 0;
	}

	return NOT_CORE_OPTION;
}

static int set_config_value(struct cache *cache, const char *key, const char *value)
{
	int r = process_config_option(cache, key, value);

	if (r == NOT_CORE_OPTION)
		r = policy_set_config_value(cache->policy, key, value);

	if (r)
		DMWARN("bad config value for %s: %s", key, value);

	return r;
}

static int set_config_values(struct cache *cache, int argc, const char **argv)
{
	int r = 0;

	if (argc & 1) {
		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
		return -EINVAL;
	}

	while (argc) {
		r = set_config_value(cache, argv[0], argv[1]);
		if (r)
			break;

		argc -= 2;
		argv += 2;
	}

	return r;
}
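
/*
 * For example (hypothetical values): the policy arguments
 * "migration_threshold 4096 sequential_threshold 1024" would set the core
 * migration_threshold here and forward sequential_threshold to the policy
 * via policy_set_config_value().
 */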
policy"; 1874 return -ENOMEM; 1875 } 1876 1877 return 0; 1878 } 1879 1880 /* 1881 * We want the discard block size to be a power of two, at least the size 1882 * of the cache block size, and have no more than 2^14 discard blocks 1883 * across the origin. 1884 */ 1885 #define MAX_DISCARD_BLOCKS (1 << 14) 1886 1887 static bool too_many_discard_blocks(sector_t discard_block_size, 1888 sector_t origin_size) 1889 { 1890 (void) sector_div(origin_size, discard_block_size); 1891 1892 return origin_size > MAX_DISCARD_BLOCKS; 1893 } 1894 1895 static sector_t calculate_discard_block_size(sector_t cache_block_size, 1896 sector_t origin_size) 1897 { 1898 sector_t discard_block_size; 1899 1900 discard_block_size = roundup_pow_of_two(cache_block_size); 1901 1902 if (origin_size) 1903 while (too_many_discard_blocks(discard_block_size, origin_size)) 1904 discard_block_size *= 2; 1905 1906 return discard_block_size; 1907 } 1908 1909 #define DEFAULT_MIGRATION_THRESHOLD 2048 1910 1911 static int cache_create(struct cache_args *ca, struct cache **result) 1912 { 1913 int r = 0; 1914 char **error = &ca->ti->error; 1915 struct cache *cache; 1916 struct dm_target *ti = ca->ti; 1917 dm_block_t origin_blocks; 1918 struct dm_cache_metadata *cmd; 1919 bool may_format = ca->features.mode == CM_WRITE; 1920 1921 cache = kzalloc(sizeof(*cache), GFP_KERNEL); 1922 if (!cache) 1923 return -ENOMEM; 1924 1925 cache->ti = ca->ti; 1926 ti->private = cache; 1927 ti->num_flush_bios = 2; 1928 ti->flush_supported = true; 1929 1930 ti->num_discard_bios = 1; 1931 ti->discards_supported = true; 1932 ti->discard_zeroes_data_unsupported = true; 1933 1934 cache->features = ca->features; 1935 ti->per_bio_data_size = get_per_bio_data_size(cache); 1936 1937 cache->callbacks.congested_fn = cache_is_congested; 1938 dm_table_add_target_callbacks(ti->table, &cache->callbacks); 1939 1940 cache->metadata_dev = ca->metadata_dev; 1941 cache->origin_dev = ca->origin_dev; 1942 cache->cache_dev = ca->cache_dev; 1943 1944 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; 1945 1946 /* FIXME: factor out this whole section */ 1947 origin_blocks = cache->origin_sectors = ca->origin_sectors; 1948 origin_blocks = block_div(origin_blocks, ca->block_size); 1949 cache->origin_blocks = to_oblock(origin_blocks); 1950 1951 cache->sectors_per_block = ca->block_size; 1952 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { 1953 r = -EINVAL; 1954 goto bad; 1955 } 1956 1957 if (ca->block_size & (ca->block_size - 1)) { 1958 dm_block_t cache_size = ca->cache_sectors; 1959 1960 cache->sectors_per_block_shift = -1; 1961 cache_size = block_div(cache_size, ca->block_size); 1962 cache->cache_size = to_cblock(cache_size); 1963 } else { 1964 cache->sectors_per_block_shift = __ffs(ca->block_size); 1965 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); 1966 } 1967 1968 r = create_cache_policy(cache, ca, error); 1969 if (r) 1970 goto bad; 1971 1972 cache->policy_nr_args = ca->policy_argc; 1973 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; 1974 1975 r = set_config_values(cache, ca->policy_argc, ca->policy_argv); 1976 if (r) { 1977 *error = "Error setting cache policy's config values"; 1978 goto bad; 1979 } 1980 1981 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, 1982 ca->block_size, may_format, 1983 dm_cache_policy_get_hint_size(cache->policy)); 1984 if (IS_ERR(cmd)) { 1985 *error = "Error creating metadata object"; 1986 r = PTR_ERR(cmd); 1987 goto bad; 1988 } 1989 cache->cmd = cmd; 1990 1991 

#define DEFAULT_MIGRATION_THRESHOLD 2048

static int cache_create(struct cache_args *ca, struct cache **result)
{
	int r = 0;
	char **error = &ca->ti->error;
	struct cache *cache;
	struct dm_target *ti = ca->ti;
	dm_block_t origin_blocks;
	struct dm_cache_metadata *cmd;
	bool may_format = ca->features.mode == CM_WRITE;

	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
	if (!cache)
		return -ENOMEM;

	cache->ti = ca->ti;
	ti->private = cache;
	ti->num_flush_bios = 2;
	ti->flush_supported = true;

	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->discard_zeroes_data_unsupported = true;

	cache->features = ca->features;
	ti->per_bio_data_size = get_per_bio_data_size(cache);

	cache->callbacks.congested_fn = cache_is_congested;
	dm_table_add_target_callbacks(ti->table, &cache->callbacks);

	cache->metadata_dev = ca->metadata_dev;
	cache->origin_dev = ca->origin_dev;
	cache->cache_dev = ca->cache_dev;

	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;

	/* FIXME: factor out this whole section */
	origin_blocks = cache->origin_sectors = ca->origin_sectors;
	origin_blocks = block_div(origin_blocks, ca->block_size);
	cache->origin_blocks = to_oblock(origin_blocks);

	cache->sectors_per_block = ca->block_size;
	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
		r = -EINVAL;
		goto bad;
	}

	if (ca->block_size & (ca->block_size - 1)) {
		dm_block_t cache_size = ca->cache_sectors;

		cache->sectors_per_block_shift = -1;
		cache_size = block_div(cache_size, ca->block_size);
		cache->cache_size = to_cblock(cache_size);
	} else {
		cache->sectors_per_block_shift = __ffs(ca->block_size);
		cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
	}

	r = create_cache_policy(cache, ca, error);
	if (r)
		goto bad;

	cache->policy_nr_args = ca->policy_argc;
	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;

	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
	if (r) {
		*error = "Error setting cache policy's config values";
		goto bad;
	}

	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
				     ca->block_size, may_format,
				     dm_cache_policy_get_hint_size(cache->policy));
	if (IS_ERR(cmd)) {
		*error = "Error creating metadata object";
		r = PTR_ERR(cmd);
		goto bad;
	}
	cache->cmd = cmd;

	spin_lock_init(&cache->lock);
	bio_list_init(&cache->deferred_bios);
	bio_list_init(&cache->deferred_flush_bios);
	bio_list_init(&cache->deferred_writethrough_bios);
	INIT_LIST_HEAD(&cache->quiesced_migrations);
	INIT_LIST_HEAD(&cache->completed_migrations);
	INIT_LIST_HEAD(&cache->need_commit_migrations);
	atomic_set(&cache->nr_migrations, 0);
	init_waitqueue_head(&cache->migration_wait);

	r = -ENOMEM;
	cache->nr_dirty = 0;
	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
	if (!cache->dirty_bitset) {
		*error = "could not allocate dirty bitset";
		goto bad;
	}
	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));

	cache->discard_block_size =
		calculate_discard_block_size(cache->sectors_per_block,
					     cache->origin_sectors);
	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
	if (!cache->discard_bitset) {
		*error = "could not allocate discard bitset";
		goto bad;
	}
	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));

	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(cache->copier)) {
		*error = "could not create kcopyd client";
		r = PTR_ERR(cache->copier);
		goto bad;
	}

	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!cache->wq) {
		*error = "could not create workqueue for metadata object";
		goto bad;
	}
	INIT_WORK(&cache->worker, do_worker);
	INIT_DELAYED_WORK(&cache->waker, do_waker);
	cache->last_commit_jiffies = jiffies;

	cache->prison = dm_bio_prison_create(PRISON_CELLS);
	if (!cache->prison) {
		*error = "could not create bio prison";
		goto bad;
	}

	cache->all_io_ds = dm_deferred_set_create();
	if (!cache->all_io_ds) {
		*error = "could not create all_io deferred set";
		goto bad;
	}

	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
							 migration_cache);
	if (!cache->migration_pool) {
		*error = "Error creating cache's migration mempool";
		goto bad;
	}

	cache->next_migration = NULL;

	cache->need_tick_bio = true;
	cache->sized = false;
	cache->quiescing = false;
	cache->commit_requested = false;
	cache->loaded_mappings = false;
	cache->loaded_discards = false;

	load_stats(cache);

	atomic_set(&cache->stats.demotion, 0);
	atomic_set(&cache->stats.promotion, 0);
	atomic_set(&cache->stats.copies_avoided, 0);
	atomic_set(&cache->stats.cache_cell_clash, 0);
	atomic_set(&cache->stats.commit_count, 0);
	atomic_set(&cache->stats.discard_count, 0);

	*result = cache;
	return 0;

bad:
	destroy(cache);
	return r;
}

static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
{
	unsigned i;
	const char **copy;

	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
	if (!copy)
		return -ENOMEM;
	for (i = 0; i < argc; i++) {
		copy[i] = kstrdup(argv[i], GFP_KERNEL);
		if (!copy[i]) {
			while (i--)
				kfree(copy[i]);
			kfree(copy);
			return -ENOMEM;
		}
	}

	cache->nr_ctr_args = argc;
	cache->ctr_args = copy;

	return 0;
}
static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct cache_args *ca;
	struct cache *cache = NULL;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca) {
		ti->error = "Error allocating memory for cache";
		return -ENOMEM;
	}
	ca->ti = ti;

	r = parse_cache_args(ca, argc, argv, &ti->error);
	if (r)
		goto out;

	r = cache_create(ca, &cache);
	if (r)
		goto out;

	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
	if (r) {
		destroy(cache);
		goto out;
	}

	ti->private = cache;

out:
	destroy_cache_args(ca);
	return r;
}

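/*
 * Fast-path mapping, called for every bio submitted to the target.
 * In outline: bios to the partial block at the end of the origin are
 * remapped straight to the origin; FLUSH/FUA/DISCARD bios are deferred
 * rather than remapped here; bios racing with an in-flight migration
 * are detained in the bio prison; otherwise the policy decides and the
 * bio is remapped to either the cache or the origin device.
 */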
static int cache_map(struct dm_target *ti, struct bio *bio)
{
	struct cache *cache = ti->private;

	int r;
	dm_oblock_t block = get_bio_block(cache, bio);
	size_t pb_data_size = get_per_bio_data_size(cache);
	bool can_migrate = false;
	bool discarded_block;
	struct dm_bio_prison_cell *cell;
	struct policy_result lookup_result;
	struct per_bio_data *pb;

	if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
		/*
		 * This can only occur if the io goes to a partial block at
		 * the end of the origin device. We don't cache these.
		 * Just remap to the origin and carry on.
		 */
		remap_to_origin_clear_discard(cache, bio, block);
		return DM_MAPIO_REMAPPED;
	}

	pb = init_per_bio_data(bio, pb_data_size);

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell = alloc_prison_cell(cache);
	if (!cell) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = bio_detain(cache, block, bio, cell,
		       (cell_free_fn) free_prison_cell,
		       cache, &cell);
	if (r) {
		if (r < 0)
			defer_bio(cache, bio);

		return DM_MAPIO_SUBMITTED;
	}

	discarded_block = is_discarded_oblock(cache, block);

	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
		       bio, &lookup_result);
	if (r == -EWOULDBLOCK) {
		cell_defer(cache, cell, true);
		return DM_MAPIO_SUBMITTED;

	} else if (r) {
		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	switch (lookup_result.op) {
	case POLICY_HIT:
		inc_hit_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (is_writethrough_io(cache, bio, lookup_result.cblock))
			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
		else
			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);

		cell_defer(cache, cell, false);
		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (pb->req_nr != 0) {
			/*
			 * This is a duplicate writethrough io that is no
			 * longer needed because the block has been demoted.
			 */
			bio_endio(bio, 0);
			cell_defer(cache, cell, false);
			return DM_MAPIO_SUBMITTED;
		} else {
			remap_to_origin_clear_discard(cache, bio, block);
			cell_defer(cache, cell, false);
		}
		break;

	default:
		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	return DM_MAPIO_REMAPPED;
}

static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
	struct cache *cache = ti->private;
	unsigned long flags;
	size_t pb_data_size = get_per_bio_data_size(cache);
	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);

	if (pb->tick) {
		policy_tick(cache->policy);

		spin_lock_irqsave(&cache->lock, flags);
		cache->need_tick_bio = true;
		spin_unlock_irqrestore(&cache->lock, flags);
	}

	check_for_quiesced_migrations(cache, pb);

	return 0;
}

static int write_dirty_bitset(struct cache *cache)
{
	unsigned i, r;

	for (i = 0; i < from_cblock(cache->cache_size); i++) {
		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				       is_dirty(cache, to_cblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int write_discard_bitset(struct cache *cache)
{
	unsigned i, r;

	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
					   cache->discard_nr_blocks);
	if (r) {
		DMERR("could not resize on-disk discard bitset");
		return r;
	}

	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
					 is_discarded(cache, to_dblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
		     uint32_t hint)
{
	struct cache *cache = context;
	return dm_cache_save_hint(cache->cmd, cblock, hint);
}

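/*
 * Walk every mapping known to the policy and persist its hint via
 * save_hint() above.  The hints are handed back to the policy through
 * load_mapping() when the cache is next activated.
 */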
static int write_hints(struct cache *cache)
{
	int r;

	r = dm_cache_begin_hints(cache->cmd, cache->policy);
	if (r) {
		DMERR("dm_cache_begin_hints failed");
		return r;
	}

	r = policy_walk_mappings(cache->policy, save_hint, cache);
	if (r)
		DMERR("policy_walk_mappings failed");

	return r;
}

/*
 * returns true on success
 */
static bool sync_metadata(struct cache *cache)
{
	int r1, r2, r3, r4;

	r1 = write_dirty_bitset(cache);
	if (r1)
		DMERR("could not write dirty bitset");

	r2 = write_discard_bitset(cache);
	if (r2)
		DMERR("could not write discard bitset");

	save_stats(cache);

	r3 = write_hints(cache);
	if (r3)
		DMERR("could not write hints");

	/*
	 * If writing the above metadata failed, we still commit, but don't
	 * set the clean shutdown flag. This will effectively force every
	 * dirty bit to be set on reload.
	 */
	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
	if (r4)
		DMERR("could not write cache metadata. Data loss may occur.");

	return !r1 && !r2 && !r3 && !r4;
}

static void cache_postsuspend(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	start_quiescing(cache);
	wait_for_migrations(cache);
	stop_worker(cache);
	requeue_deferred_io(cache);
	stop_quiescing(cache);

	(void) sync_metadata(cache);
}

static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
			bool dirty, uint32_t hint, bool hint_valid)
{
	int r;
	struct cache *cache = context;

	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
	if (r)
		return r;

	if (dirty)
		set_dirty(cache, oblock, cblock);
	else
		clear_dirty(cache, oblock, cblock);

	return 0;
}

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct cache *cache = context;

	/* FIXME: handle mis-matched block size */

	if (discard)
		set_discard(cache, dblock);
	else
		clear_discard(cache, dblock);

	return 0;
}

static int cache_preresume(struct dm_target *ti)
{
	int r = 0;
	struct cache *cache = ti->private;
	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
	(void) sector_div(actual_cache_size, cache->sectors_per_block);

	/*
	 * Check to see if the cache has resized.
	 */
	if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
		cache->cache_size = to_cblock(actual_cache_size);

		r = dm_cache_resize(cache->cmd, cache->cache_size);
		if (r) {
			DMERR("could not resize cache metadata");
			return r;
		}

		cache->sized = true;
	}

	if (!cache->loaded_mappings) {
		r = dm_cache_load_mappings(cache->cmd, cache->policy,
					   load_mapping, cache);
		if (r) {
			DMERR("could not load cache mappings");
			return r;
		}

		cache->loaded_mappings = true;
	}

	if (!cache->loaded_discards) {
		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
		if (r) {
			DMERR("could not load origin discards");
			return r;
		}

		cache->loaded_discards = true;
	}

	return r;
}

static void cache_resume(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	cache->need_tick_bio = true;
	do_waker(&cache->waker.work);
}

/*
 * Status format:
 *
 * <#used metadata blocks>/<#total metadata blocks>
 * <#read hits> <#read misses> <#write hits> <#write misses>
 * <#demotions> <#promotions> <#blocks in cache> <#dirty>
 * <#features> <features>*
 * <#core args> <core args>
 * <#policy args> <policy args>*
 */
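/*
 * Illustrative STATUSTYPE_INFO line only (every number below is
 * invented, nothing here is emitted by this file as-is):
 *
 *   23/8192 10233 512 4794 81 12 185 4096 0 1 writethrough 2 migration_threshold 2048
 *
 * i.e. 23 of 8192 metadata blocks used, the hit/miss and
 * demotion/promotion counters, 4096 blocks resident, 0 dirty, one
 * feature arg ("writethrough"), two core args, followed by whatever
 * policy_emit_config_values() appends.
 */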
static void cache_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	int r = 0;
	unsigned i;
	ssize_t sz = 0;
	dm_block_t nr_free_blocks_metadata = 0;
	dm_block_t nr_blocks_metadata = 0;
	char buf[BDEVNAME_SIZE];
	struct cache *cache = ti->private;
	dm_cblock_t residency;

	switch (type) {
	case STATUSTYPE_INFO:
		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
			r = dm_cache_commit(cache->cmd, false);
			if (r)
				DMERR("could not commit metadata for accurate status");
		}

		r = dm_cache_get_free_metadata_block_count(cache->cmd,
							   &nr_free_blocks_metadata);
		if (r) {
			DMERR("could not get metadata free block count");
			goto err;
		}

		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
		if (r) {
			DMERR("could not get metadata device size");
			goto err;
		}

		residency = policy_residency(cache->policy);

		DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned) atomic_read(&cache->stats.read_hit),
		       (unsigned) atomic_read(&cache->stats.read_miss),
		       (unsigned) atomic_read(&cache->stats.write_hit),
		       (unsigned) atomic_read(&cache->stats.write_miss),
		       (unsigned) atomic_read(&cache->stats.demotion),
		       (unsigned) atomic_read(&cache->stats.promotion),
		       (unsigned long long) from_cblock(residency),
		       cache->nr_dirty);

		if (cache->features.write_through)
			DMEMIT("1 writethrough ");
		else
			DMEMIT("0 ");

		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
		if (sz < maxlen) {
			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
			if (r)
				DMERR("policy_emit_config_values returned %d", r);
		}

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < cache->nr_ctr_args - 1; i++)
			DMEMIT(" %s", cache->ctr_args[i]);
		if (cache->nr_ctr_args)
			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
	}

	return;

err:
	DMEMIT("Error");
}

/*
 * Supports <key> <value>.
 *
 * The key migration_threshold is supported by the cache target core.
 */
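/*
 * Illustrative example only (device name invented):
 *
 *   dmsetup message my-cache 0 migration_threshold 2048
 *
 * would set the migration threshold (a sector count) to 2048.
 */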
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct cache *cache = ti->private;

	if (argc != 2)
		return -EINVAL;

	return set_config_value(cache, argv[0], argv[1]);
}

static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

/*
 * We assume I/O is going to the origin (which is the volume
 * more likely to have restrictions e.g. by being striped).
 * (Looking up the exact location of the data would be expensive
 * and could always be out of date by the time the bio is submitted.)
 */
static int cache_bvec_merge(struct dm_target *ti,
			    struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct cache *cache = ti->private;
	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = cache->origin_dev->bdev;
	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = cache->discard_block_size * 1024;
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
	set_discard_limits(cache, limits);
}

/*----------------------------------------------------------------*/

static struct target_type cache_target = {
	.name = "cache",
	.version = {1, 1, 1},
	.module = THIS_MODULE,
	.ctr = cache_ctr,
	.dtr = cache_dtr,
	.map = cache_map,
	.end_io = cache_end_io,
	.postsuspend = cache_postsuspend,
	.preresume = cache_preresume,
	.resume = cache_resume,
	.status = cache_status,
	.message = cache_message,
	.iterate_devices = cache_iterate_devices,
	.merge = cache_bvec_merge,
	.io_hints = cache_io_hints,
};

static int __init dm_cache_init(void)
{
	int r;

	r = dm_register_target(&cache_target);
	if (r) {
		DMERR("cache target registration failed: %d", r);
		return r;
	}

	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
	if (!migration_cache) {
		dm_unregister_target(&cache_target);
		return -ENOMEM;
	}

	return 0;
}

static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}

module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");