/*
 * Copyright (C) 2011 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX	"thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing.
 * I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away. We put them in prison
 * where they can't cause any mischief. Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell. When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	struct bio *holder;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
							sizeof(struct cell));
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct cell *__search_bucket(struct hlist_head *bucket,
				    struct cell_key *key)
{
	struct cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}

/*
 * This may block if a new cell needs allocating. You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
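 *
 * An illustrative sketch (not part of the driver) of the pattern callers
 * below use: detain the bio under a key, do the work if we became the
 * holder, then release the cell so any parked bios can proceed.
 *
 *	struct cell *cell;
 *	struct cell_key key;
 *
 *	build_virtual_key(tc->td, block, &key);
 *	if (bio_detain(tc->pool->prison, &key, bio, &cell))
 *		return;		- someone else holds this block, bio is parked
 *
 *	... do the work for the holder ...
 *
 *	cell_release_singleton(cell, bio);	- or cell_error() on failure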
 */
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
		      struct bio *inmate, struct cell **ref)
{
	int r = 1;
	unsigned long flags;
	uint32_t hash = hash_key(prison, key);
	struct cell *cell, *cell2;

	BUG_ON(hash > prison->nr_buckets);

	spin_lock_irqsave(&prison->lock, flags);

	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Allocate a new cell
	 */
	spin_unlock_irqrestore(&prison->lock, flags);
	cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
	spin_lock_irqsave(&prison->lock, flags);

	/*
	 * We've been unlocked, so we have to double check that
	 * nobody else has inserted this cell in the meantime.
	 */
	cell = __search_bucket(prison->cells + hash, key);
	if (cell) {
		mempool_free(cell2, prison->cell_pool);
		bio_list_add(&cell->bios, inmate);
		goto out;
	}

	/*
	 * Use new cell.
	 */
	cell = cell2;

	cell->prison = prison;
	memcpy(&cell->key, key, sizeof(cell->key));
	cell->holder = inmate;
	bio_list_init(&cell->bios);
	hlist_add_head(&cell->list, prison->cells + hash);

	r = 0;

out:
	spin_unlock_irqrestore(&prison->lock, flags);

	*ref = cell;

	return r;
}

/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct cell *cell, struct bio_list *inmates)
{
	struct bio_prison *prison = cell->prison;

	hlist_del(&cell->list);

	if (inmates) {
		bio_list_add(inmates, cell->holder);
		bio_list_merge(inmates, &cell->bios);
	}

	mempool_free(cell, prison->cell_pool);
}

static void cell_release(struct cell *cell, struct bio_list *bios)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, bios);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * There are a couple of places where we put a bio into a cell briefly
 * before taking it out again. In these situations we know that no other
 * bio may be in the cell. This function releases the cell, and also does
 * a sanity check.
 */
static void __cell_release_singleton(struct cell *cell, struct bio *bio)
{
	BUG_ON(cell->holder != bio);
	BUG_ON(!bio_list_empty(&cell->bios));

	__cell_release(cell, NULL);
}

static void cell_release_singleton(struct cell *cell, struct bio *bio)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release_singleton(cell, bio);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * Sometimes we don't want the holder, just the additional bios.
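 * Typically the holder is a write bio that covered the whole block and has
 * already been remapped and issued (see io_overwrites_block() below), so
 * only the remaining inmates go back onto the pool's deferred list.
 * cell_defer_except() further down is the wrapper that ends up here, and
 * process_prepared_mapping() uses it like this:
 *
 *	if (bio) {
 *		cell_defer_except(tc, m->cell);
 *		bio_endio(bio, 0);
 *	} else
 *		cell_defer(tc, m->cell, m->data_block);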
326 */ 327 static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) 328 { 329 struct bio_prison *prison = cell->prison; 330 331 hlist_del(&cell->list); 332 bio_list_merge(inmates, &cell->bios); 333 334 mempool_free(cell, prison->cell_pool); 335 } 336 337 static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) 338 { 339 unsigned long flags; 340 struct bio_prison *prison = cell->prison; 341 342 spin_lock_irqsave(&prison->lock, flags); 343 __cell_release_no_holder(cell, inmates); 344 spin_unlock_irqrestore(&prison->lock, flags); 345 } 346 347 static void cell_error(struct cell *cell) 348 { 349 struct bio_prison *prison = cell->prison; 350 struct bio_list bios; 351 struct bio *bio; 352 unsigned long flags; 353 354 bio_list_init(&bios); 355 356 spin_lock_irqsave(&prison->lock, flags); 357 __cell_release(cell, &bios); 358 spin_unlock_irqrestore(&prison->lock, flags); 359 360 while ((bio = bio_list_pop(&bios))) 361 bio_io_error(bio); 362 } 363 364 /*----------------------------------------------------------------*/ 365 366 /* 367 * We use the deferred set to keep track of pending reads to shared blocks. 368 * We do this to ensure the new mapping caused by a write isn't performed 369 * until these prior reads have completed. Otherwise the insertion of the 370 * new mapping could free the old block that the read bios are mapped to. 371 */ 372 373 struct deferred_set; 374 struct deferred_entry { 375 struct deferred_set *ds; 376 unsigned count; 377 struct list_head work_items; 378 }; 379 380 struct deferred_set { 381 spinlock_t lock; 382 unsigned current_entry; 383 unsigned sweeper; 384 struct deferred_entry entries[DEFERRED_SET_SIZE]; 385 }; 386 387 static void ds_init(struct deferred_set *ds) 388 { 389 int i; 390 391 spin_lock_init(&ds->lock); 392 ds->current_entry = 0; 393 ds->sweeper = 0; 394 for (i = 0; i < DEFERRED_SET_SIZE; i++) { 395 ds->entries[i].ds = ds; 396 ds->entries[i].count = 0; 397 INIT_LIST_HEAD(&ds->entries[i].work_items); 398 } 399 } 400 401 static struct deferred_entry *ds_inc(struct deferred_set *ds) 402 { 403 unsigned long flags; 404 struct deferred_entry *entry; 405 406 spin_lock_irqsave(&ds->lock, flags); 407 entry = ds->entries + ds->current_entry; 408 entry->count++; 409 spin_unlock_irqrestore(&ds->lock, flags); 410 411 return entry; 412 } 413 414 static unsigned ds_next(unsigned index) 415 { 416 return (index + 1) % DEFERRED_SET_SIZE; 417 } 418 419 static void __sweep(struct deferred_set *ds, struct list_head *head) 420 { 421 while ((ds->sweeper != ds->current_entry) && 422 !ds->entries[ds->sweeper].count) { 423 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 424 ds->sweeper = ds_next(ds->sweeper); 425 } 426 427 if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) 428 list_splice_init(&ds->entries[ds->sweeper].work_items, head); 429 } 430 431 static void ds_dec(struct deferred_entry *entry, struct list_head *head) 432 { 433 unsigned long flags; 434 435 spin_lock_irqsave(&entry->ds->lock, flags); 436 BUG_ON(!entry->count); 437 --entry->count; 438 __sweep(entry->ds, head); 439 spin_unlock_irqrestore(&entry->ds->lock, flags); 440 } 441 442 /* 443 * Returns 1 if deferred or 0 if no pending items to delay job. 
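 *
 * Roughly how the two deferred sets are used elsewhere in this file: a
 * shared read takes a reference on the current entry,
 *
 *	h->shared_read_entry = ds_inc(&pool->shared_read_ds);
 *
 * and drops it again when the bio completes (ds_dec() sweeps any work items
 * whose entries have drained). A copy that must wait for such reads queues
 * itself with:
 *
 *	if (!ds_add_work(&pool->shared_read_ds, &m->list))
 *		m->quiesced = 1;	- nothing outstanding, no need to wait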
444 */ 445 static int ds_add_work(struct deferred_set *ds, struct list_head *work) 446 { 447 int r = 1; 448 unsigned long flags; 449 unsigned next_entry; 450 451 spin_lock_irqsave(&ds->lock, flags); 452 if ((ds->sweeper == ds->current_entry) && 453 !ds->entries[ds->current_entry].count) 454 r = 0; 455 else { 456 list_add(work, &ds->entries[ds->current_entry].work_items); 457 next_entry = ds_next(ds->current_entry); 458 if (!ds->entries[next_entry].count) 459 ds->current_entry = next_entry; 460 } 461 spin_unlock_irqrestore(&ds->lock, flags); 462 463 return r; 464 } 465 466 /*----------------------------------------------------------------*/ 467 468 /* 469 * Key building. 470 */ 471 static void build_data_key(struct dm_thin_device *td, 472 dm_block_t b, struct cell_key *key) 473 { 474 key->virtual = 0; 475 key->dev = dm_thin_dev_id(td); 476 key->block = b; 477 } 478 479 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, 480 struct cell_key *key) 481 { 482 key->virtual = 1; 483 key->dev = dm_thin_dev_id(td); 484 key->block = b; 485 } 486 487 /*----------------------------------------------------------------*/ 488 489 /* 490 * A pool device ties together a metadata device and a data device. It 491 * also provides the interface for creating and destroying internal 492 * devices. 493 */ 494 struct new_mapping; 495 496 struct pool_features { 497 unsigned zero_new_blocks:1; 498 unsigned discard_enabled:1; 499 unsigned discard_passdown:1; 500 }; 501 502 struct pool { 503 struct list_head list; 504 struct dm_target *ti; /* Only set if a pool target is bound */ 505 506 struct mapped_device *pool_md; 507 struct block_device *md_dev; 508 struct dm_pool_metadata *pmd; 509 510 uint32_t sectors_per_block; 511 unsigned block_shift; 512 dm_block_t offset_mask; 513 dm_block_t low_water_blocks; 514 515 struct pool_features pf; 516 unsigned low_water_triggered:1; /* A dm event has been sent */ 517 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 518 519 struct bio_prison *prison; 520 struct dm_kcopyd_client *copier; 521 522 struct workqueue_struct *wq; 523 struct work_struct worker; 524 struct delayed_work waker; 525 526 unsigned ref_count; 527 unsigned long last_commit_jiffies; 528 529 spinlock_t lock; 530 struct bio_list deferred_bios; 531 struct bio_list deferred_flush_bios; 532 struct list_head prepared_mappings; 533 struct list_head prepared_discards; 534 535 struct bio_list retry_on_resume_list; 536 537 struct deferred_set shared_read_ds; 538 struct deferred_set all_io_ds; 539 540 struct new_mapping *next_mapping; 541 mempool_t *mapping_pool; 542 mempool_t *endio_hook_pool; 543 }; 544 545 /* 546 * Target context for a pool. 547 */ 548 struct pool_c { 549 struct dm_target *ti; 550 struct pool *pool; 551 struct dm_dev *data_dev; 552 struct dm_dev *metadata_dev; 553 struct dm_target_callbacks callbacks; 554 555 dm_block_t low_water_blocks; 556 struct pool_features pf; 557 }; 558 559 /* 560 * Target context for a thin. 561 */ 562 struct thin_c { 563 struct dm_dev *pool_dev; 564 struct dm_dev *origin_dev; 565 dm_thin_id dev_id; 566 567 struct pool *pool; 568 struct dm_thin_device *td; 569 }; 570 571 /*----------------------------------------------------------------*/ 572 573 /* 574 * A global list of pools that uses a struct mapped_device as a key. 
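 *
 * All access is serialised by dm_thin_pool_table.mutex; the BUG_ON()s in
 * the helpers below assert that the caller holds it. pool_ctr(), for
 * example, takes the mutex and then looks the pool up (or creates it):
 *
 *	mutex_lock(&dm_thin_pool_table.mutex);
 *	...
 *	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
 *			   block_size, &ti->error, &pool_created);
 *	...
 *	mutex_unlock(&dm_thin_pool_table.mutex);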
575 */ 576 static struct dm_thin_pool_table { 577 struct mutex mutex; 578 struct list_head pools; 579 } dm_thin_pool_table; 580 581 static void pool_table_init(void) 582 { 583 mutex_init(&dm_thin_pool_table.mutex); 584 INIT_LIST_HEAD(&dm_thin_pool_table.pools); 585 } 586 587 static void __pool_table_insert(struct pool *pool) 588 { 589 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 590 list_add(&pool->list, &dm_thin_pool_table.pools); 591 } 592 593 static void __pool_table_remove(struct pool *pool) 594 { 595 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 596 list_del(&pool->list); 597 } 598 599 static struct pool *__pool_table_lookup(struct mapped_device *md) 600 { 601 struct pool *pool = NULL, *tmp; 602 603 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 604 605 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 606 if (tmp->pool_md == md) { 607 pool = tmp; 608 break; 609 } 610 } 611 612 return pool; 613 } 614 615 static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) 616 { 617 struct pool *pool = NULL, *tmp; 618 619 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 620 621 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { 622 if (tmp->md_dev == md_dev) { 623 pool = tmp; 624 break; 625 } 626 } 627 628 return pool; 629 } 630 631 /*----------------------------------------------------------------*/ 632 633 struct endio_hook { 634 struct thin_c *tc; 635 struct deferred_entry *shared_read_entry; 636 struct deferred_entry *all_io_entry; 637 struct new_mapping *overwrite_mapping; 638 }; 639 640 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 641 { 642 struct bio *bio; 643 struct bio_list bios; 644 645 bio_list_init(&bios); 646 bio_list_merge(&bios, master); 647 bio_list_init(master); 648 649 while ((bio = bio_list_pop(&bios))) { 650 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 651 if (h->tc == tc) 652 bio_endio(bio, DM_ENDIO_REQUEUE); 653 else 654 bio_list_add(master, bio); 655 } 656 } 657 658 static void requeue_io(struct thin_c *tc) 659 { 660 struct pool *pool = tc->pool; 661 unsigned long flags; 662 663 spin_lock_irqsave(&pool->lock, flags); 664 __requeue_bio_list(tc, &pool->deferred_bios); 665 __requeue_bio_list(tc, &pool->retry_on_resume_list); 666 spin_unlock_irqrestore(&pool->lock, flags); 667 } 668 669 /* 670 * This section of code contains the logic for processing a thin device's IO. 671 * Much of the code depends on pool object resources (lists, workqueues, etc) 672 * but most is exclusively called from the thin target rather than the thin-pool 673 * target. 674 */ 675 676 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) 677 { 678 return bio->bi_sector >> tc->pool->block_shift; 679 } 680 681 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) 682 { 683 struct pool *pool = tc->pool; 684 685 bio->bi_bdev = tc->pool_dev->bdev; 686 bio->bi_sector = (block << pool->block_shift) + 687 (bio->bi_sector & pool->offset_mask); 688 } 689 690 static void remap_to_origin(struct thin_c *tc, struct bio *bio) 691 { 692 bio->bi_bdev = tc->origin_dev->bdev; 693 } 694 695 static void issue(struct thin_c *tc, struct bio *bio) 696 { 697 struct pool *pool = tc->pool; 698 unsigned long flags; 699 700 /* 701 * Batch together any FUA/FLUSH bios we find and then issue 702 * a single commit for them in process_deferred_bios(). 
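 *
 * In outline (restating code that appears later in this file),
 * process_deferred_bios() does:
 *
 *	r = dm_pool_commit_metadata(pool->pmd);
 *	if (r) {
 *		while ((bio = bio_list_pop(&bios)))
 *			bio_io_error(bio);
 *		return;
 *	}
 *	while ((bio = bio_list_pop(&bios)))
 *		generic_make_request(bio);
 *
 * so one metadata commit covers every FUA/FLUSH bio in the batch.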
703 */ 704 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 705 spin_lock_irqsave(&pool->lock, flags); 706 bio_list_add(&pool->deferred_flush_bios, bio); 707 spin_unlock_irqrestore(&pool->lock, flags); 708 } else 709 generic_make_request(bio); 710 } 711 712 static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) 713 { 714 remap_to_origin(tc, bio); 715 issue(tc, bio); 716 } 717 718 static void remap_and_issue(struct thin_c *tc, struct bio *bio, 719 dm_block_t block) 720 { 721 remap(tc, bio, block); 722 issue(tc, bio); 723 } 724 725 /* 726 * wake_worker() is used when new work is queued and when pool_resume is 727 * ready to continue deferred IO processing. 728 */ 729 static void wake_worker(struct pool *pool) 730 { 731 queue_work(pool->wq, &pool->worker); 732 } 733 734 /*----------------------------------------------------------------*/ 735 736 /* 737 * Bio endio functions. 738 */ 739 struct new_mapping { 740 struct list_head list; 741 742 unsigned quiesced:1; 743 unsigned prepared:1; 744 unsigned pass_discard:1; 745 746 struct thin_c *tc; 747 dm_block_t virt_block; 748 dm_block_t data_block; 749 struct cell *cell, *cell2; 750 int err; 751 752 /* 753 * If the bio covers the whole area of a block then we can avoid 754 * zeroing or copying. Instead this bio is hooked. The bio will 755 * still be in the cell, so care has to be taken to avoid issuing 756 * the bio twice. 757 */ 758 struct bio *bio; 759 bio_end_io_t *saved_bi_end_io; 760 }; 761 762 static void __maybe_add_mapping(struct new_mapping *m) 763 { 764 struct pool *pool = m->tc->pool; 765 766 if (m->quiesced && m->prepared) { 767 list_add(&m->list, &pool->prepared_mappings); 768 wake_worker(pool); 769 } 770 } 771 772 static void copy_complete(int read_err, unsigned long write_err, void *context) 773 { 774 unsigned long flags; 775 struct new_mapping *m = context; 776 struct pool *pool = m->tc->pool; 777 778 m->err = read_err || write_err ? -EIO : 0; 779 780 spin_lock_irqsave(&pool->lock, flags); 781 m->prepared = 1; 782 __maybe_add_mapping(m); 783 spin_unlock_irqrestore(&pool->lock, flags); 784 } 785 786 static void overwrite_endio(struct bio *bio, int err) 787 { 788 unsigned long flags; 789 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 790 struct new_mapping *m = h->overwrite_mapping; 791 struct pool *pool = m->tc->pool; 792 793 m->err = err; 794 795 spin_lock_irqsave(&pool->lock, flags); 796 m->prepared = 1; 797 __maybe_add_mapping(m); 798 spin_unlock_irqrestore(&pool->lock, flags); 799 } 800 801 /*----------------------------------------------------------------*/ 802 803 /* 804 * Workqueue. 805 */ 806 807 /* 808 * Prepared mapping jobs. 809 */ 810 811 /* 812 * This sends the bios in the cell back to the deferred_bios list. 813 */ 814 static void cell_defer(struct thin_c *tc, struct cell *cell, 815 dm_block_t data_block) 816 { 817 struct pool *pool = tc->pool; 818 unsigned long flags; 819 820 spin_lock_irqsave(&pool->lock, flags); 821 cell_release(cell, &pool->deferred_bios); 822 spin_unlock_irqrestore(&tc->pool->lock, flags); 823 824 wake_worker(pool); 825 } 826 827 /* 828 * Same as cell_defer above, except it omits one particular detainee, 829 * a write bio that covers the block and has already been processed. 
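 *
 * For context: a struct new_mapping only reaches process_prepared_mapping()
 * once both of its flags are set, with __maybe_add_mapping() as the gate:
 *
 *	if (m->quiesced && m->prepared) {
 *		list_add(&m->list, &pool->prepared_mappings);
 *		wake_worker(pool);
 *	}
 *
 * quiesced means prior reads of the shared block have drained (deferred
 * set); prepared means the copy/zero, or the overwriting bio, has finished.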
830 */ 831 static void cell_defer_except(struct thin_c *tc, struct cell *cell) 832 { 833 struct bio_list bios; 834 struct pool *pool = tc->pool; 835 unsigned long flags; 836 837 bio_list_init(&bios); 838 839 spin_lock_irqsave(&pool->lock, flags); 840 cell_release_no_holder(cell, &pool->deferred_bios); 841 spin_unlock_irqrestore(&pool->lock, flags); 842 843 wake_worker(pool); 844 } 845 846 static void process_prepared_mapping(struct new_mapping *m) 847 { 848 struct thin_c *tc = m->tc; 849 struct bio *bio; 850 int r; 851 852 bio = m->bio; 853 if (bio) 854 bio->bi_end_io = m->saved_bi_end_io; 855 856 if (m->err) { 857 cell_error(m->cell); 858 return; 859 } 860 861 /* 862 * Commit the prepared block into the mapping btree. 863 * Any I/O for this block arriving after this point will get 864 * remapped to it directly. 865 */ 866 r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); 867 if (r) { 868 DMERR("dm_thin_insert_block() failed"); 869 cell_error(m->cell); 870 return; 871 } 872 873 /* 874 * Release any bios held while the block was being provisioned. 875 * If we are processing a write bio that completely covers the block, 876 * we already processed it so can ignore it now when processing 877 * the bios in the cell. 878 */ 879 if (bio) { 880 cell_defer_except(tc, m->cell); 881 bio_endio(bio, 0); 882 } else 883 cell_defer(tc, m->cell, m->data_block); 884 885 list_del(&m->list); 886 mempool_free(m, tc->pool->mapping_pool); 887 } 888 889 static void process_prepared_discard(struct new_mapping *m) 890 { 891 int r; 892 struct thin_c *tc = m->tc; 893 894 r = dm_thin_remove_block(tc->td, m->virt_block); 895 if (r) 896 DMERR("dm_thin_remove_block() failed"); 897 898 /* 899 * Pass the discard down to the underlying device? 900 */ 901 if (m->pass_discard) 902 remap_and_issue(tc, m->bio, m->data_block); 903 else 904 bio_endio(m->bio, 0); 905 906 cell_defer_except(tc, m->cell); 907 cell_defer_except(tc, m->cell2); 908 mempool_free(m, tc->pool->mapping_pool); 909 } 910 911 static void process_prepared(struct pool *pool, struct list_head *head, 912 void (*fn)(struct new_mapping *)) 913 { 914 unsigned long flags; 915 struct list_head maps; 916 struct new_mapping *m, *tmp; 917 918 INIT_LIST_HEAD(&maps); 919 spin_lock_irqsave(&pool->lock, flags); 920 list_splice_init(head, &maps); 921 spin_unlock_irqrestore(&pool->lock, flags); 922 923 list_for_each_entry_safe(m, tmp, &maps, list) 924 fn(m); 925 } 926 927 /* 928 * Deferred bio jobs. 929 */ 930 static int io_overlaps_block(struct pool *pool, struct bio *bio) 931 { 932 return !(bio->bi_sector & pool->offset_mask) && 933 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 934 935 } 936 937 static int io_overwrites_block(struct pool *pool, struct bio *bio) 938 { 939 return (bio_data_dir(bio) == WRITE) && 940 io_overlaps_block(pool, bio); 941 } 942 943 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 944 bio_end_io_t *fn) 945 { 946 *save = bio->bi_end_io; 947 bio->bi_end_io = fn; 948 } 949 950 static int ensure_next_mapping(struct pool *pool) 951 { 952 if (pool->next_mapping) 953 return 0; 954 955 pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); 956 957 return pool->next_mapping ? 
0 : -ENOMEM; 958 } 959 960 static struct new_mapping *get_next_mapping(struct pool *pool) 961 { 962 struct new_mapping *r = pool->next_mapping; 963 964 BUG_ON(!pool->next_mapping); 965 966 pool->next_mapping = NULL; 967 968 return r; 969 } 970 971 static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 972 struct dm_dev *origin, dm_block_t data_origin, 973 dm_block_t data_dest, 974 struct cell *cell, struct bio *bio) 975 { 976 int r; 977 struct pool *pool = tc->pool; 978 struct new_mapping *m = get_next_mapping(pool); 979 980 INIT_LIST_HEAD(&m->list); 981 m->quiesced = 0; 982 m->prepared = 0; 983 m->tc = tc; 984 m->virt_block = virt_block; 985 m->data_block = data_dest; 986 m->cell = cell; 987 m->err = 0; 988 m->bio = NULL; 989 990 if (!ds_add_work(&pool->shared_read_ds, &m->list)) 991 m->quiesced = 1; 992 993 /* 994 * IO to pool_dev remaps to the pool target's data_dev. 995 * 996 * If the whole block of data is being overwritten, we can issue the 997 * bio immediately. Otherwise we use kcopyd to clone the data first. 998 */ 999 if (io_overwrites_block(pool, bio)) { 1000 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1001 h->overwrite_mapping = m; 1002 m->bio = bio; 1003 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1004 remap_and_issue(tc, bio, data_dest); 1005 } else { 1006 struct dm_io_region from, to; 1007 1008 from.bdev = origin->bdev; 1009 from.sector = data_origin * pool->sectors_per_block; 1010 from.count = pool->sectors_per_block; 1011 1012 to.bdev = tc->pool_dev->bdev; 1013 to.sector = data_dest * pool->sectors_per_block; 1014 to.count = pool->sectors_per_block; 1015 1016 r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 1017 0, copy_complete, m); 1018 if (r < 0) { 1019 mempool_free(m, pool->mapping_pool); 1020 DMERR("dm_kcopyd_copy() failed"); 1021 cell_error(cell); 1022 } 1023 } 1024 } 1025 1026 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1027 dm_block_t data_origin, dm_block_t data_dest, 1028 struct cell *cell, struct bio *bio) 1029 { 1030 schedule_copy(tc, virt_block, tc->pool_dev, 1031 data_origin, data_dest, cell, bio); 1032 } 1033 1034 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1035 dm_block_t data_dest, 1036 struct cell *cell, struct bio *bio) 1037 { 1038 schedule_copy(tc, virt_block, tc->origin_dev, 1039 virt_block, data_dest, cell, bio); 1040 } 1041 1042 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1043 dm_block_t data_block, struct cell *cell, 1044 struct bio *bio) 1045 { 1046 struct pool *pool = tc->pool; 1047 struct new_mapping *m = get_next_mapping(pool); 1048 1049 INIT_LIST_HEAD(&m->list); 1050 m->quiesced = 1; 1051 m->prepared = 0; 1052 m->tc = tc; 1053 m->virt_block = virt_block; 1054 m->data_block = data_block; 1055 m->cell = cell; 1056 m->err = 0; 1057 m->bio = NULL; 1058 1059 /* 1060 * If the whole block of data is being overwritten or we are not 1061 * zeroing pre-existing data, we can issue the bio immediately. 1062 * Otherwise we use kcopyd to zero the data first. 
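 *
 * "Whole block being overwritten" is decided by io_overwrites_block(): the
 * bio must be a WRITE that starts on a block boundary and spans exactly one
 * block. For example, assuming a 128 sector (64KiB) block size:
 *
 *	(bio->bi_sector & pool->offset_mask) == 0	- multiple of 128
 *	bio->bi_size == (128 << SECTOR_SHIFT)		- i.e. 65536 bytes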
1063 */ 1064 if (!pool->pf.zero_new_blocks) 1065 process_prepared_mapping(m); 1066 1067 else if (io_overwrites_block(pool, bio)) { 1068 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1069 h->overwrite_mapping = m; 1070 m->bio = bio; 1071 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1072 remap_and_issue(tc, bio, data_block); 1073 1074 } else { 1075 int r; 1076 struct dm_io_region to; 1077 1078 to.bdev = tc->pool_dev->bdev; 1079 to.sector = data_block * pool->sectors_per_block; 1080 to.count = pool->sectors_per_block; 1081 1082 r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); 1083 if (r < 0) { 1084 mempool_free(m, pool->mapping_pool); 1085 DMERR("dm_kcopyd_zero() failed"); 1086 cell_error(cell); 1087 } 1088 } 1089 } 1090 1091 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1092 { 1093 int r; 1094 dm_block_t free_blocks; 1095 unsigned long flags; 1096 struct pool *pool = tc->pool; 1097 1098 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1099 if (r) 1100 return r; 1101 1102 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1103 DMWARN("%s: reached low water mark, sending event.", 1104 dm_device_name(pool->pool_md)); 1105 spin_lock_irqsave(&pool->lock, flags); 1106 pool->low_water_triggered = 1; 1107 spin_unlock_irqrestore(&pool->lock, flags); 1108 dm_table_event(pool->ti->table); 1109 } 1110 1111 if (!free_blocks) { 1112 if (pool->no_free_space) 1113 return -ENOSPC; 1114 else { 1115 /* 1116 * Try to commit to see if that will free up some 1117 * more space. 1118 */ 1119 r = dm_pool_commit_metadata(pool->pmd); 1120 if (r) { 1121 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1122 __func__, r); 1123 return r; 1124 } 1125 1126 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1127 if (r) 1128 return r; 1129 1130 /* 1131 * If we still have no space we set a flag to avoid 1132 * doing all this checking and return -ENOSPC. 1133 */ 1134 if (!free_blocks) { 1135 DMWARN("%s: no free space available.", 1136 dm_device_name(pool->pool_md)); 1137 spin_lock_irqsave(&pool->lock, flags); 1138 pool->no_free_space = 1; 1139 spin_unlock_irqrestore(&pool->lock, flags); 1140 return -ENOSPC; 1141 } 1142 } 1143 } 1144 1145 r = dm_pool_alloc_data_block(pool->pmd, result); 1146 if (r) 1147 return r; 1148 1149 return 0; 1150 } 1151 1152 /* 1153 * If we have run out of space, queue bios until the device is 1154 * resumed, presumably after having been reloaded with more space. 
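 *
 * pool_resume() later undoes this via __requeue_bios() (both appear further
 * down in this file):
 *
 *	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
 *	bio_list_init(&pool->retry_on_resume_list);
 *
 * so the parked bios are simply fed back through the deferred path once
 * more space is available.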
1155 */ 1156 static void retry_on_resume(struct bio *bio) 1157 { 1158 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1159 struct thin_c *tc = h->tc; 1160 struct pool *pool = tc->pool; 1161 unsigned long flags; 1162 1163 spin_lock_irqsave(&pool->lock, flags); 1164 bio_list_add(&pool->retry_on_resume_list, bio); 1165 spin_unlock_irqrestore(&pool->lock, flags); 1166 } 1167 1168 static void no_space(struct cell *cell) 1169 { 1170 struct bio *bio; 1171 struct bio_list bios; 1172 1173 bio_list_init(&bios); 1174 cell_release(cell, &bios); 1175 1176 while ((bio = bio_list_pop(&bios))) 1177 retry_on_resume(bio); 1178 } 1179 1180 static void process_discard(struct thin_c *tc, struct bio *bio) 1181 { 1182 int r; 1183 unsigned long flags; 1184 struct pool *pool = tc->pool; 1185 struct cell *cell, *cell2; 1186 struct cell_key key, key2; 1187 dm_block_t block = get_bio_block(tc, bio); 1188 struct dm_thin_lookup_result lookup_result; 1189 struct new_mapping *m; 1190 1191 build_virtual_key(tc->td, block, &key); 1192 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1193 return; 1194 1195 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1196 switch (r) { 1197 case 0: 1198 /* 1199 * Check nobody is fiddling with this pool block. This can 1200 * happen if someone's in the process of breaking sharing 1201 * on this block. 1202 */ 1203 build_data_key(tc->td, lookup_result.block, &key2); 1204 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { 1205 cell_release_singleton(cell, bio); 1206 break; 1207 } 1208 1209 if (io_overlaps_block(pool, bio)) { 1210 /* 1211 * IO may still be going to the destination block. We must 1212 * quiesce before we can do the removal. 1213 */ 1214 m = get_next_mapping(pool); 1215 m->tc = tc; 1216 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; 1217 m->virt_block = block; 1218 m->data_block = lookup_result.block; 1219 m->cell = cell; 1220 m->cell2 = cell2; 1221 m->err = 0; 1222 m->bio = bio; 1223 1224 if (!ds_add_work(&pool->all_io_ds, &m->list)) { 1225 spin_lock_irqsave(&pool->lock, flags); 1226 list_add(&m->list, &pool->prepared_discards); 1227 spin_unlock_irqrestore(&pool->lock, flags); 1228 wake_worker(pool); 1229 } 1230 } else { 1231 /* 1232 * This path is hit if people are ignoring 1233 * limits->discard_granularity. It ignores any 1234 * part of the discard that is in a subsequent 1235 * block. 1236 */ 1237 sector_t offset = bio->bi_sector - (block << pool->block_shift); 1238 unsigned remaining = (pool->sectors_per_block - offset) << 9; 1239 bio->bi_size = min(bio->bi_size, remaining); 1240 1241 cell_release_singleton(cell, bio); 1242 cell_release_singleton(cell2, bio); 1243 remap_and_issue(tc, bio, lookup_result.block); 1244 } 1245 break; 1246 1247 case -ENODATA: 1248 /* 1249 * It isn't provisioned, just forget it. 
1250 */ 1251 cell_release_singleton(cell, bio); 1252 bio_endio(bio, 0); 1253 break; 1254 1255 default: 1256 DMERR("discard: find block unexpectedly returned %d", r); 1257 cell_release_singleton(cell, bio); 1258 bio_io_error(bio); 1259 break; 1260 } 1261 } 1262 1263 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1264 struct cell_key *key, 1265 struct dm_thin_lookup_result *lookup_result, 1266 struct cell *cell) 1267 { 1268 int r; 1269 dm_block_t data_block; 1270 1271 r = alloc_data_block(tc, &data_block); 1272 switch (r) { 1273 case 0: 1274 schedule_internal_copy(tc, block, lookup_result->block, 1275 data_block, cell, bio); 1276 break; 1277 1278 case -ENOSPC: 1279 no_space(cell); 1280 break; 1281 1282 default: 1283 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1284 cell_error(cell); 1285 break; 1286 } 1287 } 1288 1289 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1290 dm_block_t block, 1291 struct dm_thin_lookup_result *lookup_result) 1292 { 1293 struct cell *cell; 1294 struct pool *pool = tc->pool; 1295 struct cell_key key; 1296 1297 /* 1298 * If cell is already occupied, then sharing is already in the process 1299 * of being broken so we have nothing further to do here. 1300 */ 1301 build_data_key(tc->td, lookup_result->block, &key); 1302 if (bio_detain(pool->prison, &key, bio, &cell)) 1303 return; 1304 1305 if (bio_data_dir(bio) == WRITE) 1306 break_sharing(tc, bio, block, &key, lookup_result, cell); 1307 else { 1308 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1309 1310 h->shared_read_entry = ds_inc(&pool->shared_read_ds); 1311 1312 cell_release_singleton(cell, bio); 1313 remap_and_issue(tc, bio, lookup_result->block); 1314 } 1315 } 1316 1317 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1318 struct cell *cell) 1319 { 1320 int r; 1321 dm_block_t data_block; 1322 1323 /* 1324 * Remap empty bios (flushes) immediately, without provisioning. 1325 */ 1326 if (!bio->bi_size) { 1327 cell_release_singleton(cell, bio); 1328 remap_and_issue(tc, bio, 0); 1329 return; 1330 } 1331 1332 /* 1333 * Fill read bios with zeroes and complete them immediately. 1334 */ 1335 if (bio_data_dir(bio) == READ) { 1336 zero_fill_bio(bio); 1337 cell_release_singleton(cell, bio); 1338 bio_endio(bio, 0); 1339 return; 1340 } 1341 1342 r = alloc_data_block(tc, &data_block); 1343 switch (r) { 1344 case 0: 1345 if (tc->origin_dev) 1346 schedule_external_copy(tc, block, data_block, cell, bio); 1347 else 1348 schedule_zero(tc, block, data_block, cell, bio); 1349 break; 1350 1351 case -ENOSPC: 1352 no_space(cell); 1353 break; 1354 1355 default: 1356 DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); 1357 cell_error(cell); 1358 break; 1359 } 1360 } 1361 1362 static void process_bio(struct thin_c *tc, struct bio *bio) 1363 { 1364 int r; 1365 dm_block_t block = get_bio_block(tc, bio); 1366 struct cell *cell; 1367 struct cell_key key; 1368 struct dm_thin_lookup_result lookup_result; 1369 1370 /* 1371 * If cell is already occupied, then the block is already 1372 * being provisioned so we have nothing further to do here. 1373 */ 1374 build_virtual_key(tc->td, block, &key); 1375 if (bio_detain(tc->pool->prison, &key, bio, &cell)) 1376 return; 1377 1378 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1379 switch (r) { 1380 case 0: 1381 /* 1382 * We can release this cell now. This thread is the only 1383 * one that puts bios into a cell, and we know there were 1384 * no preceding bios. 
1385 */ 1386 /* 1387 * TODO: this will probably have to change when discard goes 1388 * back in. 1389 */ 1390 cell_release_singleton(cell, bio); 1391 1392 if (lookup_result.shared) 1393 process_shared_bio(tc, bio, block, &lookup_result); 1394 else 1395 remap_and_issue(tc, bio, lookup_result.block); 1396 break; 1397 1398 case -ENODATA: 1399 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1400 cell_release_singleton(cell, bio); 1401 remap_to_origin_and_issue(tc, bio); 1402 } else 1403 provision_block(tc, bio, block, cell); 1404 break; 1405 1406 default: 1407 DMERR("dm_thin_find_block() failed, error = %d", r); 1408 cell_release_singleton(cell, bio); 1409 bio_io_error(bio); 1410 break; 1411 } 1412 } 1413 1414 static int need_commit_due_to_time(struct pool *pool) 1415 { 1416 return jiffies < pool->last_commit_jiffies || 1417 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1418 } 1419 1420 static void process_deferred_bios(struct pool *pool) 1421 { 1422 unsigned long flags; 1423 struct bio *bio; 1424 struct bio_list bios; 1425 int r; 1426 1427 bio_list_init(&bios); 1428 1429 spin_lock_irqsave(&pool->lock, flags); 1430 bio_list_merge(&bios, &pool->deferred_bios); 1431 bio_list_init(&pool->deferred_bios); 1432 spin_unlock_irqrestore(&pool->lock, flags); 1433 1434 while ((bio = bio_list_pop(&bios))) { 1435 struct endio_hook *h = dm_get_mapinfo(bio)->ptr; 1436 struct thin_c *tc = h->tc; 1437 1438 /* 1439 * If we've got no free new_mapping structs, and processing 1440 * this bio might require one, we pause until there are some 1441 * prepared mappings to process. 1442 */ 1443 if (ensure_next_mapping(pool)) { 1444 spin_lock_irqsave(&pool->lock, flags); 1445 bio_list_merge(&pool->deferred_bios, &bios); 1446 spin_unlock_irqrestore(&pool->lock, flags); 1447 1448 break; 1449 } 1450 1451 if (bio->bi_rw & REQ_DISCARD) 1452 process_discard(tc, bio); 1453 else 1454 process_bio(tc, bio); 1455 } 1456 1457 /* 1458 * If there are any deferred flush bios, we must commit 1459 * the metadata before issuing them. 1460 */ 1461 bio_list_init(&bios); 1462 spin_lock_irqsave(&pool->lock, flags); 1463 bio_list_merge(&bios, &pool->deferred_flush_bios); 1464 bio_list_init(&pool->deferred_flush_bios); 1465 spin_unlock_irqrestore(&pool->lock, flags); 1466 1467 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) 1468 return; 1469 1470 r = dm_pool_commit_metadata(pool->pmd); 1471 if (r) { 1472 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 1473 __func__, r); 1474 while ((bio = bio_list_pop(&bios))) 1475 bio_io_error(bio); 1476 return; 1477 } 1478 pool->last_commit_jiffies = jiffies; 1479 1480 while ((bio = bio_list_pop(&bios))) 1481 generic_make_request(bio); 1482 } 1483 1484 static void do_worker(struct work_struct *ws) 1485 { 1486 struct pool *pool = container_of(ws, struct pool, worker); 1487 1488 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); 1489 process_prepared(pool, &pool->prepared_discards, process_prepared_discard); 1490 process_deferred_bios(pool); 1491 } 1492 1493 /* 1494 * We want to commit periodically so that not too much 1495 * unwritten data builds up. 1496 */ 1497 static void do_waker(struct work_struct *ws) 1498 { 1499 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 1500 wake_worker(pool); 1501 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 1502 } 1503 1504 /*----------------------------------------------------------------*/ 1505 1506 /* 1507 * Mapping functions. 
1508 */ 1509 1510 /* 1511 * Called only while mapping a thin bio to hand it over to the workqueue. 1512 */ 1513 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 1514 { 1515 unsigned long flags; 1516 struct pool *pool = tc->pool; 1517 1518 spin_lock_irqsave(&pool->lock, flags); 1519 bio_list_add(&pool->deferred_bios, bio); 1520 spin_unlock_irqrestore(&pool->lock, flags); 1521 1522 wake_worker(pool); 1523 } 1524 1525 static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) 1526 { 1527 struct pool *pool = tc->pool; 1528 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); 1529 1530 h->tc = tc; 1531 h->shared_read_entry = NULL; 1532 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); 1533 h->overwrite_mapping = NULL; 1534 1535 return h; 1536 } 1537 1538 /* 1539 * Non-blocking function called from the thin target's map function. 1540 */ 1541 static int thin_bio_map(struct dm_target *ti, struct bio *bio, 1542 union map_info *map_context) 1543 { 1544 int r; 1545 struct thin_c *tc = ti->private; 1546 dm_block_t block = get_bio_block(tc, bio); 1547 struct dm_thin_device *td = tc->td; 1548 struct dm_thin_lookup_result result; 1549 1550 map_context->ptr = thin_hook_bio(tc, bio); 1551 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 1552 thin_defer_bio(tc, bio); 1553 return DM_MAPIO_SUBMITTED; 1554 } 1555 1556 r = dm_thin_find_block(td, block, 0, &result); 1557 1558 /* 1559 * Note that we defer readahead too. 1560 */ 1561 switch (r) { 1562 case 0: 1563 if (unlikely(result.shared)) { 1564 /* 1565 * We have a race condition here between the 1566 * result.shared value returned by the lookup and 1567 * snapshot creation, which may cause new 1568 * sharing. 1569 * 1570 * To avoid this always quiesce the origin before 1571 * taking the snap. You want to do this anyway to 1572 * ensure a consistent application view 1573 * (i.e. lockfs). 1574 * 1575 * More distant ancestors are irrelevant. The 1576 * shared flag will be set in their case. 1577 */ 1578 thin_defer_bio(tc, bio); 1579 r = DM_MAPIO_SUBMITTED; 1580 } else { 1581 remap(tc, bio, result.block); 1582 r = DM_MAPIO_REMAPPED; 1583 } 1584 break; 1585 1586 case -ENODATA: 1587 /* 1588 * In future, the failed dm_thin_find_block above could 1589 * provide the hint to load the metadata into cache. 
1590 */ 1591 case -EWOULDBLOCK: 1592 thin_defer_bio(tc, bio); 1593 r = DM_MAPIO_SUBMITTED; 1594 break; 1595 } 1596 1597 return r; 1598 } 1599 1600 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 1601 { 1602 int r; 1603 unsigned long flags; 1604 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 1605 1606 spin_lock_irqsave(&pt->pool->lock, flags); 1607 r = !bio_list_empty(&pt->pool->retry_on_resume_list); 1608 spin_unlock_irqrestore(&pt->pool->lock, flags); 1609 1610 if (!r) { 1611 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 1612 r = bdi_congested(&q->backing_dev_info, bdi_bits); 1613 } 1614 1615 return r; 1616 } 1617 1618 static void __requeue_bios(struct pool *pool) 1619 { 1620 bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list); 1621 bio_list_init(&pool->retry_on_resume_list); 1622 } 1623 1624 /*---------------------------------------------------------------- 1625 * Binding of control targets to a pool object 1626 *--------------------------------------------------------------*/ 1627 static int bind_control_target(struct pool *pool, struct dm_target *ti) 1628 { 1629 struct pool_c *pt = ti->private; 1630 1631 pool->ti = ti; 1632 pool->low_water_blocks = pt->low_water_blocks; 1633 pool->pf = pt->pf; 1634 1635 return 0; 1636 } 1637 1638 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 1639 { 1640 if (pool->ti == ti) 1641 pool->ti = NULL; 1642 } 1643 1644 /*---------------------------------------------------------------- 1645 * Pool creation 1646 *--------------------------------------------------------------*/ 1647 /* Initialize pool features. */ 1648 static void pool_features_init(struct pool_features *pf) 1649 { 1650 pf->zero_new_blocks = 1; 1651 pf->discard_enabled = 1; 1652 pf->discard_passdown = 1; 1653 } 1654 1655 static void __pool_destroy(struct pool *pool) 1656 { 1657 __pool_table_remove(pool); 1658 1659 if (dm_pool_metadata_close(pool->pmd) < 0) 1660 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1661 1662 prison_destroy(pool->prison); 1663 dm_kcopyd_client_destroy(pool->copier); 1664 1665 if (pool->wq) 1666 destroy_workqueue(pool->wq); 1667 1668 if (pool->next_mapping) 1669 mempool_free(pool->next_mapping, pool->mapping_pool); 1670 mempool_destroy(pool->mapping_pool); 1671 mempool_destroy(pool->endio_hook_pool); 1672 kfree(pool); 1673 } 1674 1675 static struct pool *pool_create(struct mapped_device *pool_md, 1676 struct block_device *metadata_dev, 1677 unsigned long block_size, char **error) 1678 { 1679 int r; 1680 void *err_p; 1681 struct pool *pool; 1682 struct dm_pool_metadata *pmd; 1683 1684 pmd = dm_pool_metadata_open(metadata_dev, block_size); 1685 if (IS_ERR(pmd)) { 1686 *error = "Error creating metadata object"; 1687 return (struct pool *)pmd; 1688 } 1689 1690 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 1691 if (!pool) { 1692 *error = "Error allocating memory for pool"; 1693 err_p = ERR_PTR(-ENOMEM); 1694 goto bad_pool; 1695 } 1696 1697 pool->pmd = pmd; 1698 pool->sectors_per_block = block_size; 1699 pool->block_shift = ffs(block_size) - 1; 1700 pool->offset_mask = block_size - 1; 1701 pool->low_water_blocks = 0; 1702 pool_features_init(&pool->pf); 1703 pool->prison = prison_create(PRISON_CELLS); 1704 if (!pool->prison) { 1705 *error = "Error creating pool's bio prison"; 1706 err_p = ERR_PTR(-ENOMEM); 1707 goto bad_prison; 1708 } 1709 1710 pool->copier = dm_kcopyd_client_create(); 1711 if (IS_ERR(pool->copier)) { 1712 r = PTR_ERR(pool->copier); 1713 *error = "Error 
creating pool's kcopyd client"; 1714 err_p = ERR_PTR(r); 1715 goto bad_kcopyd_client; 1716 } 1717 1718 /* 1719 * Create singlethreaded workqueue that will service all devices 1720 * that use this metadata. 1721 */ 1722 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 1723 if (!pool->wq) { 1724 *error = "Error creating pool's workqueue"; 1725 err_p = ERR_PTR(-ENOMEM); 1726 goto bad_wq; 1727 } 1728 1729 INIT_WORK(&pool->worker, do_worker); 1730 INIT_DELAYED_WORK(&pool->waker, do_waker); 1731 spin_lock_init(&pool->lock); 1732 bio_list_init(&pool->deferred_bios); 1733 bio_list_init(&pool->deferred_flush_bios); 1734 INIT_LIST_HEAD(&pool->prepared_mappings); 1735 INIT_LIST_HEAD(&pool->prepared_discards); 1736 pool->low_water_triggered = 0; 1737 pool->no_free_space = 0; 1738 bio_list_init(&pool->retry_on_resume_list); 1739 ds_init(&pool->shared_read_ds); 1740 ds_init(&pool->all_io_ds); 1741 1742 pool->next_mapping = NULL; 1743 pool->mapping_pool = 1744 mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping)); 1745 if (!pool->mapping_pool) { 1746 *error = "Error creating pool's mapping mempool"; 1747 err_p = ERR_PTR(-ENOMEM); 1748 goto bad_mapping_pool; 1749 } 1750 1751 pool->endio_hook_pool = 1752 mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook)); 1753 if (!pool->endio_hook_pool) { 1754 *error = "Error creating pool's endio_hook mempool"; 1755 err_p = ERR_PTR(-ENOMEM); 1756 goto bad_endio_hook_pool; 1757 } 1758 pool->ref_count = 1; 1759 pool->last_commit_jiffies = jiffies; 1760 pool->pool_md = pool_md; 1761 pool->md_dev = metadata_dev; 1762 __pool_table_insert(pool); 1763 1764 return pool; 1765 1766 bad_endio_hook_pool: 1767 mempool_destroy(pool->mapping_pool); 1768 bad_mapping_pool: 1769 destroy_workqueue(pool->wq); 1770 bad_wq: 1771 dm_kcopyd_client_destroy(pool->copier); 1772 bad_kcopyd_client: 1773 prison_destroy(pool->prison); 1774 bad_prison: 1775 kfree(pool); 1776 bad_pool: 1777 if (dm_pool_metadata_close(pmd)) 1778 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 1779 1780 return err_p; 1781 } 1782 1783 static void __pool_inc(struct pool *pool) 1784 { 1785 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1786 pool->ref_count++; 1787 } 1788 1789 static void __pool_dec(struct pool *pool) 1790 { 1791 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 1792 BUG_ON(!pool->ref_count); 1793 if (!--pool->ref_count) 1794 __pool_destroy(pool); 1795 } 1796 1797 static struct pool *__pool_find(struct mapped_device *pool_md, 1798 struct block_device *metadata_dev, 1799 unsigned long block_size, char **error, 1800 int *created) 1801 { 1802 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1803 1804 if (pool) { 1805 if (pool->pool_md != pool_md) 1806 return ERR_PTR(-EBUSY); 1807 __pool_inc(pool); 1808 1809 } else { 1810 pool = __pool_table_lookup(pool_md); 1811 if (pool) { 1812 if (pool->md_dev != metadata_dev) 1813 return ERR_PTR(-EINVAL); 1814 __pool_inc(pool); 1815 1816 } else { 1817 pool = pool_create(pool_md, metadata_dev, block_size, error); 1818 *created = 1; 1819 } 1820 } 1821 1822 return pool; 1823 } 1824 1825 /*---------------------------------------------------------------- 1826 * Pool target methods 1827 *--------------------------------------------------------------*/ 1828 static void pool_dtr(struct dm_target *ti) 1829 { 1830 struct pool_c *pt = ti->private; 1831 1832 mutex_lock(&dm_thin_pool_table.mutex); 1833 1834 unbind_control_target(pt->pool, ti); 1835 __pool_dec(pt->pool); 1836 
dm_put_device(ti, pt->metadata_dev); 1837 dm_put_device(ti, pt->data_dev); 1838 kfree(pt); 1839 1840 mutex_unlock(&dm_thin_pool_table.mutex); 1841 } 1842 1843 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1844 struct dm_target *ti) 1845 { 1846 int r; 1847 unsigned argc; 1848 const char *arg_name; 1849 1850 static struct dm_arg _args[] = { 1851 {0, 3, "Invalid number of pool feature arguments"}, 1852 }; 1853 1854 /* 1855 * No feature arguments supplied. 1856 */ 1857 if (!as->argc) 1858 return 0; 1859 1860 r = dm_read_arg_group(_args, as, &argc, &ti->error); 1861 if (r) 1862 return -EINVAL; 1863 1864 while (argc && !r) { 1865 arg_name = dm_shift_arg(as); 1866 argc--; 1867 1868 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1869 pf->zero_new_blocks = 0; 1870 continue; 1871 } else if (!strcasecmp(arg_name, "ignore_discard")) { 1872 pf->discard_enabled = 0; 1873 continue; 1874 } else if (!strcasecmp(arg_name, "no_discard_passdown")) { 1875 pf->discard_passdown = 0; 1876 continue; 1877 } 1878 1879 ti->error = "Unrecognised pool feature requested"; 1880 r = -EINVAL; 1881 } 1882 1883 return r; 1884 } 1885 1886 /* 1887 * thin-pool <metadata dev> <data dev> 1888 * <data block size (sectors)> 1889 * <low water mark (blocks)> 1890 * [<#feature args> [<arg>]*] 1891 * 1892 * Optional feature arguments are: 1893 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1894 * ignore_discard: disable discard 1895 * no_discard_passdown: don't pass discards down to the data device 1896 */ 1897 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1898 { 1899 int r, pool_created = 0; 1900 struct pool_c *pt; 1901 struct pool *pool; 1902 struct pool_features pf; 1903 struct dm_arg_set as; 1904 struct dm_dev *data_dev; 1905 unsigned long block_size; 1906 dm_block_t low_water_blocks; 1907 struct dm_dev *metadata_dev; 1908 sector_t metadata_dev_size; 1909 char b[BDEVNAME_SIZE]; 1910 1911 /* 1912 * FIXME Remove validation from scope of lock. 1913 */ 1914 mutex_lock(&dm_thin_pool_table.mutex); 1915 1916 if (argc < 4) { 1917 ti->error = "Invalid argument count"; 1918 r = -EINVAL; 1919 goto out_unlock; 1920 } 1921 as.argc = argc; 1922 as.argv = argv; 1923 1924 r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); 1925 if (r) { 1926 ti->error = "Error opening metadata block device"; 1927 goto out_unlock; 1928 } 1929 1930 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1931 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 1932 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 1933 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); 1934 1935 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1936 if (r) { 1937 ti->error = "Error getting data device"; 1938 goto out_metadata; 1939 } 1940 1941 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 1942 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 1943 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 1944 !is_power_of_2(block_size)) { 1945 ti->error = "Invalid block size"; 1946 r = -EINVAL; 1947 goto out; 1948 } 1949 1950 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 1951 ti->error = "Invalid low water mark"; 1952 r = -EINVAL; 1953 goto out; 1954 } 1955 1956 /* 1957 * Set default pool features. 
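 *
 * For reference, a pool table line matching the format documented above
 * might look like this (the device names are made up for illustration):
 *
 *	dmsetup create pool --table \
 *		"0 20971520 thin-pool /dev/mapper/tp-meta /dev/mapper/tp-data \
 *		 128 32768 1 skip_block_zeroing"
 *
 * i.e. a 10GiB pool target using 64KiB (128 sector) data blocks, a low
 * water mark of 32768 blocks and block zeroing disabled.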
1958 */ 1959 pool_features_init(&pf); 1960 1961 dm_consume_args(&as, 4); 1962 r = parse_pool_features(&as, &pf, ti); 1963 if (r) 1964 goto out; 1965 1966 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 1967 if (!pt) { 1968 r = -ENOMEM; 1969 goto out; 1970 } 1971 1972 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1973 block_size, &ti->error, &pool_created); 1974 if (IS_ERR(pool)) { 1975 r = PTR_ERR(pool); 1976 goto out_free_pt; 1977 } 1978 1979 /* 1980 * 'pool_created' reflects whether this is the first table load. 1981 * Top level discard support is not allowed to be changed after 1982 * initial load. This would require a pool reload to trigger thin 1983 * device changes. 1984 */ 1985 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 1986 ti->error = "Discard support cannot be disabled once enabled"; 1987 r = -EINVAL; 1988 goto out_flags_changed; 1989 } 1990 1991 /* 1992 * If discard_passdown was enabled verify that the data device 1993 * supports discards. Disable discard_passdown if not; otherwise 1994 * -EOPNOTSUPP will be returned. 1995 */ 1996 if (pf.discard_passdown) { 1997 struct request_queue *q = bdev_get_queue(data_dev->bdev); 1998 if (!q || !blk_queue_discard(q)) { 1999 DMWARN("Discard unsupported by data device: Disabling discard passdown."); 2000 pf.discard_passdown = 0; 2001 } 2002 } 2003 2004 pt->pool = pool; 2005 pt->ti = ti; 2006 pt->metadata_dev = metadata_dev; 2007 pt->data_dev = data_dev; 2008 pt->low_water_blocks = low_water_blocks; 2009 pt->pf = pf; 2010 ti->num_flush_requests = 1; 2011 /* 2012 * Only need to enable discards if the pool should pass 2013 * them down to the data device. The thin device's discard 2014 * processing will cause mappings to be removed from the btree. 2015 */ 2016 if (pf.discard_enabled && pf.discard_passdown) { 2017 ti->num_discard_requests = 1; 2018 /* 2019 * Setting 'discards_supported' circumvents the normal 2020 * stacking of discard limits (this keeps the pool and 2021 * thin devices' discard limits consistent). 2022 */ 2023 ti->discards_supported = 1; 2024 } 2025 ti->private = pt; 2026 2027 pt->callbacks.congested_fn = pool_is_congested; 2028 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2029 2030 mutex_unlock(&dm_thin_pool_table.mutex); 2031 2032 return 0; 2033 2034 out_flags_changed: 2035 __pool_dec(pool); 2036 out_free_pt: 2037 kfree(pt); 2038 out: 2039 dm_put_device(ti, data_dev); 2040 out_metadata: 2041 dm_put_device(ti, metadata_dev); 2042 out_unlock: 2043 mutex_unlock(&dm_thin_pool_table.mutex); 2044 2045 return r; 2046 } 2047 2048 static int pool_map(struct dm_target *ti, struct bio *bio, 2049 union map_info *map_context) 2050 { 2051 int r; 2052 struct pool_c *pt = ti->private; 2053 struct pool *pool = pt->pool; 2054 unsigned long flags; 2055 2056 /* 2057 * As this is a singleton target, ti->begin is always zero. 2058 */ 2059 spin_lock_irqsave(&pool->lock, flags); 2060 bio->bi_bdev = pt->data_dev->bdev; 2061 r = DM_MAPIO_REMAPPED; 2062 spin_unlock_irqrestore(&pool->lock, flags); 2063 2064 return r; 2065 } 2066 2067 /* 2068 * Retrieves the number of blocks of the data device from 2069 * the superblock and compares it to the actual device size, 2070 * thus resizing the data device in case it has grown. 2071 * 2072 * This both copes with opening preallocated data devices in the ctr 2073 * being followed by a resume 2074 * -and- 2075 * calling the resume method individually after userspace has 2076 * grown the data device in reaction to a table event. 
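 *
 * A typical userspace sequence after growing the data device (illustrative
 * only) would be:
 *
 *	dmsetup suspend pool
 *	dmsetup reload pool --table "0 <new length> thin-pool ... (as before)"
 *	dmsetup resume pool
 *
 * after which data_size below exceeds sb_data_size and the metadata is
 * resized via dm_pool_resize_data_dev().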
/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
static int pool_preresume(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	dm_block_t data_size, sb_data_size;

	/*
	 * Take control of the pool object.
	 */
	r = bind_control_target(pool, ti);
	if (r)
		return r;

	data_size = ti->len >> pool->block_shift;
	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
	if (r) {
		DMERR("failed to retrieve data device size");
		return r;
	}

	if (data_size < sb_data_size) {
		DMERR("pool target too small, is %llu blocks (expected %llu)",
		      data_size, sb_data_size);
		return -EINVAL;

	} else if (data_size > sb_data_size) {
		r = dm_pool_resize_data_dev(pool->pmd, data_size);
		if (r) {
			DMERR("failed to resize data device");
			return r;
		}

		r = dm_pool_commit_metadata(pool->pmd);
		if (r) {
			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
			      __func__, r);
			return r;
		}
	}

	return 0;
}

static void pool_resume(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	__requeue_bios(pool);
	spin_unlock_irqrestore(&pool->lock, flags);

	do_waker(&pool->waker.work);
}

static void pool_postsuspend(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	cancel_delayed_work(&pool->waker);
	flush_workqueue(pool->wq);

	r = dm_pool_commit_metadata(pool->pmd);
	if (r < 0) {
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		      __func__, r);
		/* FIXME: invalidate device? error the next FUA or FLUSH bio? */
	}
}

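/*
 * The helpers below implement the pool target's message interface.
 * Illustrative usage from userspace (device name and ids are hypothetical):
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 *   dmsetup message /dev/mapper/pool 0 "set_transaction_id 0 1"
 */
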
static int check_arg_count(unsigned argc, unsigned args_required)
{
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);
		return -EINVAL;
	}

	return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	    *dev_id <= MAX_DEV_ID)
		return 0;

	if (warning)
		DMWARN("Message received with invalid device id: %s", arg);

	return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_thin(pool->pmd, dev_id);
	if (r) {
		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
		       argv[1]);
		return r;
	}

	return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	dm_thin_id origin_dev_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = read_dev_id(argv[2], &origin_dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
	if (r) {
		DMWARN("Creation of new snapshot %s of device %s failed.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	if (r)
		DMWARN("Deletion of thin device %s failed.", argv[1]);

	return r;
}

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id old_id, new_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
		return -EINVAL;
	}

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
		return -EINVAL;
	}

	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
	if (r) {
		DMWARN("Failed to change transaction id from %s to %s.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 */
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

	else
		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	if (!r) {
		r = dm_pool_commit_metadata(pool->pmd);
		if (r)
			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
			      argv[0], r);
	}

	return r;
}

/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 */
static int pool_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r, count;
	unsigned sz = 0;
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	switch (type) {
	case STATUSTYPE_INFO:
		r = dm_pool_get_metadata_transaction_id(pool->pmd,
							&transaction_id);
		if (r)
			return r;

		r = dm_pool_get_free_metadata_block_count(pool->pmd,
							  &nr_free_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd,
						 &nr_free_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
		if (r)
			return r;

		DMEMIT("%llu %llu/%llu %llu/%llu ",
		       (unsigned long long)transaction_id,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
		       (unsigned long long)nr_blocks_data);

		if (held_root)
			DMEMIT("%llu", held_root);
		else
			DMEMIT("-");

		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s %lu %llu ",
		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
		       (unsigned long)pool->sectors_per_block,
		       (unsigned long long)pt->low_water_blocks);

		count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
			!pool->pf.discard_passdown;
		DMEMIT("%u ", count);

		if (!pool->pf.zero_new_blocks)
			DMEMIT("skip_block_zeroing ");

		if (!pool->pf.discard_enabled)
			DMEMIT("ignore_discard ");

		if (!pool->pf.discard_passdown)
			DMEMIT("no_discard_passdown ");

		break;
	}

	return 0;
}

static int pool_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct pool_c *pt = ti->private;

	return fn(ti, pt->data_dev, 0, ti->len, data);
}

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
		      struct bio_vec *biovec, int max_size)
{
	struct pool_c *pt = ti->private;
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = pt->data_dev->bdev;

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

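/*
 * For example (assuming a 64KiB data block size, i.e. 128 sectors per
 * block), set_discard_limits() below advertises max_discard_sectors of 128
 * and a discard_granularity of 65536 bytes.
 */
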
static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the pool's data device
	 */
	limits->max_discard_sectors = pool->sectors_per_block;

	/*
	 * This is just a hint, and not enforced.  We have to cope with
	 * bios that overlap 2 blocks.
	 */
	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
}

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
	if (pool->pf.discard_enabled)
		set_discard_limits(pool, limits);
}

static struct target_type pool_target = {
	.name = "thin-pool",
	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
		    DM_TARGET_IMMUTABLE,
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = pool_ctr,
	.dtr = pool_dtr,
	.map = pool_map,
	.postsuspend = pool_postsuspend,
	.preresume = pool_preresume,
	.resume = pool_resume,
	.message = pool_message,
	.status = pool_status,
	.merge = pool_merge,
	.iterate_devices = pool_iterate_devices,
	.io_hints = pool_io_hints,
};

/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
static void thin_dtr(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	__pool_dec(tc->pool);
	dm_pool_close_thin_device(tc->td);
	dm_put_device(ti, tc->pool_dev);
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
	kfree(tc);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

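/*
 * Illustrative creation of a thin device (names, sizes and ids are
 * hypothetical): after "dmsetup message /dev/mapper/pool 0 'create_thin 0'"
 * has provisioned internal device id 0, a 1GiB (2097152-sector) thin
 * volume can be activated with:
 *
 *   dmsetup create thin1 --table "0 2097152 thin /dev/mapper/pool 0"
 *
 * The parameters are described below.
 */
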
/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct thin_c *tc;
	struct dm_dev *pool_dev, *origin_dev;
	struct mapped_device *pool_md;

	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc != 2 && argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}

	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
	if (!tc) {
		ti->error = "Out of memory";
		r = -ENOMEM;
		goto out_unlock;
	}

	if (argc == 3) {
		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
		if (r) {
			ti->error = "Error opening origin device";
			goto bad_origin_dev;
		}
		tc->origin_dev = origin_dev;
	}

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
	if (r) {
		ti->error = "Error opening pool device";
		goto bad_pool_dev;
	}
	tc->pool_dev = pool_dev;

	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
		ti->error = "Invalid device id";
		r = -EINVAL;
		goto bad_common;
	}

	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
	if (!pool_md) {
		ti->error = "Couldn't get pool mapped device";
		r = -EINVAL;
		goto bad_common;
	}

	tc->pool = __pool_table_lookup(pool_md);
	if (!tc->pool) {
		ti->error = "Couldn't find pool object";
		r = -EINVAL;
		goto bad_pool_lookup;
	}
	__pool_inc(tc->pool);

	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
	if (r) {
		ti->error = "Couldn't open thin internal device";
		goto bad_thin_open;
	}

	ti->split_io = tc->pool->sectors_per_block;
	ti->num_flush_requests = 1;

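	/*
	 * The split_io value set just above makes the device-mapper core
	 * split incoming bios at pool block boundaries, so each bio this
	 * target maps falls entirely within a single data block.
	 */
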
	/* In case the pool supports discards, pass them on. */
	if (tc->pool->pf.discard_enabled) {
		ti->discards_supported = 1;
		ti->num_discard_requests = 1;
	}

	dm_put(pool_md);

	mutex_unlock(&dm_thin_pool_table.mutex);

	return 0;

bad_thin_open:
	__pool_dec(tc->pool);
bad_pool_lookup:
	dm_put(pool_md);
bad_common:
	dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
bad_origin_dev:
	kfree(tc);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}

static int thin_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);

	return thin_bio_map(ti, bio, map_context);
}

static int thin_endio(struct dm_target *ti,
		      struct bio *bio, int err,
		      union map_info *map_context)
{
	unsigned long flags;
	struct endio_hook *h = map_context->ptr;
	struct list_head work;
	struct new_mapping *m, *tmp;
	struct pool *pool = h->tc->pool;

	if (h->shared_read_entry) {
		INIT_LIST_HEAD(&work);
		ds_dec(h->shared_read_entry, &work);

		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list) {
			list_del(&m->list);
			m->quiesced = 1;
			__maybe_add_mapping(m);
		}
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	if (h->all_io_entry) {
		INIT_LIST_HEAD(&work);
		ds_dec(h->all_io_entry, &work);
		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list)
			list_add(&m->list, &pool->prepared_discards);
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	mempool_free(h, pool->endio_hook_pool);

	return 0;
}

static void thin_postsuspend(struct dm_target *ti)
{
	if (dm_noflush_suspending(ti))
		requeue_io((struct thin_c *)ti->private);
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
static int thin_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r)
				return r;

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0)
				return r;

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			if (tc->origin_dev)
				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
			break;
		}
	}

	return 0;
}

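/*
 * Illustrative STATUSTYPE_INFO output from thin_status() above (values are
 * hypothetical, assuming 128 sectors per block): a device with 8192 mapped
 * blocks whose highest mapped block is 16383 reports "1048576 2097151"; a
 * device with no mappings reports "0 -".
 */
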
static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	dm_block_t blocks;
	struct thin_c *tc = ti->private;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
	 * we follow a more convoluted path through to the pool's target.
	 */
	if (!tc->pool->ti)
		return 0;	/* nothing is bound */

	blocks = tc->pool->ti->len >> tc->pool->block_shift;
	if (blocks)
		return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);

	return 0;
}

static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
	set_discard_limits(pool, limits);
}

static struct target_type thin_target = {
	.name = "thin",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = thin_ctr,
	.dtr = thin_dtr,
	.map = thin_map,
	.end_io = thin_endio,
	.postsuspend = thin_postsuspend,
	.status = thin_status,
	.iterate_devices = thin_iterate_devices,
	.io_hints = thin_io_hints,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
	int r;

	pool_table_init();

	r = dm_register_target(&thin_target);
	if (r)
		return r;

	r = dm_register_target(&pool_target);
	if (r)
		dm_unregister_target(&thin_target);

	return r;
}

static void dm_thin_exit(void)
{
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");