/*
 * Copyright (C) 2011 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 10240
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * The metadata device is currently limited in size.  The limitation is
 * checked lower down in dm-space-map-metadata, but we also check it here
 * so we can fail early.
 *
 * We have one block of index, which can hold 255 index entries.  Each
 * index entry contains allocation info about 16k metadata blocks.
 */
#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mappings).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.
 * As it would after the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.  I suspect
 * data blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Sometimes we can't deal with a bio straight away.  We put them in prison
 * where they can't cause any mischief.  Bios are put in a cell identified
 * by a key, multiple bios can be in the same cell.  When the cell is
 * subsequently unlocked the bios become available.
 */
struct bio_prison;

struct cell_key {
	int virtual;
	dm_thin_id dev;
	dm_block_t block;
};

struct cell {
	struct hlist_node list;
	struct bio_prison *prison;
	struct cell_key key;
	unsigned count;
	struct bio_list bios;
};

struct bio_prison {
	spinlock_t lock;
	mempool_t *cell_pool;

	unsigned nr_buckets;
	unsigned hash_mask;
	struct hlist_head *cells;
};

static uint32_t calc_nr_buckets(unsigned nr_cells)
{
	uint32_t n = 128;

	nr_cells /= 4;
	nr_cells = min(nr_cells, 8192u);

	while (n < nr_cells)
		n <<= 1;

	return n;
}

/*
 * @nr_cells should be the number of cells you want in use _concurrently_.
 * Don't confuse it with the number of distinct keys.
 */
static struct bio_prison *prison_create(unsigned nr_cells)
{
	unsigned i;
	uint32_t nr_buckets = calc_nr_buckets(nr_cells);
	size_t len = sizeof(struct bio_prison) +
		(sizeof(struct hlist_head) * nr_buckets);
	struct bio_prison *prison = kmalloc(len, GFP_KERNEL);

	if (!prison)
		return NULL;

	spin_lock_init(&prison->lock);
	prison->cell_pool = mempool_create_kmalloc_pool(nr_cells,
							sizeof(struct cell));
	if (!prison->cell_pool) {
		kfree(prison);
		return NULL;
	}

	prison->nr_buckets = nr_buckets;
	prison->hash_mask = nr_buckets - 1;
	prison->cells = (struct hlist_head *) (prison + 1);
	for (i = 0; i < nr_buckets; i++)
		INIT_HLIST_HEAD(prison->cells + i);

	return prison;
}

static void prison_destroy(struct bio_prison *prison)
{
	mempool_destroy(prison->cell_pool);
	kfree(prison);
}

static uint32_t hash_key(struct bio_prison *prison, struct cell_key *key)
{
	const unsigned long BIG_PRIME = 4294967291UL;
	uint64_t hash = key->block * BIG_PRIME;

	return (uint32_t) (hash & prison->hash_mask);
}

static int keys_equal(struct cell_key *lhs, struct cell_key *rhs)
{
	return (lhs->virtual == rhs->virtual) &&
		(lhs->dev == rhs->dev) &&
		(lhs->block == rhs->block);
}

static struct cell *__search_bucket(struct hlist_head *bucket,
				    struct cell_key *key)
{
	struct cell *cell;
	struct hlist_node *tmp;

	hlist_for_each_entry(cell, tmp, bucket, list)
		if (keys_equal(&cell->key, key))
			return cell;

	return NULL;
}

/*
 * This may block if a new cell needs allocating.  You must ensure that
 * cells will be unlocked even if the calling thread is blocked.
 *
 * Returns the number of entries in the cell prior to the new addition
 * or < 0 on failure.
 */
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
		      struct bio *inmate, struct cell **ref)
{
	int r;
	unsigned long flags;
	uint32_t hash = hash_key(prison, key);
	struct cell *uninitialized_var(cell), *cell2 = NULL;

	BUG_ON(hash > prison->nr_buckets);

	spin_lock_irqsave(&prison->lock, flags);
	cell = __search_bucket(prison->cells + hash, key);

	if (!cell) {
		/*
		 * Allocate a new cell
		 */
		spin_unlock_irqrestore(&prison->lock, flags);
		cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
		spin_lock_irqsave(&prison->lock, flags);

		/*
		 * We've been unlocked, so we have to double check that
		 * nobody else has inserted this cell in the meantime.
		 */
		cell = __search_bucket(prison->cells + hash, key);

		if (!cell) {
			cell = cell2;
			cell2 = NULL;

			cell->prison = prison;
			memcpy(&cell->key, key, sizeof(cell->key));
			cell->count = 0;
			bio_list_init(&cell->bios);
			hlist_add_head(&cell->list, prison->cells + hash);
		}
	}

	r = cell->count++;
	bio_list_add(&cell->bios, inmate);
	spin_unlock_irqrestore(&prison->lock, flags);

	if (cell2)
		mempool_free(cell2, prison->cell_pool);

	*ref = cell;

	return r;
}

/*
 * @inmates must have been initialised prior to this call
 */
static void __cell_release(struct cell *cell, struct bio_list *inmates)
{
	struct bio_prison *prison = cell->prison;

	hlist_del(&cell->list);

	if (inmates)
		bio_list_merge(inmates, &cell->bios);

	mempool_free(cell, prison->cell_pool);
}

static void cell_release(struct cell *cell, struct bio_list *bios)
{
	unsigned long flags;
	struct bio_prison *prison = cell->prison;

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, bios);
	spin_unlock_irqrestore(&prison->lock, flags);
}

/*
 * There are a couple of places where we put a bio into a cell briefly
 * before taking it out again.  In these situations we know that no other
 * bio may be in the cell.  This function releases the cell, and also does
 * a sanity check.
 */
static void cell_release_singleton(struct cell *cell, struct bio *bio)
{
	struct bio_prison *prison = cell->prison;
	struct bio_list bios;
	struct bio *b;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, &bios);
	spin_unlock_irqrestore(&prison->lock, flags);

	b = bio_list_pop(&bios);
	BUG_ON(b != bio);
	BUG_ON(!bio_list_empty(&bios));
}

static void cell_error(struct cell *cell)
{
	struct bio_prison *prison = cell->prison;
	struct bio_list bios;
	struct bio *bio;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&prison->lock, flags);
	__cell_release(cell, &bios);
	spin_unlock_irqrestore(&prison->lock, flags);

	while ((bio = bio_list_pop(&bios)))
		bio_io_error(bio);
}

/*----------------------------------------------------------------*/

/*
 * We use the deferred set to keep track of pending reads to shared blocks.
 * We do this to ensure the new mapping caused by a write isn't performed
 * until these prior reads have completed.
 * Otherwise the insertion of the new mapping could free the old block
 * that the read bios are mapped to.
 */

struct deferred_set;
struct deferred_entry {
	struct deferred_set *ds;
	unsigned count;
	struct list_head work_items;
};

struct deferred_set {
	spinlock_t lock;
	unsigned current_entry;
	unsigned sweeper;
	struct deferred_entry entries[DEFERRED_SET_SIZE];
};

static void ds_init(struct deferred_set *ds)
{
	int i;

	spin_lock_init(&ds->lock);
	ds->current_entry = 0;
	ds->sweeper = 0;
	for (i = 0; i < DEFERRED_SET_SIZE; i++) {
		ds->entries[i].ds = ds;
		ds->entries[i].count = 0;
		INIT_LIST_HEAD(&ds->entries[i].work_items);
	}
}

static struct deferred_entry *ds_inc(struct deferred_set *ds)
{
	unsigned long flags;
	struct deferred_entry *entry;

	spin_lock_irqsave(&ds->lock, flags);
	entry = ds->entries + ds->current_entry;
	entry->count++;
	spin_unlock_irqrestore(&ds->lock, flags);

	return entry;
}

static unsigned ds_next(unsigned index)
{
	return (index + 1) % DEFERRED_SET_SIZE;
}

static void __sweep(struct deferred_set *ds, struct list_head *head)
{
	while ((ds->sweeper != ds->current_entry) &&
	       !ds->entries[ds->sweeper].count) {
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
		ds->sweeper = ds_next(ds->sweeper);
	}

	if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count)
		list_splice_init(&ds->entries[ds->sweeper].work_items, head);
}

static void ds_dec(struct deferred_entry *entry, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&entry->ds->lock, flags);
	BUG_ON(!entry->count);
	--entry->count;
	__sweep(entry->ds, head);
	spin_unlock_irqrestore(&entry->ds->lock, flags);
}

/*
 * Returns 1 if the work was deferred, or 0 if there were no pending
 * items to delay the job.
 */
static int ds_add_work(struct deferred_set *ds, struct list_head *work)
{
	int r = 1;
	unsigned long flags;
	unsigned next_entry;

	spin_lock_irqsave(&ds->lock, flags);
	if ((ds->sweeper == ds->current_entry) &&
	    !ds->entries[ds->current_entry].count)
		r = 0;
	else {
		list_add(work, &ds->entries[ds->current_entry].work_items);
		next_entry = ds_next(ds->current_entry);
		if (!ds->entries[next_entry].count)
			ds->current_entry = next_entry;
	}
	spin_unlock_irqrestore(&ds->lock, flags);

	return r;
}

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct new_mapping;
struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	uint32_t sectors_per_block;
	unsigned block_shift;
	dm_block_t offset_mask;
	dm_block_t low_water_blocks;

	unsigned zero_new_blocks:1;
	unsigned low_water_triggered:1;	/* A dm event has been sent */
	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

	struct bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;

	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;

	struct bio_list retry_on_resume_list;

	struct deferred_set ds;	/* FIXME: move to thin_c */

	struct new_mapping *next_mapping;
	mempool_t *mapping_pool;
	mempool_t *endio_hook_pool;
};

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	unsigned zero_new_blocks:1;
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct dm_dev *pool_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, master);
	bio_list_init(master);

	while ((bio = bio_list_pop(&bios))) {
		if (dm_get_mapinfo(bio)->ptr == tc)
			bio_endio(bio, DM_ENDIO_REQUEUE);
		else
			bio_list_add(master, bio);
	}
}

static void requeue_io(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	__requeue_bio_list(tc, &pool->deferred_bios);
	__requeue_bio_list(tc, &pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	return bio->bi_sector >> tc->pool->block_shift;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;

	bio->bi_bdev = tc->pool_dev->bdev;
	bio->bi_sector = (block << pool->block_shift) +
		(bio->bi_sector & pool->offset_mask);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	remap(tc, bio, block);

	/*
	 * Batch together any FUA/FLUSH bios we find and then issue
	 * a single commit for them in process_deferred_bios().
	 */
	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		spin_lock_irqsave(&pool->lock, flags);
		bio_list_add(&pool->deferred_flush_bios, bio);
		spin_unlock_irqrestore(&pool->lock, flags);
	} else
		generic_make_request(bio);
}

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct endio_hook {
	struct thin_c *tc;
	bio_end_io_t *saved_bi_end_io;
	struct deferred_entry *entry;
};

struct new_mapping {
	struct list_head list;

	int prepared;

	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct cell *cell;
	int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying.  Instead this bio is hooked.  The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __maybe_add_mapping(struct new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (list_empty(&m->list) && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void shared_read_endio(struct bio *bio, int err)
{
	struct list_head mappings;
	struct new_mapping *m, *tmp;
	struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
	unsigned long flags;
	struct pool *pool = h->tc->pool;

	bio->bi_end_io = h->saved_bi_end_io;
	bio_endio(bio, err);

	INIT_LIST_HEAD(&mappings);
	ds_dec(h->entry, &mappings);

	spin_lock_irqsave(&pool->lock, flags);
	list_for_each_entry_safe(m, tmp, &mappings, list) {
		list_del(&m->list);
		INIT_LIST_HEAD(&m->list);
		__maybe_add_mapping(m);
	}
	spin_unlock_irqrestore(&pool->lock, flags);

	mempool_free(h, pool->endio_hook_pool);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct cell *cell,
		       dm_block_t data_block)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	cell_release(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&tc->pool->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits one particular detainee,
 * a write bio that covers the block and has already been processed.
 */
static void cell_defer_except(struct thin_c *tc, struct cell *cell,
			      struct bio *exception)
{
	struct bio_list bios;
	struct bio *bio;
	struct pool *pool = tc->pool;
	unsigned long flags;

	bio_list_init(&bios);
	cell_release(cell, &bios);

	spin_lock_irqsave(&pool->lock, flags);
	while ((bio = bio_list_pop(&bios)))
		if (bio != exception)
			bio_list_add(&pool->deferred_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void process_prepared_mapping(struct new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio)
		bio->bi_end_io = m->saved_bi_end_io;

	if (m->err) {
		cell_error(m->cell);
		return;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		DMERR("dm_thin_insert_block() failed");
		cell_error(m->cell);
		return;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_except(tc, m->cell, bio);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell, m->data_block);

	list_del(&m->list);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_mappings(struct pool *pool)
{
	unsigned long flags;
	struct list_head maps;
	struct new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(&pool->prepared_mappings, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		process_prepared_mapping(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return ((bio_data_dir(bio) == WRITE) &&
		!(bio->bi_sector & pool->offset_mask)) &&
		(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct new_mapping *get_next_mapping(struct pool *pool)
{
	struct new_mapping *r = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	pool->next_mapping = NULL;

	return r;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_origin, dm_block_t data_dest,
			  struct cell *cell, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	struct new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	ds_add_work(&pool->ds, &m->list);

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately.  Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio)) {
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		dm_get_mapinfo(bio)->ptr = m;
		remap_and_issue(tc, bio, data_dest);
	} else {
		struct dm_io_region from, to;

		from.bdev = tc->pool_dev->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_copy() failed");
			cell_error(cell);
		}
	}
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio)) {
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		dm_get_mapinfo(bio)->ptr = m;
		remap_and_issue(tc, bio, data_block);

	} else {
		int r;
		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_zero() failed");
			cell_error(cell);
		}
	}
}

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	unsigned long flags;
	struct pool *pool = tc->pool;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r)
		return r;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark, sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = 1;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}

	if (!free_blocks) {
		if (pool->no_free_space)
			return -ENOSPC;
		else {
			/*
			 * Try to commit to see if that will free up some
			 * more space.
			 */
			r = dm_pool_commit_metadata(pool->pmd);
			if (r) {
				DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
				      __func__, r);
				return r;
			}

			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
			if (r)
				return r;

			/*
			 * If we still have no space we set a flag to avoid
			 * doing all this checking and return -ENOSPC.
			 */
			if (!free_blocks) {
				DMWARN("%s: no free space available.",
				       dm_device_name(pool->pool_md));
				spin_lock_irqsave(&pool->lock, flags);
				pool->no_free_space = 1;
				spin_unlock_irqrestore(&pool->lock, flags);
				return -ENOSPC;
			}
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r)
		return r;

	return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->retry_on_resume_list, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void no_space(struct cell *cell)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		retry_on_resume(bio);
}

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
			  struct cell_key *key,
			  struct dm_thin_lookup_result *lookup_result,
			  struct cell *cell)
{
	int r;
	dm_block_t data_block;

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		schedule_copy(tc, block, lookup_result->block,
			      data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		cell_error(cell);
		break;
	}
}

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
			       dm_block_t block,
			       struct dm_thin_lookup_result *lookup_result)
{
	struct cell *cell;
	struct pool *pool = tc->pool;
	struct cell_key key;

	/*
	 * If cell is already occupied, then sharing is already in the process
	 * of being broken so we have nothing further to do here.
	 */
	build_data_key(tc->td, lookup_result->block, &key);
	if (bio_detain(pool->prison, &key, bio, &cell))
		return;

	if (bio_data_dir(bio) == WRITE)
		break_sharing(tc, bio, block, &key, lookup_result, cell);
	else {
		struct endio_hook *h;
		h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);

		h->tc = tc;
		h->entry = ds_inc(&pool->ds);
		save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
		dm_get_mapinfo(bio)->ptr = h;

		cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, lookup_result->block);
	}
}

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
			    struct cell *cell)
{
	int r;
	dm_block_t data_block;

	/*
	 * Remap empty bios (flushes) immediately, without provisioning.
	 */
	if (!bio->bi_size) {
		cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, 0);
		return;
	}

	/*
	 * Fill read bios with zeroes and complete them immediately.
	 */
	if (bio_data_dir(bio) == READ) {
		zero_fill_bio(bio);
		cell_release_singleton(cell, bio);
		bio_endio(bio, 0);
		return;
	}

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		schedule_zero(tc, block, data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		cell_error(cell);
		break;
	}
}

static void process_bio(struct thin_c *tc, struct bio *bio)
{
	int r;
	dm_block_t block = get_bio_block(tc, bio);
	struct cell *cell;
	struct cell_key key;
	struct dm_thin_lookup_result lookup_result;

	/*
	 * If cell is already occupied, then the block is already
	 * being provisioned so we have nothing further to do here.
	 */
	build_virtual_key(tc->td, block, &key);
	if (bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * We can release this cell now.  This thread is the only
		 * one that puts bios into a cell, and we know there were
		 * no preceding bios.
		 */
		/*
		 * TODO: this will probably have to change when discard goes
		 * back in.
		 */
		cell_release_singleton(cell, bio);

		if (lookup_result.shared)
			process_shared_bio(tc, bio, block, &lookup_result);
		else
			remap_and_issue(tc, bio, lookup_result.block);
		break;

	case -ENODATA:
		provision_block(tc, bio, block, cell);
		break;

	default:
		DMERR("dm_thin_find_block() failed, error = %d", r);
		bio_io_error(bio);
		break;
	}
}

static void process_deferred_bios(struct pool *pool)
{
	unsigned long flags;
	struct bio *bio;
	struct bio_list bios;
	int r;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_bios);
	bio_list_init(&pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	while ((bio = bio_list_pop(&bios))) {
		struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
		/*
		 * If we've got no free new_mapping structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (ensure_next_mapping(pool)) {
			spin_lock_irqsave(&pool->lock, flags);
			bio_list_merge(&pool->deferred_bios, &bios);
			spin_unlock_irqrestore(&pool->lock, flags);

			break;
		}
		process_bio(tc, bio);
	}

	/*
	 * If there are any deferred flush bios, we must commit
	 * the metadata before issuing them.
	 */
	bio_list_init(&bios);
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_flush_bios);
	bio_list_init(&pool->deferred_flush_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	if (bio_list_empty(&bios))
		return;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r) {
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		      __func__, r);
		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);
		return;
	}

	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void do_worker(struct work_struct *ws)
{
	struct pool *pool = container_of(ws, struct pool, worker);

	process_prepared_mappings(pool);
	process_deferred_bios(pool);
}

/*----------------------------------------------------------------*/

/*
 * Mapping functions.
 */

/*
 * Called only while mapping a thin bio to hand it over to the workqueue.
 */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
	unsigned long flags;
	struct pool *pool = tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

/*
 * Non-blocking function called from the thin target's map function.
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	int r;
	struct thin_c *tc = ti->private;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_device *td = tc->td;
	struct dm_thin_lookup_result result;

	/*
	 * Save the thin context for easy access from the deferred bio later.
	 */
	map_context->ptr = tc;

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
	switch (r) {
	case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap.  You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant.  The
			 * shared flag will be set in their case.
			 */
			thin_defer_bio(tc, bio);
			r = DM_MAPIO_SUBMITTED;
		} else {
			remap(tc, bio, result.block);
			r = DM_MAPIO_REMAPPED;
		}
		break;

	case -ENODATA:
		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 */
	case -EWOULDBLOCK:
		thin_defer_bio(tc, bio);
		r = DM_MAPIO_SUBMITTED;
		break;
	}

	return r;
}

static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	int r;
	unsigned long flags;
	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);

	spin_lock_irqsave(&pt->pool->lock, flags);
	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pt->pool->lock, flags);

	if (!r) {
		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
		r = bdi_congested(&q->backing_dev_info, bdi_bits);
	}

	return r;
}

static void __requeue_bios(struct pool *pool)
{
	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
	bio_list_init(&pool->retry_on_resume_list);
}

/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
static int bind_control_target(struct pool *pool, struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	pool->ti = ti;
	pool->low_water_blocks = pt->low_water_blocks;
	pool->zero_new_blocks = pt->zero_new_blocks;

	return 0;
}

static void unbind_control_target(struct pool *pool, struct dm_target *ti)
{
	if (pool->ti == ti)
		pool->ti = NULL;
}

/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
static void __pool_destroy(struct pool *pool)
{
	__pool_table_remove(pool);

	if (dm_pool_metadata_close(pool->pmd) < 0)
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	prison_destroy(pool->prison);
	dm_kcopyd_client_destroy(pool->copier);

	if (pool->wq)
		destroy_workqueue(pool->wq);

	if (pool->next_mapping)
		mempool_free(pool->next_mapping, pool->mapping_pool);
	mempool_destroy(pool->mapping_pool);
	mempool_destroy(pool->endio_hook_pool);
	kfree(pool);
}

static struct pool *pool_create(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size, char **error)
{
	int r;
	void *err_p;
	struct pool *pool;
	struct dm_pool_metadata *pmd;

	pmd = dm_pool_metadata_open(metadata_dev, block_size);
	if (IS_ERR(pmd)) {
		*error = "Error creating metadata object";
		return (struct pool *)pmd;
	}

	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool) {
		*error = "Error allocating memory for pool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_pool;
	}

	pool->pmd = pmd;
	pool->sectors_per_block = block_size;
	pool->block_shift = ffs(block_size) - 1;
	pool->offset_mask = block_size - 1;
	pool->low_water_blocks = 0;
	pool->zero_new_blocks = 1;
	pool->prison = prison_create(PRISON_CELLS);
	if (!pool->prison) {
		*error = "Error creating pool's bio prison";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_prison;
	}

	pool->copier = dm_kcopyd_client_create();
	if (IS_ERR(pool->copier)) {
		r = PTR_ERR(pool->copier);
		*error = "Error creating pool's kcopyd client";
		err_p = ERR_PTR(r);
		goto bad_kcopyd_client;
	}

	/*
	 * Create singlethreaded workqueue that will service all devices
	 * that use this metadata.
	 */
	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!pool->wq) {
		*error = "Error creating pool's workqueue";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_wq;
	}

	INIT_WORK(&pool->worker, do_worker);
	spin_lock_init(&pool->lock);
	bio_list_init(&pool->deferred_bios);
	bio_list_init(&pool->deferred_flush_bios);
	INIT_LIST_HEAD(&pool->prepared_mappings);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	bio_list_init(&pool->retry_on_resume_list);
	ds_init(&pool->ds);

	pool->next_mapping = NULL;
	pool->mapping_pool =
		mempool_create_kmalloc_pool(MAPPING_POOL_SIZE, sizeof(struct new_mapping));
	if (!pool->mapping_pool) {
		*error = "Error creating pool's mapping mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_mapping_pool;
	}

	pool->endio_hook_pool =
		mempool_create_kmalloc_pool(ENDIO_HOOK_POOL_SIZE, sizeof(struct endio_hook));
	if (!pool->endio_hook_pool) {
		*error = "Error creating pool's endio_hook mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_endio_hook_pool;
	}
	pool->ref_count = 1;
	pool->pool_md = pool_md;
	pool->md_dev = metadata_dev;
	__pool_table_insert(pool);

	return pool;

bad_endio_hook_pool:
	mempool_destroy(pool->mapping_pool);
bad_mapping_pool:
	destroy_workqueue(pool->wq);
bad_wq:
	dm_kcopyd_client_destroy(pool->copier);
bad_kcopyd_client:
	prison_destroy(pool->prison);
bad_prison:
	kfree(pool);
bad_pool:
	if (dm_pool_metadata_close(pmd))
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	return err_p;
}

static void __pool_inc(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	pool->ref_count++;
}

static void __pool_dec(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	BUG_ON(!pool->ref_count);
	if (!--pool->ref_count)
		__pool_destroy(pool);
}

static struct pool *__pool_find(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size, char **error)
{
	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);

	if (pool) {
		if (pool->pool_md != pool_md)
			return ERR_PTR(-EBUSY);
		__pool_inc(pool);

	} else {
		pool = __pool_table_lookup(pool_md);
		if (pool) {
			if (pool->md_dev != metadata_dev)
				return ERR_PTR(-EINVAL);
			__pool_inc(pool);

		} else
			pool = pool_create(pool_md, metadata_dev, block_size, error);
	}

	return pool;
}

/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
static void pool_dtr(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	unbind_control_target(pt->pool, ti);
	__pool_dec(pt->pool);
	dm_put_device(ti, pt->metadata_dev);
	dm_put_device(ti, pt->data_dev);
	kfree(pt);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

struct pool_features {
	unsigned zero_new_blocks:1;
};

static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
			       struct dm_target *ti)
{
	int r;
	unsigned argc;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 1, "Invalid number of pool feature arguments"},
	};

	/*
	 * No feature arguments supplied.
	 */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	while (argc && !r) {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "skip_block_zeroing")) {
			pf->zero_new_blocks = 0;
			continue;
		}

		ti->error = "Unrecognised pool feature requested";
		r = -EINVAL;
	}

	return r;
}

/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (blocks)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 */
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct pool_c *pt;
	struct pool *pool;
	struct pool_features pf;
	struct dm_arg_set as;
	struct dm_dev *data_dev;
	unsigned long block_size;
	dm_block_t low_water_blocks;
	struct dm_dev *metadata_dev;
	sector_t metadata_dev_size;

	/*
	 * FIXME Remove validation from scope of lock.
	 */
	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc < 4) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}
	as.argc = argc;
	as.argv = argv;

	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
	if (r) {
		ti->error = "Error opening metadata block device";
		goto out_unlock;
	}

	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
	if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
		ti->error = "Metadata device is too large";
		r = -EINVAL;
		goto out_metadata;
	}

	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
	if (r) {
		ti->error = "Error getting data device";
		goto out_metadata;
	}

	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    !is_power_of_2(block_size)) {
		ti->error = "Invalid block size";
		r = -EINVAL;
		goto out;
	}

	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
		ti->error = "Invalid low water mark";
		r = -EINVAL;
		goto out;
	}

	/*
	 * Set default pool features.
	 */
	memset(&pf, 0, sizeof(pf));
	pf.zero_new_blocks = 1;

	dm_consume_args(&as, 4);
	r = parse_pool_features(&as, &pf, ti);
	if (r)
		goto out;

	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
	if (!pt) {
		r = -ENOMEM;
		goto out;
	}

	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
			   block_size, &ti->error);
	if (IS_ERR(pool)) {
		r = PTR_ERR(pool);
		goto out_free_pt;
	}

	pt->pool = pool;
	pt->ti = ti;
	pt->metadata_dev = metadata_dev;
	pt->data_dev = data_dev;
	pt->low_water_blocks = low_water_blocks;
	pt->zero_new_blocks = pf.zero_new_blocks;
	ti->num_flush_requests = 1;
	ti->num_discard_requests = 0;
	ti->private = pt;

	pt->callbacks.congested_fn = pool_is_congested;
	dm_table_add_target_callbacks(ti->table, &pt->callbacks);

	mutex_unlock(&dm_thin_pool_table.mutex);

	return 0;

out_free_pt:
	kfree(pt);
out:
	dm_put_device(ti, data_dev);
out_metadata:
	dm_put_device(ti, metadata_dev);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}

static int pool_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	/*
	 * As this is a singleton target, ti->begin is always zero.
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio->bi_bdev = pt->data_dev->bdev;
	r = DM_MAPIO_REMAPPED;
	spin_unlock_irqrestore(&pool->lock, flags);

	return r;
}

/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
static int pool_preresume(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	dm_block_t data_size, sb_data_size;

	/*
	 * Take control of the pool object.
	 */
	r = bind_control_target(pool, ti);
	if (r)
		return r;

	data_size = ti->len >> pool->block_shift;
	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
	if (r) {
		DMERR("failed to retrieve data device size");
		return r;
	}

	if (data_size < sb_data_size) {
		DMERR("pool target too small, is %llu blocks (expected %llu)",
		      data_size, sb_data_size);
		return -EINVAL;

	} else if (data_size > sb_data_size) {
		r = dm_pool_resize_data_dev(pool->pmd, data_size);
		if (r) {
			DMERR("failed to resize data device");
			return r;
		}

		r = dm_pool_commit_metadata(pool->pmd);
		if (r) {
			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
			      __func__, r);
			return r;
		}
	}

	return 0;
}

static void pool_resume(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	__requeue_bios(pool);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void pool_postsuspend(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	flush_workqueue(pool->wq);

	r = dm_pool_commit_metadata(pool->pmd);
	if (r < 0) {
		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
		      __func__, r);
		/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
	}
}

static int check_arg_count(unsigned argc, unsigned args_required)
{
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);
		return -EINVAL;
	}

	return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	    *dev_id <= MAX_DEV_ID)
		return 0;

	if (warning)
		DMWARN("Message received with invalid device id: %s", arg);

	return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_thin(pool->pmd, dev_id);
	if (r) {
		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
		       argv[1]);
		return r;
	}

	return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	dm_thin_id origin_dev_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = read_dev_id(argv[2], &origin_dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
	if (r) {
		DMWARN("Creation of new snapshot %s of device %s failed.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	if (r)
		DMWARN("Deletion of thin device %s failed.", argv[1]);

	return r;
}

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id old_id, new_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
		return -EINVAL;
	}

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
		return -EINVAL;
	}

	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
	if (r) {
		DMWARN("Failed to change transaction id from %s to %s.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

/*
 * Messages supported:
 *   create_thin <dev_id>
 *   create_snap <dev_id> <origin_id>
 *   delete <dev_id>
 *   trim <dev_id> <new_size_in_sectors>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 */
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

	else
		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	if (!r) {
		r = dm_pool_commit_metadata(pool->pmd);
		if (r)
			DMERR("%s message: dm_pool_commit_metadata() failed, error = %d",
			      argv[0], r);
	}

	return r;
}

/*
 * Status line is:
 *    <transaction id> <used metadata sectors>/<total metadata sectors>
 *    <used data sectors>/<total data sectors> <held metadata root>
 */
static int pool_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r;
	unsigned sz = 0;
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	switch (type) {
	case STATUSTYPE_INFO:
		r = dm_pool_get_metadata_transaction_id(pool->pmd,
							&transaction_id);
		if (r)
			return r;

		r = dm_pool_get_free_metadata_block_count(pool->pmd,
							  &nr_free_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd,
						 &nr_free_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
		if (r)
			return r;

		DMEMIT("%llu %llu/%llu %llu/%llu ",
/*
 * Status line is:
 *    <transaction id> <used metadata sectors>/<total metadata sectors>
 *    <used data sectors>/<total data sectors> <held metadata root>
 */
static int pool_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r;
	unsigned sz = 0;
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	switch (type) {
	case STATUSTYPE_INFO:
		r = dm_pool_get_metadata_transaction_id(pool->pmd,
							&transaction_id);
		if (r)
			return r;

		r = dm_pool_get_free_metadata_block_count(pool->pmd,
							  &nr_free_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd,
						 &nr_free_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_held_metadata_root(pool->pmd, &held_root);
		if (r)
			return r;

		DMEMIT("%llu %llu/%llu %llu/%llu ",
		       (unsigned long long)transaction_id,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
		       (unsigned long long)nr_blocks_data);

		if (held_root)
			DMEMIT("%llu", held_root);
		else
			DMEMIT("-");

		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s %lu %llu ",
		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
		       (unsigned long)pool->sectors_per_block,
		       (unsigned long long)pt->low_water_blocks);

		DMEMIT("%u ", !pool->zero_new_blocks);

		if (!pool->zero_new_blocks)
			DMEMIT("skip_block_zeroing ");
		break;
	}

	return 0;
}

static int pool_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct pool_c *pt = ti->private;

	return fn(ti, pt->data_dev, 0, ti->len, data);
}

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
		      struct bio_vec *biovec, int max_size)
{
	struct pool_c *pt = ti->private;
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = pt->data_dev->bdev;

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
}

static struct target_type pool_target = {
	.name = "thin-pool",
	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
		    DM_TARGET_IMMUTABLE,
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = pool_ctr,
	.dtr = pool_dtr,
	.map = pool_map,
	.postsuspend = pool_postsuspend,
	.preresume = pool_preresume,
	.resume = pool_resume,
	.message = pool_message,
	.status = pool_status,
	.merge = pool_merge,
	.iterate_devices = pool_iterate_devices,
	.io_hints = pool_io_hints,
};

/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
static void thin_dtr(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	__pool_dec(tc->pool);
	dm_pool_close_thin_device(tc->td);
	dm_put_device(ti, tc->pool_dev);
	kfree(tc);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id>
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 */

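/*
 * Illustrative example (not part of the driver): a 2097152-sector (1 GiB)
 * thin device backed by internal device id 0 of an existing pool could be
 * activated with something like:
 *
 *     dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
 *
 * where "thin0" and /dev/mapper/pool are example names, and "create_thin 0"
 * must already have been sent to the pool as shown earlier.
 */
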
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct thin_c *tc;
	struct dm_dev *pool_dev;
	struct mapped_device *pool_md;

	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc != 2) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}

	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
	if (!tc) {
		ti->error = "Out of memory";
		r = -ENOMEM;
		goto out_unlock;
	}

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
			  &pool_dev);
	if (r) {
		ti->error = "Error opening pool device";
		goto bad_pool_dev;
	}
	tc->pool_dev = pool_dev;

	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
		ti->error = "Invalid device id";
		r = -EINVAL;
		goto bad_common;
	}

	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
	if (!pool_md) {
		ti->error = "Couldn't get pool mapped device";
		r = -EINVAL;
		goto bad_common;
	}

	tc->pool = __pool_table_lookup(pool_md);
	if (!tc->pool) {
		ti->error = "Couldn't find pool object";
		r = -EINVAL;
		goto bad_pool_lookup;
	}
	__pool_inc(tc->pool);

	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
	if (r) {
		ti->error = "Couldn't open thin internal device";
		goto bad_thin_open;
	}

	ti->split_io = tc->pool->sectors_per_block;
	ti->num_flush_requests = 1;
	ti->num_discard_requests = 0;
	ti->discards_supported = 0;

	dm_put(pool_md);

	mutex_unlock(&dm_thin_pool_table.mutex);

	return 0;

bad_thin_open:
	__pool_dec(tc->pool);
bad_pool_lookup:
	dm_put(pool_md);
bad_common:
	dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
	kfree(tc);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}

static int thin_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	bio->bi_sector -= ti->begin;

	return thin_bio_map(ti, bio, map_context);
}

static void thin_postsuspend(struct dm_target *ti)
{
	if (dm_noflush_suspending(ti))
		requeue_io((struct thin_c *)ti->private);
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
static int thin_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r)
				return r;

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0)
				return r;

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			break;
		}
	}

	return 0;
}

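/*
 * Illustrative example (worked through from the code above, not captured
 * output): with 1024-sector blocks, a thin device with 1000 mapped blocks
 * whose highest mapped block is 999 reports
 *
 *     1024000 1023999
 *
 * i.e. <nr mapped sectors> <highest mapped sector>; "-" is emitted for the
 * highest mapped sector while nothing has been mapped yet.
 */
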
static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	dm_block_t blocks;
	struct thin_c *tc = ti->private;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
	 * we follow a more convoluted path through to the pool's target.
	 */
	if (!tc->pool->ti)
		return 0;	/* nothing is bound */

	blocks = tc->pool->ti->len >> tc->pool->block_shift;
	if (blocks)
		return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);

	return 0;
}

static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct thin_c *tc = ti->private;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
}

static struct target_type thin_target = {
	.name = "thin",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = thin_ctr,
	.dtr = thin_dtr,
	.map = thin_map,
	.postsuspend = thin_postsuspend,
	.status = thin_status,
	.iterate_devices = thin_iterate_devices,
	.io_hints = thin_io_hints,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
	int r;

	pool_table_init();

	r = dm_register_target(&thin_target);
	if (r)
		return r;

	r = dm_register_target(&pool_target);
	if (r)
		dm_unregister_target(&thin_target);

	return r;
}

static void dm_thin_exit(void)
{
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
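
/*
 * Illustrative check (not part of the driver): once this code is loaded
 * (typically built as the dm-thin-pool module), the two targets registered
 * by dm_thin_init() should appear in the output of
 *
 *     dmsetup targets
 *
 * as "thin" and "thin-pool", each at version 1.0.0.
 */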