/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 3 modes. Ordered in degraded order for comparisons.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	unsigned low_water_triggered:1;	/* A dm event has been sent */
	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;

	struct bio_list retry_on_resume_list;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void set_pool_mode(struct pool *pool, enum pool_mode mode);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_bio_prison_cell *cell_prealloc;

	/*
	 * Allocate a cell from the prison's mempool.
	 * This might block but it can't fail.
	 */
	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
	if (r)
		/*
		 * We reused an old cell; we can get rid of
		 * the new one.
		 */
		dm_bio_prison_free_cell(pool->prison, cell_prealloc);

	return r;
}

static void cell_release(struct pool *pool,
			 struct dm_bio_prison_cell *cell,
			 struct bio_list *bios)
{
	dm_cell_release(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
				   struct dm_bio_prison_cell *cell,
				   struct bio_list *bios)
{
	dm_cell_release_no_holder(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_defer_no_holder_no_free(struct thin_c *tc,
					 struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	dm_cell_release_no_holder(pool->prison, cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void cell_error(struct pool *pool,
		       struct dm_bio_prison_cell *cell)
{
	dm_cell_error(pool->prison, cell);
	dm_bio_prison_free_cell(pool->prison, cell);
}

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
};

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, master);
	bio_list_init(master);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		if (h->tc == tc)
			bio_endio(bio, DM_ENDIO_REQUEUE);
		else
			bio_list_add(master, bio);
	}
}

static void requeue_io(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	__requeue_bio_list(tc, &pool->deferred_bios);
	__requeue_bio_list(tc, &pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t block_nr = bio->bi_sector;

	if (block_size_is_power_of_two(pool))
		block_nr >>= pool->sectors_per_block_shift;
	else
		(void) sector_div(block_nr, pool->sectors_per_block);

	return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (block_size_is_power_of_two(pool))
		bio->bi_sector = (block << pool->sectors_per_block_shift) |
				 (bi_sector & (pool->sectors_per_block - 1));
	else
		bio->bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio->bi_rw & REQ_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed, e.g. due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	unsigned quiesced:1;
	unsigned prepared:1;
	unsigned pass_discard:1;

	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;
	int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (m->quiesced && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_thin_new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct dm_thin_new_mapping *m = h->overwrite_mapping;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	cell_release(pool, cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits the original holder of the cell.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	cell_release_no_holder(pool, cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	if (m->bio)
		m->bio->bi_end_io = m->saved_bi_end_io;
	cell_error(m->tc->pool, m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio)
		bio->bi_end_io = m->saved_bi_end_io;

	if (m->err) {
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
			    dm_device_name(pool->pool_md), r);
		set_pool_mode(pool, PM_READ_ONLY);
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_no_holder(tc, m->cell);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell);

out:
	list_del(&m->list);
	mempool_free(m, pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	bio_io_error(m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	inc_all_io_entry(tc->pool, m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);

	if (m->pass_discard)
		remap_and_issue(tc, m->bio, m->data_block);
	else
		bio_endio(m->bio, 0);

	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR_LIMIT("dm_thin_remove_block() failed");

	process_prepared_discard_passdown(m);
}

static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *r = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	pool->next_mapping = NULL;

	return r;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 0;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		m->quiesced = 1;

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		inc_all_io_entry(pool, bio);
		remap_and_issue(tc, bio, data_dest);
	} else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR_LIMIT("dm_kcopyd_copy() failed");
			cell_error(pool, cell);
		}
	}
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->origin_dev,
		      virt_block, data_dest, cell, bio);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 1;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->pf.zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		inc_all_io_entry(pool, bio);
		remap_and_issue(tc, bio, data_block);
	} else {
		int r;
		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR_LIMIT("dm_kcopyd_zero() failed");
			cell_error(pool, cell);
		}
	}
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
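 * A failed commit also switches the pool to read-only mode.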
 */
static int commit(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) != PM_WRITE)
		return -EINVAL;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r) {
		DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
			    dm_device_name(pool->pool_md), r);
		set_pool_mode(pool, PM_READ_ONLY);
	}

	return r;
}

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	unsigned long flags;
	struct pool *pool = tc->pool;

	/*
	 * Once no_free_space is set we must not allow allocation to succeed.
	 * Otherwise it is difficult to explain, debug, test and support.
	 */
	if (pool->no_free_space)
		return -ENOSPC;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r)
		return r;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark for data device: sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = 1;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}

	if (!free_blocks) {
		/*
		 * Try to commit to see if that will free up some
		 * more space.
		 */
		r = commit(pool);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
		if (r)
			return r;

		/*
		 * If we still have no space we set a flag to avoid
		 * doing all this checking and return -ENOSPC. This
		 * flag serves as a latch that disallows allocations from
		 * this pool until the admin takes action (e.g. resize or
		 * table reload).
		 */
		if (!free_blocks) {
			DMWARN("%s: no free data space available.",
			       dm_device_name(pool->pool_md));
			spin_lock_irqsave(&pool->lock, flags);
			pool->no_free_space = 1;
			spin_unlock_irqrestore(&pool->lock, flags);
			return -ENOSPC;
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r) {
		if (r == -ENOSPC &&
		    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
		    !free_blocks) {
			DMWARN("%s: no free metadata space available.",
			       dm_device_name(pool->pool_md));
			set_pool_mode(pool, PM_READ_ONLY);
		}
		return r;
	}

	return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct thin_c *tc = h->tc;
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->retry_on_resume_list, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void no_space(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	cell_release(pool, cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		retry_on_resume(bio);
}

static void process_discard(struct thin_c *tc, struct bio *bio)
{
	int r;
	unsigned long flags;
	struct pool *pool = tc->pool;
	struct dm_bio_prison_cell *cell, *cell2;
	struct dm_cell_key key, key2;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;
	struct dm_thin_new_mapping *m;

	build_virtual_key(tc->td, block, &key);
	if (bio_detain(tc->pool, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * Check nobody is fiddling with this pool block. This can
		 * happen if someone's in the process of breaking sharing
		 * on this block.
		 */
		build_data_key(tc->td, lookup_result.block, &key2);
		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
			cell_defer_no_holder(tc, cell);
			break;
		}

		if (io_overlaps_block(pool, bio)) {
			/*
			 * IO may still be going to the destination block. We must
			 * quiesce before we can do the removal.
			 */
			m = get_next_mapping(pool);
			m->tc = tc;
			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
			m->virt_block = block;
			m->data_block = lookup_result.block;
			m->cell = cell;
			m->cell2 = cell2;
			m->err = 0;
			m->bio = bio;

			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
				spin_lock_irqsave(&pool->lock, flags);
				list_add(&m->list, &pool->prepared_discards);
				spin_unlock_irqrestore(&pool->lock, flags);
				wake_worker(pool);
			}
		} else {
			inc_all_io_entry(pool, bio);
			cell_defer_no_holder(tc, cell);
			cell_defer_no_holder(tc, cell2);

			/*
			 * The DM core makes sure that the discard doesn't span
			 * a block boundary. So we submit the discard of a
			 * partial block appropriately.
			 */
			if ((!lookup_result.shared) && pool->pf.discard_passdown)
				remap_and_issue(tc, bio, lookup_result.block);
			else
				bio_endio(bio, 0);
		}
		break;

	case -ENODATA:
		/*
		 * It isn't provisioned, just forget it.
		 */
		cell_defer_no_holder(tc, cell);
		bio_endio(bio, 0);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		cell_defer_no_holder(tc, cell);
		bio_io_error(bio);
		break;
	}
}

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
			  struct dm_cell_key *key,
			  struct dm_thin_lookup_result *lookup_result,
			  struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;
	struct pool *pool = tc->pool;

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		schedule_internal_copy(tc, block, lookup_result->block,
				       data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(pool, cell);
		break;

	default:
		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
			    __func__, r);
		set_pool_mode(pool, PM_READ_ONLY);
		cell_error(pool, cell);
		break;
	}
}

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
			       dm_block_t block,
			       struct dm_thin_lookup_result *lookup_result)
{
	struct dm_bio_prison_cell *cell;
	struct pool *pool = tc->pool;
	struct dm_cell_key key;

	/*
	 * If cell is already occupied, then sharing is already in the process
	 * of being broken so we have nothing further to do here.
	 */
	build_data_key(tc->td, lookup_result->block, &key);
	if (bio_detain(pool, &key, bio, &cell))
		return;

	if (bio_data_dir(bio) == WRITE && bio->bi_size)
		break_sharing(tc, bio, block, &key, lookup_result, cell);
	else {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
		inc_all_io_entry(pool, bio);
		cell_defer_no_holder(tc, cell);

		remap_and_issue(tc, bio, lookup_result->block);
	}
}

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
			    struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;
	struct pool *pool = tc->pool;

	/*
	 * Remap empty bios (flushes) immediately, without provisioning.
	 */
	if (!bio->bi_size) {
		inc_all_io_entry(pool, bio);
		cell_defer_no_holder(tc, cell);

		remap_and_issue(tc, bio, 0);
		return;
	}

	/*
	 * Fill read bios with zeroes and complete them immediately.
	 */
	if (bio_data_dir(bio) == READ) {
		zero_fill_bio(bio);
		cell_defer_no_holder(tc, cell);
		bio_endio(bio, 0);
		return;
	}

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		if (tc->origin_dev)
			schedule_external_copy(tc, block, data_block, cell, bio);
		else
			schedule_zero(tc, block, data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(pool, cell);
		break;

	default:
		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
			    __func__, r);
		set_pool_mode(pool, PM_READ_ONLY);
		cell_error(pool, cell);
		break;
	}
}

static void process_bio(struct thin_c *tc, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_bio_prison_cell *cell;
	struct dm_cell_key key;
	struct dm_thin_lookup_result lookup_result;

	/*
	 * If cell is already occupied, then the block is already
	 * being provisioned so we have nothing further to do here.
	 */
	build_virtual_key(tc->td, block, &key);
	if (bio_detain(pool, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		if (lookup_result.shared) {
			process_shared_bio(tc, bio, block, &lookup_result);
			cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
		} else {
			inc_all_io_entry(pool, bio);
			cell_defer_no_holder(tc, cell);

			remap_and_issue(tc, bio, lookup_result.block);
		}
		break;

	case -ENODATA:
		if (bio_data_dir(bio) == READ && tc->origin_dev) {
			inc_all_io_entry(pool, bio);
			cell_defer_no_holder(tc, cell);

			remap_to_origin_and_issue(tc, bio);
		} else
			provision_block(tc, bio, block, cell);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		cell_defer_no_holder(tc, cell);
		bio_io_error(bio);
		break;
	}
}

static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
{
	int r;
	int rw = bio_data_dir(bio);
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
			bio_io_error(bio);
		else {
			inc_all_io_entry(tc->pool, bio);
			remap_and_issue(tc, bio, lookup_result.block);
		}
		break;

	case -ENODATA:
		if (rw != READ) {
			bio_io_error(bio);
			break;
		}

		if (tc->origin_dev) {
			inc_all_io_entry(tc->pool, bio);
			remap_to_origin_and_issue(tc, bio);
			break;
		}

		zero_fill_bio(bio);
		bio_endio(bio, 0);
		break;

	default:
		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
			    __func__, r);
		bio_io_error(bio);
		break;
	}
}

static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
	bio_io_error(bio);
}

/*
 * FIXME: should we also commit due to size of transaction, measured in
 * metadata blocks?
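 *
 * The jiffies check below also forces a commit if jiffies has wrapped
 * since the last commit.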
 */
static int need_commit_due_to_time(struct pool *pool)
{
	return jiffies < pool->last_commit_jiffies ||
	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
}

static void process_deferred_bios(struct pool *pool)
{
	unsigned long flags;
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_bios);
	bio_list_init(&pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
		struct thin_c *tc = h->tc;

		/*
		 * If we've got no free new_mapping structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (ensure_next_mapping(pool)) {
			spin_lock_irqsave(&pool->lock, flags);
			bio_list_merge(&pool->deferred_bios, &bios);
			spin_unlock_irqrestore(&pool->lock, flags);

			break;
		}

		if (bio->bi_rw & REQ_DISCARD)
			pool->process_discard(tc, bio);
		else
			pool->process_bio(tc, bio);
	}

	/*
	 * If there are any deferred flush bios, we must commit
	 * the metadata before issuing them.
	 */
	bio_list_init(&bios);
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_flush_bios);
	bio_list_init(&pool->deferred_flush_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
		return;

	if (commit(pool)) {
		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);
		return;
	}
	pool->last_commit_jiffies = jiffies;

	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void do_worker(struct work_struct *ws)
{
	struct pool *pool = container_of(ws, struct pool, worker);

	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
	process_deferred_bios(pool);
}

/*
 * We want to commit periodically so that not too much
 * unwritten data builds up.
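 * COMMIT_PERIOD is HZ, so by default this amounts to roughly once a second.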
 */
static void do_waker(struct work_struct *ws)
{
	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
	wake_worker(pool);
	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static enum pool_mode get_pool_mode(struct pool *pool)
{
	return pool->pf.mode;
}

static void set_pool_mode(struct pool *pool, enum pool_mode mode)
{
	int r;

	pool->pf.mode = mode;

	switch (mode) {
	case PM_FAIL:
		DMERR("%s: switching pool to failure mode",
		      dm_device_name(pool->pool_md));
		dm_pool_metadata_read_only(pool->pmd);
		pool->process_bio = process_bio_fail;
		pool->process_discard = process_bio_fail;
		pool->process_prepared_mapping = process_prepared_mapping_fail;
		pool->process_prepared_discard = process_prepared_discard_fail;
		break;

	case PM_READ_ONLY:
		DMERR("%s: switching pool to read-only mode",
		      dm_device_name(pool->pool_md));
		r = dm_pool_abort_metadata(pool->pmd);
		if (r) {
			DMERR("%s: aborting transaction failed",
			      dm_device_name(pool->pool_md));
			set_pool_mode(pool, PM_FAIL);
		} else {
			dm_pool_metadata_read_only(pool->pmd);
			pool->process_bio = process_bio_read_only;
			pool->process_discard = process_discard;
			pool->process_prepared_mapping = process_prepared_mapping_fail;
			pool->process_prepared_discard = process_prepared_discard_passdown;
		}
		break;

	case PM_WRITE:
		dm_pool_metadata_read_write(pool->pmd);
		pool->process_bio = process_bio;
		pool->process_discard = process_discard;
		pool->process_prepared_mapping = process_prepared_mapping;
		pool->process_prepared_discard = process_prepared_discard;
		break;
	}
}

/*----------------------------------------------------------------*/

/*
 * Mapping functions.
 */

/*
 * Called only while mapping a thin bio to hand it over to the workqueue.
 */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
	unsigned long flags;
	struct pool *pool = tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

	h->tc = tc;
	h->shared_read_entry = NULL;
	h->all_io_entry = NULL;
	h->overwrite_mapping = NULL;
}

/*
 * Non-blocking function called from the thin target's map function.
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio)
{
	int r;
	struct thin_c *tc = ti->private;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_device *td = tc->td;
	struct dm_thin_lookup_result result;
	struct dm_bio_prison_cell cell1, cell2;
	struct dm_bio_prison_cell *cell_result;
	struct dm_cell_key key;

	thin_hook_bio(tc, bio);

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
	switch (r) {
	case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap. You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant. The
			 * shared flag will be set in their case.
			 */
			thin_defer_bio(tc, bio);
			return DM_MAPIO_SUBMITTED;
		}

		build_virtual_key(tc->td, block, &key);
		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
			return DM_MAPIO_SUBMITTED;

		build_data_key(tc->td, result.block, &key);
		if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
			cell_defer_no_holder_no_free(tc, &cell1);
			return DM_MAPIO_SUBMITTED;
		}

		inc_all_io_entry(tc->pool, bio);
		cell_defer_no_holder_no_free(tc, &cell2);
		cell_defer_no_holder_no_free(tc, &cell1);

		remap(tc, bio, result.block);
		return DM_MAPIO_REMAPPED;

	case -ENODATA:
		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
			/*
			 * This block isn't provisioned, and we have no way
			 * of doing so. Just error it.
			 */
			bio_io_error(bio);
			return DM_MAPIO_SUBMITTED;
		}
		/* fall through */

	case -EWOULDBLOCK:
		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 */
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;

	default:
		/*
		 * Must always call bio_io_error on failure.
		 * dm_thin_find_block can fail with -EINVAL if the
		 * pool is switched to fail-io mode.
		 */
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}
}

static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	int r;
	unsigned long flags;
	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);

	spin_lock_irqsave(&pt->pool->lock, flags);
	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pt->pool->lock, flags);

	if (!r) {
		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
		r = bdi_congested(&q->backing_dev_info, bdi_bits);
	}

	return r;
}

static void __requeue_bios(struct pool *pool)
{
	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
	bio_list_init(&pool->retry_on_resume_list);
}

/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
static bool data_dev_supports_discard(struct pool_c *pt)
{
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	return q && blk_queue_discard(q);
}

static bool is_factor(sector_t block_size, uint32_t n)
{
	return !sector_div(block_size, n);
}

/*
 * If discard_passdown was enabled verify that the data device
 * supports discards. Disable discard_passdown if not.
 */
static void disable_passdown_if_not_supported(struct pool_c *pt)
{
	struct pool *pool = pt->pool;
	struct block_device *data_bdev = pt->data_dev->bdev;
	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
	const char *reason = NULL;
	char buf[BDEVNAME_SIZE];

	if (!pt->adjusted_pf.discard_passdown)
		return;

	if (!data_dev_supports_discard(pt))
		reason = "discard unsupported";

	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
		reason = "max discard sectors smaller than a block";

	else if (data_limits->discard_granularity > block_size)
		reason = "discard granularity larger than a block";

	else if (!is_factor(block_size, data_limits->discard_granularity))
		reason = "discard granularity not a factor of block size";

	if (reason) {
		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
		pt->adjusted_pf.discard_passdown = false;
	}
}

static int bind_control_target(struct pool *pool, struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	/*
	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
	 */
	enum pool_mode old_mode = pool->pf.mode;
	enum pool_mode new_mode = pt->adjusted_pf.mode;

	/*
	 * If we were in PM_FAIL mode, rollback of metadata failed. We're
	 * not going to recover without a thin_repair. So we never let the
	 * pool move out of the old mode. On the other hand a PM_READ_ONLY
	 * may have been due to a lack of metadata or data space, and may
	 * now work (ie. if the underlying devices have been resized).
	 */
	if (old_mode == PM_FAIL)
		new_mode = old_mode;

	pool->ti = ti;
	pool->low_water_blocks = pt->low_water_blocks;
	pool->pf = pt->adjusted_pf;

	set_pool_mode(pool, new_mode);

	return 0;
}

static void unbind_control_target(struct pool *pool, struct dm_target *ti)
{
	if (pool->ti == ti)
		pool->ti = NULL;
}

/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
/* Initialize pool features. */
static void pool_features_init(struct pool_features *pf)
{
	pf->mode = PM_WRITE;
	pf->zero_new_blocks = true;
	pf->discard_enabled = true;
	pf->discard_passdown = true;
}

static void __pool_destroy(struct pool *pool)
{
	__pool_table_remove(pool);

	if (dm_pool_metadata_close(pool->pmd) < 0)
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	dm_bio_prison_destroy(pool->prison);
	dm_kcopyd_client_destroy(pool->copier);

	if (pool->wq)
		destroy_workqueue(pool->wq);

	if (pool->next_mapping)
		mempool_free(pool->next_mapping, pool->mapping_pool);
	mempool_destroy(pool->mapping_pool);
	dm_deferred_set_destroy(pool->shared_read_ds);
	dm_deferred_set_destroy(pool->all_io_ds);
	kfree(pool);
}

static struct kmem_cache *_new_mapping_cache;

static struct pool *pool_create(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size,
				int read_only, char **error)
{
	int r;
	void *err_p;
	struct pool *pool;
	struct dm_pool_metadata *pmd;
	bool format_device = read_only ? false : true;

	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
	if (IS_ERR(pmd)) {
		*error = "Error creating metadata object";
		return (struct pool *)pmd;
	}

	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool) {
		*error = "Error allocating memory for pool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_pool;
	}

	pool->pmd = pmd;
	pool->sectors_per_block = block_size;
	if (block_size & (block_size - 1))
		pool->sectors_per_block_shift = -1;
	else
		pool->sectors_per_block_shift = __ffs(block_size);
	pool->low_water_blocks = 0;
	pool_features_init(&pool->pf);
	pool->prison = dm_bio_prison_create(PRISON_CELLS);
	if (!pool->prison) {
		*error = "Error creating pool's bio prison";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_prison;
	}

	pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(pool->copier)) {
		r = PTR_ERR(pool->copier);
		*error = "Error creating pool's kcopyd client";
		err_p = ERR_PTR(r);
		goto bad_kcopyd_client;
	}

	/*
	 * Create singlethreaded workqueue that will service all devices
	 * that use this metadata.
	 */
	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!pool->wq) {
		*error = "Error creating pool's workqueue";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_wq;
	}

	INIT_WORK(&pool->worker, do_worker);
	INIT_DELAYED_WORK(&pool->waker, do_waker);
	spin_lock_init(&pool->lock);
	bio_list_init(&pool->deferred_bios);
	bio_list_init(&pool->deferred_flush_bios);
	INIT_LIST_HEAD(&pool->prepared_mappings);
	INIT_LIST_HEAD(&pool->prepared_discards);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	bio_list_init(&pool->retry_on_resume_list);

	pool->shared_read_ds = dm_deferred_set_create();
	if (!pool->shared_read_ds) {
		*error = "Error creating pool's shared read deferred set";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_shared_read_ds;
	}

	pool->all_io_ds = dm_deferred_set_create();
	if (!pool->all_io_ds) {
		*error = "Error creating pool's all io deferred set";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_all_io_ds;
	}

	pool->next_mapping = NULL;
	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
						      _new_mapping_cache);
	if (!pool->mapping_pool) {
		*error = "Error creating pool's mapping mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_mapping_pool;
	}

	pool->ref_count = 1;
	pool->last_commit_jiffies = jiffies;
	pool->pool_md = pool_md;
	pool->md_dev = metadata_dev;
	__pool_table_insert(pool);

	return pool;

bad_mapping_pool:
	dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
	dm_deferred_set_destroy(pool->shared_read_ds);
bad_shared_read_ds:
	destroy_workqueue(pool->wq);
bad_wq:
	dm_kcopyd_client_destroy(pool->copier);
bad_kcopyd_client:
	dm_bio_prison_destroy(pool->prison);
bad_prison:
	kfree(pool);
bad_pool:
	if (dm_pool_metadata_close(pmd))
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	return err_p;
}

static void __pool_inc(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	pool->ref_count++;
}

static void __pool_dec(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	BUG_ON(!pool->ref_count);
	if (!--pool->ref_count)
		__pool_destroy(pool);
}

static struct pool *__pool_find(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size, int read_only,
				char **error, int *created)
{
	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);

	if (pool) {
		if (pool->pool_md != pool_md) {
			*error = "metadata device already in use by a pool";
			return ERR_PTR(-EBUSY);
		}
		__pool_inc(pool);

	} else {
		pool = __pool_table_lookup(pool_md);
		if (pool) {
			if (pool->md_dev != metadata_dev) {
				*error = "different pool cannot replace a pool";
				return ERR_PTR(-EINVAL);
			}
			__pool_inc(pool);

		} else {
			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
			*created = 1;
		}
	}

	return pool;
}

/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
static void pool_dtr(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	unbind_control_target(pt->pool, ti);
	__pool_dec(pt->pool);
	dm_put_device(ti, pt->metadata_dev);
	dm_put_device(ti, pt->data_dev);
	kfree(pt);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
			       struct dm_target *ti)
{
	int r;
	unsigned argc;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 3, "Invalid number of pool feature arguments"},
	};

	/*
	 * No feature arguments supplied.
	 */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	while (argc && !r) {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "skip_block_zeroing"))
			pf->zero_new_blocks = false;

		else if (!strcasecmp(arg_name, "ignore_discard"))
			pf->discard_enabled = false;

		else if (!strcasecmp(arg_name, "no_discard_passdown"))
			pf->discard_passdown = false;

		else if (!strcasecmp(arg_name, "read_only"))
			pf->mode = PM_READ_ONLY;

		else {
			ti->error = "Unrecognised pool feature requested";
			r = -EINVAL;
			break;
		}
	}

	return r;
}

static void metadata_low_callback(void *context)
{
	struct pool *pool = context;

	DMWARN("%s: reached low water mark for metadata device: sending event.",
	       dm_device_name(pool->pool_md));

	dm_table_event(pool->ti->table);
}

static sector_t get_metadata_dev_size(struct block_device *bdev)
{
	sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
	char buffer[BDEVNAME_SIZE];

	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) {
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
		metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING;
	}

	return metadata_dev_size;
}

static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
{
	sector_t metadata_dev_size = get_metadata_dev_size(bdev);

	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);

	return metadata_dev_size;
}

/*
 * When a metadata threshold is crossed a dm event is triggered, and
 * userland should respond by growing the metadata device. We could let
 * userland set the threshold, like we do with the data threshold, but I'm
 * not sure they know enough to do this well.
 */
static dm_block_t calc_metadata_threshold(struct pool_c *pt)
{
	/*
	 * 4M is ample for all ops with the possible exception of thin
	 * device deletion which is harmless if it fails (just retry the
	 * delete after you've grown the device).
	 */
	dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
	return min((dm_block_t)1024ULL /* 4M */, quarter);
}

/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (blocks)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 *	     ignore_discard: disable discard
 *	     no_discard_passdown: don't pass discards down to the data device
 *	     read_only: don't allow any changes to be made to the pool metadata
 */
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r, pool_created = 0;
	struct pool_c *pt;
	struct pool *pool;
	struct pool_features pf;
	struct dm_arg_set as;
	struct dm_dev *data_dev;
	unsigned long block_size;
	dm_block_t low_water_blocks;
	struct dm_dev *metadata_dev;
	fmode_t metadata_mode;

	/*
	 * FIXME Remove validation from scope of lock.
	 */
	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc < 4) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}

	as.argc = argc;
	as.argv = argv;

	/*
	 * Set default pool features.
	 */
	pool_features_init(&pf);

	dm_consume_args(&as, 4);
	r = parse_pool_features(&as, &pf, ti);
	if (r)
		goto out_unlock;

	metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
	r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
	if (r) {
		ti->error = "Error opening metadata block device";
		goto out_unlock;
	}

	/*
	 * Run for the side-effect of possibly issuing a warning if the
	 * device is too big.
	 */
	(void) get_metadata_dev_size(metadata_dev->bdev);

	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
	if (r) {
		ti->error = "Error getting data device";
		goto out_metadata;
	}

	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		ti->error = "Invalid block size";
		r = -EINVAL;
		goto out;
	}

	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
		ti->error = "Invalid low water mark";
		r = -EINVAL;
		goto out;
	}

	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
	if (!pt) {
		r = -ENOMEM;
		goto out;
	}

	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
	if (IS_ERR(pool)) {
		r = PTR_ERR(pool);
		goto out_free_pt;
	}

	/*
	 * 'pool_created' reflects whether this is the first table load.
	 * Top level discard support is not allowed to be changed after
	 * initial load. This would require a pool reload to trigger thin
	 * device changes.
	 */
	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
		ti->error = "Discard support cannot be disabled once enabled";
		r = -EINVAL;
		goto out_flags_changed;
	}

	pt->pool = pool;
	pt->ti = ti;
	pt->metadata_dev = metadata_dev;
	pt->data_dev = data_dev;
	pt->low_water_blocks = low_water_blocks;
	pt->adjusted_pf = pt->requested_pf = pf;
	ti->num_flush_bios = 1;

	/*
	 * Only need to enable discards if the pool should pass
	 * them down to the data device. The thin device's discard
	 * processing will cause mappings to be removed from the btree.
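	 * (Even without passdown the thin target still processes discards by
	 * dropping the covered mappings; passdown only determines whether the
	 * discard is also forwarded to the underlying data device.)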
2109 */ 2110 ti->discard_zeroes_data_unsupported = true; 2111 if (pf.discard_enabled && pf.discard_passdown) { 2112 ti->num_discard_bios = 1; 2113 2114 /* 2115 * Setting 'discards_supported' circumvents the normal 2116 * stacking of discard limits (this keeps the pool and 2117 * thin devices' discard limits consistent). 2118 */ 2119 ti->discards_supported = true; 2120 } 2121 ti->private = pt; 2122 2123 r = dm_pool_register_metadata_threshold(pt->pool->pmd, 2124 calc_metadata_threshold(pt), 2125 metadata_low_callback, 2126 pool); 2127 if (r) 2128 goto out_free_pt; 2129 2130 pt->callbacks.congested_fn = pool_is_congested; 2131 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2132 2133 mutex_unlock(&dm_thin_pool_table.mutex); 2134 2135 return 0; 2136 2137 out_flags_changed: 2138 __pool_dec(pool); 2139 out_free_pt: 2140 kfree(pt); 2141 out: 2142 dm_put_device(ti, data_dev); 2143 out_metadata: 2144 dm_put_device(ti, metadata_dev); 2145 out_unlock: 2146 mutex_unlock(&dm_thin_pool_table.mutex); 2147 2148 return r; 2149 } 2150 2151 static int pool_map(struct dm_target *ti, struct bio *bio) 2152 { 2153 int r; 2154 struct pool_c *pt = ti->private; 2155 struct pool *pool = pt->pool; 2156 unsigned long flags; 2157 2158 /* 2159 * As this is a singleton target, ti->begin is always zero. 2160 */ 2161 spin_lock_irqsave(&pool->lock, flags); 2162 bio->bi_bdev = pt->data_dev->bdev; 2163 r = DM_MAPIO_REMAPPED; 2164 spin_unlock_irqrestore(&pool->lock, flags); 2165 2166 return r; 2167 } 2168 2169 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) 2170 { 2171 int r; 2172 struct pool_c *pt = ti->private; 2173 struct pool *pool = pt->pool; 2174 sector_t data_size = ti->len; 2175 dm_block_t sb_data_size; 2176 2177 *need_commit = false; 2178 2179 (void) sector_div(data_size, pool->sectors_per_block); 2180 2181 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2182 if (r) { 2183 DMERR("%s: failed to retrieve data device size", 2184 dm_device_name(pool->pool_md)); 2185 return r; 2186 } 2187 2188 if (data_size < sb_data_size) { 2189 DMERR("%s: pool target (%llu blocks) too small: expected %llu", 2190 dm_device_name(pool->pool_md), 2191 (unsigned long long)data_size, sb_data_size); 2192 return -EINVAL; 2193 2194 } else if (data_size > sb_data_size) { 2195 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2196 if (r) { 2197 DMERR("%s: failed to resize data device", 2198 dm_device_name(pool->pool_md)); 2199 set_pool_mode(pool, PM_READ_ONLY); 2200 return r; 2201 } 2202 2203 *need_commit = true; 2204 } 2205 2206 return 0; 2207 } 2208 2209 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) 2210 { 2211 int r; 2212 struct pool_c *pt = ti->private; 2213 struct pool *pool = pt->pool; 2214 dm_block_t metadata_dev_size, sb_metadata_dev_size; 2215 2216 *need_commit = false; 2217 2218 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev); 2219 2220 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); 2221 if (r) { 2222 DMERR("%s: failed to retrieve metadata device size", 2223 dm_device_name(pool->pool_md)); 2224 return r; 2225 } 2226 2227 if (metadata_dev_size < sb_metadata_dev_size) { 2228 DMERR("%s: metadata device (%llu blocks) too small: expected %llu", 2229 dm_device_name(pool->pool_md), 2230 metadata_dev_size, sb_metadata_dev_size); 2231 return -EINVAL; 2232 2233 } else if (metadata_dev_size > sb_metadata_dev_size) { 2234 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 2235 if (r) { 2236 DMERR("%s: failed to resize 
metadata device", 2237 dm_device_name(pool->pool_md)); 2238 return r; 2239 } 2240 2241 *need_commit = true; 2242 } 2243 2244 return 0; 2245 } 2246 2247 /* 2248 * Retrieves the number of blocks of the data device from 2249 * the superblock and compares it to the actual device size, 2250 * thus resizing the data device in case it has grown. 2251 * 2252 * This both copes with opening preallocated data devices in the ctr 2253 * being followed by a resume 2254 * -and- 2255 * calling the resume method individually after userspace has 2256 * grown the data device in reaction to a table event. 2257 */ 2258 static int pool_preresume(struct dm_target *ti) 2259 { 2260 int r; 2261 bool need_commit1, need_commit2; 2262 struct pool_c *pt = ti->private; 2263 struct pool *pool = pt->pool; 2264 2265 /* 2266 * Take control of the pool object. 2267 */ 2268 r = bind_control_target(pool, ti); 2269 if (r) 2270 return r; 2271 2272 r = maybe_resize_data_dev(ti, &need_commit1); 2273 if (r) 2274 return r; 2275 2276 r = maybe_resize_metadata_dev(ti, &need_commit2); 2277 if (r) 2278 return r; 2279 2280 if (need_commit1 || need_commit2) 2281 (void) commit(pool); 2282 2283 return 0; 2284 } 2285 2286 static void pool_resume(struct dm_target *ti) 2287 { 2288 struct pool_c *pt = ti->private; 2289 struct pool *pool = pt->pool; 2290 unsigned long flags; 2291 2292 spin_lock_irqsave(&pool->lock, flags); 2293 pool->low_water_triggered = 0; 2294 pool->no_free_space = 0; 2295 __requeue_bios(pool); 2296 spin_unlock_irqrestore(&pool->lock, flags); 2297 2298 do_waker(&pool->waker.work); 2299 } 2300 2301 static void pool_postsuspend(struct dm_target *ti) 2302 { 2303 struct pool_c *pt = ti->private; 2304 struct pool *pool = pt->pool; 2305 2306 cancel_delayed_work(&pool->waker); 2307 flush_workqueue(pool->wq); 2308 (void) commit(pool); 2309 } 2310 2311 static int check_arg_count(unsigned argc, unsigned args_required) 2312 { 2313 if (argc != args_required) { 2314 DMWARN("Message received with %u arguments instead of %u.", 2315 argc, args_required); 2316 return -EINVAL; 2317 } 2318 2319 return 0; 2320 } 2321 2322 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) 2323 { 2324 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && 2325 *dev_id <= MAX_DEV_ID) 2326 return 0; 2327 2328 if (warning) 2329 DMWARN("Message received with invalid device id: %s", arg); 2330 2331 return -EINVAL; 2332 } 2333 2334 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) 2335 { 2336 dm_thin_id dev_id; 2337 int r; 2338 2339 r = check_arg_count(argc, 2); 2340 if (r) 2341 return r; 2342 2343 r = read_dev_id(argv[1], &dev_id, 1); 2344 if (r) 2345 return r; 2346 2347 r = dm_pool_create_thin(pool->pmd, dev_id); 2348 if (r) { 2349 DMWARN("Creation of new thinly-provisioned device with id %s failed.", 2350 argv[1]); 2351 return r; 2352 } 2353 2354 return 0; 2355 } 2356 2357 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2358 { 2359 dm_thin_id dev_id; 2360 dm_thin_id origin_dev_id; 2361 int r; 2362 2363 r = check_arg_count(argc, 3); 2364 if (r) 2365 return r; 2366 2367 r = read_dev_id(argv[1], &dev_id, 1); 2368 if (r) 2369 return r; 2370 2371 r = read_dev_id(argv[2], &origin_dev_id, 1); 2372 if (r) 2373 return r; 2374 2375 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); 2376 if (r) { 2377 DMWARN("Creation of new snapshot %s of device %s failed.", 2378 argv[1], argv[2]); 2379 return r; 2380 } 2381 2382 return 0; 2383 } 2384 2385 static int process_delete_mesg(unsigned 
argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	if (r)
		DMWARN("Deletion of thin device %s failed.", argv[1]);

	return r;
}

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id old_id, new_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
		return -EINVAL;
	}

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
		return -EINVAL;
	}

	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
	if (r) {
		DMWARN("Failed to change transaction id from %s to %s.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	int r;

	r = check_arg_count(argc, 1);
	if (r)
		return r;

	(void) commit(pool);

	r = dm_pool_reserve_metadata_snap(pool->pmd);
	if (r)
		DMWARN("reserve_metadata_snap message failed.");

	return r;
}

static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	int r;

	r = check_arg_count(argc, 1);
	if (r)
		return r;

	r = dm_pool_release_metadata_snap(pool->pmd);
	if (r)
		DMWARN("release_metadata_snap message failed.");

	return r;
}

/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 *   reserve_metadata_snap
 *   release_metadata_snap
 */
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
		r = process_reserve_metadata_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "release_metadata_snap"))
		r = process_release_metadata_snap_mesg(argc, argv, pool);

	else
		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	if (!r)
		(void) commit(pool);

	return r;
}

static void emit_flags(struct pool_features *pf, char *result,
		       unsigned sz, unsigned maxlen)
{
	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
			 !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
	DMEMIT("%u ", count);

	if (!pf->zero_new_blocks)
		DMEMIT("skip_block_zeroing 
"); 2518 2519 if (!pf->discard_enabled) 2520 DMEMIT("ignore_discard "); 2521 2522 if (!pf->discard_passdown) 2523 DMEMIT("no_discard_passdown "); 2524 2525 if (pf->mode == PM_READ_ONLY) 2526 DMEMIT("read_only "); 2527 } 2528 2529 /* 2530 * Status line is: 2531 * <transaction id> <used metadata sectors>/<total metadata sectors> 2532 * <used data sectors>/<total data sectors> <held metadata root> 2533 */ 2534 static void pool_status(struct dm_target *ti, status_type_t type, 2535 unsigned status_flags, char *result, unsigned maxlen) 2536 { 2537 int r; 2538 unsigned sz = 0; 2539 uint64_t transaction_id; 2540 dm_block_t nr_free_blocks_data; 2541 dm_block_t nr_free_blocks_metadata; 2542 dm_block_t nr_blocks_data; 2543 dm_block_t nr_blocks_metadata; 2544 dm_block_t held_root; 2545 char buf[BDEVNAME_SIZE]; 2546 char buf2[BDEVNAME_SIZE]; 2547 struct pool_c *pt = ti->private; 2548 struct pool *pool = pt->pool; 2549 2550 switch (type) { 2551 case STATUSTYPE_INFO: 2552 if (get_pool_mode(pool) == PM_FAIL) { 2553 DMEMIT("Fail"); 2554 break; 2555 } 2556 2557 /* Commit to ensure statistics aren't out-of-date */ 2558 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 2559 (void) commit(pool); 2560 2561 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); 2562 if (r) { 2563 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d", 2564 dm_device_name(pool->pool_md), r); 2565 goto err; 2566 } 2567 2568 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); 2569 if (r) { 2570 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d", 2571 dm_device_name(pool->pool_md), r); 2572 goto err; 2573 } 2574 2575 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2576 if (r) { 2577 DMERR("%s: dm_pool_get_metadata_dev_size returned %d", 2578 dm_device_name(pool->pool_md), r); 2579 goto err; 2580 } 2581 2582 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); 2583 if (r) { 2584 DMERR("%s: dm_pool_get_free_block_count returned %d", 2585 dm_device_name(pool->pool_md), r); 2586 goto err; 2587 } 2588 2589 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2590 if (r) { 2591 DMERR("%s: dm_pool_get_data_dev_size returned %d", 2592 dm_device_name(pool->pool_md), r); 2593 goto err; 2594 } 2595 2596 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2597 if (r) { 2598 DMERR("%s: dm_pool_get_metadata_snap returned %d", 2599 dm_device_name(pool->pool_md), r); 2600 goto err; 2601 } 2602 2603 DMEMIT("%llu %llu/%llu %llu/%llu ", 2604 (unsigned long long)transaction_id, 2605 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2606 (unsigned long long)nr_blocks_metadata, 2607 (unsigned long long)(nr_blocks_data - nr_free_blocks_data), 2608 (unsigned long long)nr_blocks_data); 2609 2610 if (held_root) 2611 DMEMIT("%llu ", held_root); 2612 else 2613 DMEMIT("- "); 2614 2615 if (pool->pf.mode == PM_READ_ONLY) 2616 DMEMIT("ro "); 2617 else 2618 DMEMIT("rw "); 2619 2620 if (!pool->pf.discard_enabled) 2621 DMEMIT("ignore_discard"); 2622 else if (pool->pf.discard_passdown) 2623 DMEMIT("discard_passdown"); 2624 else 2625 DMEMIT("no_discard_passdown"); 2626 2627 break; 2628 2629 case STATUSTYPE_TABLE: 2630 DMEMIT("%s %s %lu %llu ", 2631 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), 2632 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 2633 (unsigned long)pool->sectors_per_block, 2634 (unsigned long long)pt->low_water_blocks); 2635 emit_flags(&pt->requested_pf, result, sz, maxlen); 2636 break; 2637 } 2638 return; 2639 2640 
err: 2641 DMEMIT("Error"); 2642 } 2643 2644 static int pool_iterate_devices(struct dm_target *ti, 2645 iterate_devices_callout_fn fn, void *data) 2646 { 2647 struct pool_c *pt = ti->private; 2648 2649 return fn(ti, pt->data_dev, 0, ti->len, data); 2650 } 2651 2652 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 2653 struct bio_vec *biovec, int max_size) 2654 { 2655 struct pool_c *pt = ti->private; 2656 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 2657 2658 if (!q->merge_bvec_fn) 2659 return max_size; 2660 2661 bvm->bi_bdev = pt->data_dev->bdev; 2662 2663 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2664 } 2665 2666 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) 2667 { 2668 struct pool *pool = pt->pool; 2669 struct queue_limits *data_limits; 2670 2671 limits->max_discard_sectors = pool->sectors_per_block; 2672 2673 /* 2674 * discard_granularity is just a hint, and not enforced. 2675 */ 2676 if (pt->adjusted_pf.discard_passdown) { 2677 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; 2678 limits->discard_granularity = data_limits->discard_granularity; 2679 } else 2680 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 2681 } 2682 2683 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2684 { 2685 struct pool_c *pt = ti->private; 2686 struct pool *pool = pt->pool; 2687 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 2688 2689 /* 2690 * If the system-determined stacked limits are compatible with the 2691 * pool's blocksize (io_opt is a factor) do not override them. 2692 */ 2693 if (io_opt_sectors < pool->sectors_per_block || 2694 do_div(io_opt_sectors, pool->sectors_per_block)) { 2695 blk_limits_io_min(limits, 0); 2696 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2697 } 2698 2699 /* 2700 * pt->adjusted_pf is a staging area for the actual features to use. 2701 * They get transferred to the live pool in bind_control_target() 2702 * called from pool_preresume(). 2703 */ 2704 if (!pt->adjusted_pf.discard_enabled) { 2705 /* 2706 * Must explicitly disallow stacking discard limits otherwise the 2707 * block layer will stack them if pool's data device has support. 2708 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the 2709 * user to see that, so make sure to set all discard limits to 0. 
		 */
		limits->discard_granularity = 0;
		return;
	}

	disable_passdown_if_not_supported(pt);

	set_discard_limits(pt, limits);
}

static struct target_type pool_target = {
	.name = "thin-pool",
	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
		    DM_TARGET_IMMUTABLE,
	.version = {1, 9, 0},
	.module = THIS_MODULE,
	.ctr = pool_ctr,
	.dtr = pool_dtr,
	.map = pool_map,
	.postsuspend = pool_postsuspend,
	.preresume = pool_preresume,
	.resume = pool_resume,
	.message = pool_message,
	.status = pool_status,
	.merge = pool_merge,
	.iterate_devices = pool_iterate_devices,
	.io_hints = pool_io_hints,
};

/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
static void thin_dtr(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	__pool_dec(tc->pool);
	dm_pool_close_thin_device(tc->td);
	dm_put_device(ti, tc->pool_dev);
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
	kfree(tc);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct thin_c *tc;
	struct dm_dev *pool_dev, *origin_dev;
	struct mapped_device *pool_md;

	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc != 2 && argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}

	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
	if (!tc) {
		ti->error = "Out of memory";
		r = -ENOMEM;
		goto out_unlock;
	}

	if (argc == 3) {
		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
		if (r) {
			ti->error = "Error opening origin device";
			goto bad_origin_dev;
		}
		tc->origin_dev = origin_dev;
	}

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
	if (r) {
		ti->error = "Error opening pool device";
		goto bad_pool_dev;
	}
	tc->pool_dev = pool_dev;

	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
		ti->error = "Invalid device id";
		r = -EINVAL;
		goto bad_common;
	}

	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
	if (!pool_md) {
		ti->error = "Couldn't get pool mapped device";
		r = -EINVAL;
		goto bad_common;
	}

	tc->pool = __pool_table_lookup(pool_md);
	if (!tc->pool) {
		ti->error = "Couldn't find pool object";
		r = -EINVAL;
		goto bad_pool_lookup;
	}
	__pool_inc(tc->pool);

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		ti->error = "Couldn't open thin device, Pool is in fail mode";
		r = -EINVAL;
		goto bad_thin_open;
	}

	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
	if (r) {
		ti->error = "Couldn't open thin internal device";
		goto bad_thin_open;
	}

	r =
dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 2841 if (r) 2842 goto bad_thin_open; 2843 2844 ti->num_flush_bios = 1; 2845 ti->flush_supported = true; 2846 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); 2847 2848 /* In case the pool supports discards, pass them on. */ 2849 ti->discard_zeroes_data_unsupported = true; 2850 if (tc->pool->pf.discard_enabled) { 2851 ti->discards_supported = true; 2852 ti->num_discard_bios = 1; 2853 /* Discard bios must be split on a block boundary */ 2854 ti->split_discard_bios = true; 2855 } 2856 2857 dm_put(pool_md); 2858 2859 mutex_unlock(&dm_thin_pool_table.mutex); 2860 2861 return 0; 2862 2863 bad_thin_open: 2864 __pool_dec(tc->pool); 2865 bad_pool_lookup: 2866 dm_put(pool_md); 2867 bad_common: 2868 dm_put_device(ti, tc->pool_dev); 2869 bad_pool_dev: 2870 if (tc->origin_dev) 2871 dm_put_device(ti, tc->origin_dev); 2872 bad_origin_dev: 2873 kfree(tc); 2874 out_unlock: 2875 mutex_unlock(&dm_thin_pool_table.mutex); 2876 2877 return r; 2878 } 2879 2880 static int thin_map(struct dm_target *ti, struct bio *bio) 2881 { 2882 bio->bi_sector = dm_target_offset(ti, bio->bi_sector); 2883 2884 return thin_bio_map(ti, bio); 2885 } 2886 2887 static int thin_endio(struct dm_target *ti, struct bio *bio, int err) 2888 { 2889 unsigned long flags; 2890 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 2891 struct list_head work; 2892 struct dm_thin_new_mapping *m, *tmp; 2893 struct pool *pool = h->tc->pool; 2894 2895 if (h->shared_read_entry) { 2896 INIT_LIST_HEAD(&work); 2897 dm_deferred_entry_dec(h->shared_read_entry, &work); 2898 2899 spin_lock_irqsave(&pool->lock, flags); 2900 list_for_each_entry_safe(m, tmp, &work, list) { 2901 list_del(&m->list); 2902 m->quiesced = 1; 2903 __maybe_add_mapping(m); 2904 } 2905 spin_unlock_irqrestore(&pool->lock, flags); 2906 } 2907 2908 if (h->all_io_entry) { 2909 INIT_LIST_HEAD(&work); 2910 dm_deferred_entry_dec(h->all_io_entry, &work); 2911 if (!list_empty(&work)) { 2912 spin_lock_irqsave(&pool->lock, flags); 2913 list_for_each_entry_safe(m, tmp, &work, list) 2914 list_add(&m->list, &pool->prepared_discards); 2915 spin_unlock_irqrestore(&pool->lock, flags); 2916 wake_worker(pool); 2917 } 2918 } 2919 2920 return 0; 2921 } 2922 2923 static void thin_postsuspend(struct dm_target *ti) 2924 { 2925 if (dm_noflush_suspending(ti)) 2926 requeue_io((struct thin_c *)ti->private); 2927 } 2928 2929 /* 2930 * <nr mapped sectors> <highest mapped sector> 2931 */ 2932 static void thin_status(struct dm_target *ti, status_type_t type, 2933 unsigned status_flags, char *result, unsigned maxlen) 2934 { 2935 int r; 2936 ssize_t sz = 0; 2937 dm_block_t mapped, highest; 2938 char buf[BDEVNAME_SIZE]; 2939 struct thin_c *tc = ti->private; 2940 2941 if (get_pool_mode(tc->pool) == PM_FAIL) { 2942 DMEMIT("Fail"); 2943 return; 2944 } 2945 2946 if (!tc->td) 2947 DMEMIT("-"); 2948 else { 2949 switch (type) { 2950 case STATUSTYPE_INFO: 2951 r = dm_thin_get_mapped_count(tc->td, &mapped); 2952 if (r) { 2953 DMERR("dm_thin_get_mapped_count returned %d", r); 2954 goto err; 2955 } 2956 2957 r = dm_thin_get_highest_mapped_block(tc->td, &highest); 2958 if (r < 0) { 2959 DMERR("dm_thin_get_highest_mapped_block returned %d", r); 2960 goto err; 2961 } 2962 2963 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); 2964 if (r) 2965 DMEMIT("%llu", ((highest + 1) * 2966 tc->pool->sectors_per_block) - 1); 2967 else 2968 DMEMIT("-"); 2969 break; 2970 2971 case STATUSTYPE_TABLE: 2972 DMEMIT("%s %lu", 2973 format_dev_t(buf, 
tc->pool_dev->bdev->bd_dev), 2974 (unsigned long) tc->dev_id); 2975 if (tc->origin_dev) 2976 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); 2977 break; 2978 } 2979 } 2980 2981 return; 2982 2983 err: 2984 DMEMIT("Error"); 2985 } 2986 2987 static int thin_iterate_devices(struct dm_target *ti, 2988 iterate_devices_callout_fn fn, void *data) 2989 { 2990 sector_t blocks; 2991 struct thin_c *tc = ti->private; 2992 struct pool *pool = tc->pool; 2993 2994 /* 2995 * We can't call dm_pool_get_data_dev_size() since that blocks. So 2996 * we follow a more convoluted path through to the pool's target. 2997 */ 2998 if (!pool->ti) 2999 return 0; /* nothing is bound */ 3000 3001 blocks = pool->ti->len; 3002 (void) sector_div(blocks, pool->sectors_per_block); 3003 if (blocks) 3004 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); 3005 3006 return 0; 3007 } 3008 3009 static struct target_type thin_target = { 3010 .name = "thin", 3011 .version = {1, 9, 0}, 3012 .module = THIS_MODULE, 3013 .ctr = thin_ctr, 3014 .dtr = thin_dtr, 3015 .map = thin_map, 3016 .end_io = thin_endio, 3017 .postsuspend = thin_postsuspend, 3018 .status = thin_status, 3019 .iterate_devices = thin_iterate_devices, 3020 }; 3021 3022 /*----------------------------------------------------------------*/ 3023 3024 static int __init dm_thin_init(void) 3025 { 3026 int r; 3027 3028 pool_table_init(); 3029 3030 r = dm_register_target(&thin_target); 3031 if (r) 3032 return r; 3033 3034 r = dm_register_target(&pool_target); 3035 if (r) 3036 goto bad_pool_target; 3037 3038 r = -ENOMEM; 3039 3040 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); 3041 if (!_new_mapping_cache) 3042 goto bad_new_mapping_cache; 3043 3044 return 0; 3045 3046 bad_new_mapping_cache: 3047 dm_unregister_target(&pool_target); 3048 bad_pool_target: 3049 dm_unregister_target(&thin_target); 3050 3051 return r; 3052 } 3053 3054 static void dm_thin_exit(void) 3055 { 3056 dm_unregister_target(&thin_target); 3057 dm_unregister_target(&pool_target); 3058 3059 kmem_cache_destroy(_new_mapping_cache); 3060 } 3061 3062 module_init(dm_thin_init); 3063 module_exit(dm_thin_exit); 3064 3065 MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); 3066 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3067 MODULE_LICENSE("GPL"); 3068
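
/*
 * Illustrative usage (editor's sketch, not part of the driver): the
 * commands below show one way the two targets above are typically driven
 * from userspace with dmsetup. All device paths, sizes and internal
 * device ids are made up.
 *
 *   # Pool with 64KB (128-sector) blocks and a 1024-block low water mark:
 *   dmsetup create pool --table \
 *     "0 20971520 thin-pool /dev/sdb1 /dev/sdb2 128 1024 1 skip_block_zeroing"
 *
 *   # Create thin device 0 in the pool, then activate it as a 1GB volume:
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
 *
 *   # Snapshot it as internal device 1 (suspend the origin around the message):
 *   dmsetup suspend /dev/mapper/thin0
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup resume /dev/mapper/thin0
 *   dmsetup create snap0 --table "0 2097152 thin /dev/mapper/pool 1"
 */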