/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60

static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug further io to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * skipped if the io covers the whole block.  (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, i.e. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 *   one).  This will contain the data as it was before the io that
 *   triggered the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 *   the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that the data block in the snapshot device is
 * shared even after the write to the origin has broken sharing.  I suspect
 * data blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
                           dm_block_t b, struct dm_cell_key *key)
{
        key->virtual = 0;
        key->dev = dm_thin_dev_id(td);
        key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
                              struct dm_cell_key *key)
{
        key->virtual = 1;
        key->dev = dm_thin_dev_id(td);
        key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in four modes, ordered from least to most degraded so the
 * modes can be compared.
 */
enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
        PM_READ_ONLY,           /* metadata may not be changed */
        PM_FAIL,                /* all I/O fails */
};

struct pool_features {
        enum pool_mode mode;

        bool zero_new_blocks:1;
        bool discard_enabled:1;
        bool discard_passdown:1;
        bool error_if_no_space:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
        struct list_head list;
        struct dm_target *ti;   /* Only set if a pool target is bound */

        struct mapped_device *pool_md;
        struct block_device *md_dev;
        struct dm_pool_metadata *pmd;

        dm_block_t low_water_blocks;
        uint32_t sectors_per_block;
        int sectors_per_block_shift;

        struct pool_features pf;
        bool low_water_triggered:1;     /* A dm event has been sent */

        struct dm_bio_prison *prison;
        struct dm_kcopyd_client *copier;

        struct workqueue_struct *wq;
        struct work_struct worker;
        struct delayed_work waker;
        struct delayed_work no_space_timeout;

        unsigned long last_commit_jiffies;
        unsigned ref_count;

        spinlock_t lock;
        struct bio_list deferred_flush_bios;
        struct list_head prepared_mappings;
        struct list_head prepared_discards;
        struct list_head active_thins;

        struct dm_deferred_set *shared_read_ds;
        struct dm_deferred_set *all_io_ds;

        struct dm_thin_new_mapping *next_mapping;
        mempool_t *mapping_pool;

        process_bio_fn process_bio;
        process_bio_fn process_discard;

        process_mapping_fn process_prepared_mapping;
        process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void metadata_operation_failed(struct pool *pool, const char *op, int r);

/*
 * Target context for a pool.
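 * There is one of these per pool target (it is ti->private); it is bound
 * to a shared struct pool.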
 */
struct pool_c {
        struct dm_target *ti;
        struct pool *pool;
        struct dm_dev *data_dev;
        struct dm_dev *metadata_dev;
        struct dm_target_callbacks callbacks;

        dm_block_t low_water_blocks;
        struct pool_features requested_pf; /* Features requested during table load */
        struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
        struct list_head list;
        struct dm_dev *pool_dev;
        struct dm_dev *origin_dev;
        dm_thin_id dev_id;

        struct pool *pool;
        struct dm_thin_device *td;
        bool requeue_mode:1;
        spinlock_t lock;
        struct bio_list deferred_bio_list;
        struct bio_list retry_on_resume_list;
        struct rb_root sort_bio_list; /* sorted list of deferred bios */

        /*
         * Ensures the thin is not destroyed until the worker has finished
         * iterating the active_thins list.
         */
        atomic_t refcount;
        struct completion can_destroy;
};

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
        queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
                      struct dm_bio_prison_cell **cell_result)
{
        int r;
        struct dm_bio_prison_cell *cell_prealloc;

        /*
         * Allocate a cell from the prison's mempool.
         * This might block but it can't fail.
         */
        cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

        r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
        if (r)
                /*
                 * We reused an old cell; we can get rid of
                 * the new one.
                 */
                dm_bio_prison_free_cell(pool->prison, cell_prealloc);

        return r;
}

static void cell_release(struct pool *pool,
                         struct dm_bio_prison_cell *cell,
                         struct bio_list *bios)
{
        dm_cell_release(pool->prison, cell, bios);
        dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
                                   struct dm_bio_prison_cell *cell,
                                   struct bio_list *bios)
{
        dm_cell_release_no_holder(pool->prison, cell, bios);
        dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_defer_no_holder_no_free(struct thin_c *tc,
                                         struct dm_bio_prison_cell *cell)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&tc->lock, flags);
        dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list);
        spin_unlock_irqrestore(&tc->lock, flags);

        wake_worker(pool);
}

static void cell_error(struct pool *pool,
                       struct dm_bio_prison_cell *cell)
{
        dm_cell_error(pool->prison, cell);
        dm_bio_prison_free_cell(pool->prison, cell);
}

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
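 * Access is serialised by dm_thin_pool_table.mutex; the __pool_table_*
 * helpers below assert that it is held.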
 */
static struct dm_thin_pool_table {
        struct mutex mutex;
        struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
        mutex_init(&dm_thin_pool_table.mutex);
        INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
        list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
        list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
        struct pool *pool = NULL, *tmp;

        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
                if (tmp->pool_md == md) {
                        pool = tmp;
                        break;
                }
        }

        return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
        struct pool *pool = NULL, *tmp;

        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
                if (tmp->md_dev == md_dev) {
                        pool = tmp;
                        break;
                }
        }

        return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
        struct thin_c *tc;
        struct dm_deferred_entry *shared_read_entry;
        struct dm_deferred_entry *all_io_entry;
        struct dm_thin_new_mapping *overwrite_mapping;
        struct rb_node rb_node;
};

static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
        struct bio *bio;
        struct bio_list bios;
        unsigned long flags;

        bio_list_init(&bios);

        spin_lock_irqsave(&tc->lock, flags);
        bio_list_merge(&bios, master);
        bio_list_init(master);
        spin_unlock_irqrestore(&tc->lock, flags);

        while ((bio = bio_list_pop(&bios)))
                bio_endio(bio, DM_ENDIO_REQUEUE);
}

static void requeue_io(struct thin_c *tc)
{
        requeue_bio_list(tc, &tc->deferred_bio_list);
        requeue_bio_list(tc, &tc->retry_on_resume_list);
}

static void error_thin_retry_list(struct thin_c *tc)
{
        struct bio *bio;
        unsigned long flags;
        struct bio_list bios;

        bio_list_init(&bios);

        spin_lock_irqsave(&tc->lock, flags);
        bio_list_merge(&bios, &tc->retry_on_resume_list);
        bio_list_init(&tc->retry_on_resume_list);
        spin_unlock_irqrestore(&tc->lock, flags);

        while ((bio = bio_list_pop(&bios)))
                bio_io_error(bio);
}

static void error_retry_list(struct pool *pool)
{
        struct thin_c *tc;

        rcu_read_lock();
        list_for_each_entry_rcu(tc, &pool->active_thins, list)
                error_thin_retry_list(tc);
        rcu_read_unlock();
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static bool block_size_is_power_of_two(struct pool *pool)
{
        return pool->sectors_per_block_shift >= 0;
}

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
        struct pool *pool = tc->pool;
        sector_t block_nr = bio->bi_iter.bi_sector;

        if (block_size_is_power_of_two(pool))
                block_nr >>= pool->sectors_per_block_shift;
        else
                (void) sector_div(block_nr, pool->sectors_per_block);

        return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
        struct pool *pool = tc->pool;
        sector_t bi_sector = bio->bi_iter.bi_sector;

        bio->bi_bdev = tc->pool_dev->bdev;
        if (block_size_is_power_of_two(pool))
                bio->bi_iter.bi_sector =
                        (block << pool->sectors_per_block_shift) |
                        (bi_sector & (pool->sectors_per_block - 1));
        else
                bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
                        sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
        bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
        return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
                dm_thin_changed_this_transaction(tc->td);
}

static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
        struct dm_thin_endio_hook *h;

        if (bio->bi_rw & REQ_DISCARD)
                return;

        h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        if (!bio_triggers_commit(tc, bio)) {
                generic_make_request(bio);
                return;
        }

        /*
         * Complete bio with an error if earlier I/O caused changes to
         * the metadata that can't be committed, e.g. due to I/O errors
         * on the metadata device.
         */
        if (dm_thin_aborted_changes(tc->td)) {
                bio_io_error(bio);
                return;
        }

        /*
         * Batch together any bios that trigger commits and then issue a
         * single commit for them in process_deferred_bios().
         */
        spin_lock_irqsave(&pool->lock, flags);
        bio_list_add(&pool->deferred_flush_bios, bio);
        spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
        remap_to_origin(tc, bio);
        issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
                            dm_block_t block)
{
        remap(tc, bio, block);
        issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
        struct list_head list;

        bool quiesced:1;
        bool prepared:1;
        bool pass_discard:1;
        bool definitely_not_shared:1;

        int err;
        struct thin_c *tc;
        dm_block_t virt_block;
        dm_block_t data_block;
        struct dm_bio_prison_cell *cell, *cell2;

        /*
         * If the bio covers the whole area of a block then we can avoid
         * zeroing or copying.  Instead this bio is hooked.  The bio will
         * still be in the cell, so care has to be taken to avoid issuing
         * the bio twice.
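         * The original completion is stashed in saved_bi_end_io so that
         * process_prepared_mapping() can restore it before ending the bio.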
         */
        struct bio *bio;
        bio_end_io_t *saved_bi_end_io;
};

static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
        struct pool *pool = m->tc->pool;

        if (m->quiesced && m->prepared) {
                list_add_tail(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
        unsigned long flags;
        struct dm_thin_new_mapping *m = context;
        struct pool *pool = m->tc->pool;

        m->err = read_err || write_err ? -EIO : 0;

        spin_lock_irqsave(&pool->lock, flags);
        m->prepared = true;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
        unsigned long flags;
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct dm_thin_new_mapping *m = h->overwrite_mapping;
        struct pool *pool = m->tc->pool;

        m->err = err;

        spin_lock_irqsave(&pool->lock, flags);
        m->prepared = true;
        __maybe_add_mapping(m);
        spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bio_list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&tc->lock, flags);
        cell_release(pool, cell, &tc->deferred_bio_list);
        spin_unlock_irqrestore(&tc->lock, flags);

        wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits the original holder of the cell.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        spin_lock_irqsave(&tc->lock, flags);
        cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
        spin_unlock_irqrestore(&tc->lock, flags);

        wake_worker(pool);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
        if (m->bio) {
                m->bio->bi_end_io = m->saved_bi_end_io;
                atomic_inc(&m->bio->bi_remaining);
        }
        cell_error(m->tc->pool, m->cell);
        list_del(&m->list);
        mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;
        struct bio *bio;
        int r;

        bio = m->bio;
        if (bio) {
                bio->bi_end_io = m->saved_bi_end_io;
                atomic_inc(&bio->bi_remaining);
        }

        if (m->err) {
                cell_error(pool, m->cell);
                goto out;
        }

        /*
         * Commit the prepared block into the mapping btree.
         * Any I/O for this block arriving after this point will get
         * remapped to it directly.
         */
        r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
        if (r) {
                metadata_operation_failed(pool, "dm_thin_insert_block", r);
                cell_error(pool, m->cell);
                goto out;
        }

        /*
         * Release any bios held while the block was being provisioned.
         * If we are processing a write bio that completely covers the block,
         * we already processed it so can ignore it now when processing
         * the bios in the cell.
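         * In that case the holder is completed here with bio_endio() and
         * only the remaining bios in the cell are deferred.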
         */
        if (bio) {
                cell_defer_no_holder(tc, m->cell);
                bio_endio(bio, 0);
        } else
                cell_defer(tc, m->cell);

out:
        list_del(&m->list);
        mempool_free(m, pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
        struct thin_c *tc = m->tc;

        bio_io_error(m->bio);
        cell_defer_no_holder(tc, m->cell);
        cell_defer_no_holder(tc, m->cell2);
        mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
        struct thin_c *tc = m->tc;

        inc_all_io_entry(tc->pool, m->bio);
        cell_defer_no_holder(tc, m->cell);
        cell_defer_no_holder(tc, m->cell2);

        if (m->pass_discard)
                if (m->definitely_not_shared)
                        remap_and_issue(tc, m->bio, m->data_block);
                else {
                        bool used = false;
                        if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
                                bio_endio(m->bio, 0);
                        else
                                remap_and_issue(tc, m->bio, m->data_block);
                }
        else
                bio_endio(m->bio, 0);

        mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
        int r;
        struct thin_c *tc = m->tc;

        r = dm_thin_remove_block(tc->td, m->virt_block);
        if (r)
                DMERR_LIMIT("dm_thin_remove_block() failed");

        process_prepared_discard_passdown(m);
}

static void process_prepared(struct pool *pool, struct list_head *head,
                             process_mapping_fn *fn)
{
        unsigned long flags;
        struct list_head maps;
        struct dm_thin_new_mapping *m, *tmp;

        INIT_LIST_HEAD(&maps);
        spin_lock_irqsave(&pool->lock, flags);
        list_splice_init(head, &maps);
        spin_unlock_irqrestore(&pool->lock, flags);

        list_for_each_entry_safe(m, tmp, &maps, list)
                (*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
        return bio->bi_iter.bi_size ==
                (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
        return (bio_data_dir(bio) == WRITE) &&
                io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
                               bio_end_io_t *fn)
{
        *save = bio->bi_end_io;
        bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
        if (pool->next_mapping)
                return 0;

        pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

        return pool->next_mapping ?
                0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
        struct dm_thin_new_mapping *m = pool->next_mapping;

        BUG_ON(!pool->next_mapping);

        memset(m, 0, sizeof(struct dm_thin_new_mapping));
        INIT_LIST_HEAD(&m->list);
        m->bio = NULL;

        pool->next_mapping = NULL;

        return m;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
                          struct dm_dev *origin, dm_block_t data_origin,
                          dm_block_t data_dest,
                          struct dm_bio_prison_cell *cell, struct bio *bio)
{
        int r;
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);

        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_dest;
        m->cell = cell;

        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
                m->quiesced = true;

        /*
         * IO to pool_dev remaps to the pool target's data_dev.
         *
         * If the whole block of data is being overwritten, we can issue the
         * bio immediately. Otherwise we use kcopyd to clone the data first.
         */
        if (io_overwrites_block(pool, bio)) {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

                h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
                inc_all_io_entry(pool, bio);
                remap_and_issue(tc, bio, data_dest);
        } else {
                struct dm_io_region from, to;

                from.bdev = origin->bdev;
                from.sector = data_origin * pool->sectors_per_block;
                from.count = pool->sectors_per_block;

                to.bdev = tc->pool_dev->bdev;
                to.sector = data_dest * pool->sectors_per_block;
                to.count = pool->sectors_per_block;

                r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
                                   0, copy_complete, m);
                if (r < 0) {
                        mempool_free(m, pool->mapping_pool);
                        DMERR_LIMIT("dm_kcopyd_copy() failed");
                        cell_error(pool, cell);
                }
        }
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
                                   dm_block_t data_origin, dm_block_t data_dest,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
{
        schedule_copy(tc, virt_block, tc->pool_dev,
                      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
                                   dm_block_t data_dest,
                                   struct dm_bio_prison_cell *cell, struct bio *bio)
{
        schedule_copy(tc, virt_block, tc->origin_dev,
                      virt_block, data_dest, cell, bio);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
                          dm_block_t data_block, struct dm_bio_prison_cell *cell,
                          struct bio *bio)
{
        struct pool *pool = tc->pool;
        struct dm_thin_new_mapping *m = get_next_mapping(pool);

        m->quiesced = true;
        m->prepared = false;
        m->tc = tc;
        m->virt_block = virt_block;
        m->data_block = data_block;
        m->cell = cell;

        /*
         * If the whole block of data is being overwritten or we are not
         * zeroing pre-existing data, we can issue the bio immediately.
         * Otherwise we use kcopyd to zero the data first.
         */
        if (!pool->pf.zero_new_blocks)
                process_prepared_mapping(m);

        else if (io_overwrites_block(pool, bio)) {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

                h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
                inc_all_io_entry(pool, bio);
                remap_and_issue(tc, bio, data_block);
        } else {
                int r;
                struct dm_io_region to;

                to.bdev = tc->pool_dev->bdev;
                to.sector = data_block * pool->sectors_per_block;
                to.count = pool->sectors_per_block;

                r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
                if (r < 0) {
                        mempool_free(m, pool->mapping_pool);
                        DMERR_LIMIT("dm_kcopyd_zero() failed");
                        cell_error(pool, cell);
                }
        }
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit(struct pool *pool)
{
        int r;

        if (get_pool_mode(pool) >= PM_READ_ONLY)
                return -EINVAL;

        r = dm_pool_commit_metadata(pool->pmd);
        if (r)
                metadata_operation_failed(pool, "dm_pool_commit_metadata", r);

        return r;
}

static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
        unsigned long flags;

        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
                DMWARN("%s: reached low water mark for data device: sending event.",
                       dm_device_name(pool->pool_md));
                spin_lock_irqsave(&pool->lock, flags);
                pool->low_water_triggered = true;
                spin_unlock_irqrestore(&pool->lock, flags);
                dm_table_event(pool->ti->table);
        }
}

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
        int r;
        dm_block_t free_blocks;
        struct pool *pool = tc->pool;

        if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
                return -EINVAL;

        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
        if (r) {
                metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                return r;
        }

        check_low_water_mark(pool, free_blocks);

        if (!free_blocks) {
                /*
                 * Try to commit to see if that will free up some
                 * more space.
                 */
                r = commit(pool);
                if (r)
                        return r;

                r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
                if (r) {
                        metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
                        return r;
                }

                if (!free_blocks) {
                        set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
                        return -ENOSPC;
                }
        }

        r = dm_pool_alloc_data_block(pool->pmd, result);
        if (r) {
                metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
                return r;
        }

        return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
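 * The bios are parked on the per-thin retry_on_resume_list and are later
 * either reissued by requeue_bios() or errored by error_retry_list().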
 */
static void retry_on_resume(struct bio *bio)
{
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        struct thin_c *tc = h->tc;
        unsigned long flags;

        spin_lock_irqsave(&tc->lock, flags);
        bio_list_add(&tc->retry_on_resume_list, bio);
        spin_unlock_irqrestore(&tc->lock, flags);
}

static bool should_error_unserviceable_bio(struct pool *pool)
{
        enum pool_mode m = get_pool_mode(pool);

        switch (m) {
        case PM_WRITE:
                /* Shouldn't get here */
                DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
                return true;

        case PM_OUT_OF_DATA_SPACE:
                return pool->pf.error_if_no_space;

        case PM_READ_ONLY:
        case PM_FAIL:
                return true;
        default:
                /* Shouldn't get here */
                DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
                return true;
        }
}

static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
        if (should_error_unserviceable_bio(pool))
                bio_io_error(bio);
        else
                retry_on_resume(bio);
}

static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
{
        struct bio *bio;
        struct bio_list bios;

        if (should_error_unserviceable_bio(pool)) {
                cell_error(pool, cell);
                return;
        }

        bio_list_init(&bios);
        cell_release(pool, cell, &bios);

        if (should_error_unserviceable_bio(pool))
                while ((bio = bio_list_pop(&bios)))
                        bio_io_error(bio);
        else
                while ((bio = bio_list_pop(&bios)))
                        retry_on_resume(bio);
}

static void process_discard(struct thin_c *tc, struct bio *bio)
{
        int r;
        unsigned long flags;
        struct pool *pool = tc->pool;
        struct dm_bio_prison_cell *cell, *cell2;
        struct dm_cell_key key, key2;
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_thin_lookup_result lookup_result;
        struct dm_thin_new_mapping *m;

        build_virtual_key(tc->td, block, &key);
        if (bio_detain(tc->pool, &key, bio, &cell))
                return;

        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
        switch (r) {
        case 0:
                /*
                 * Check nobody is fiddling with this pool block.  This can
                 * happen if someone's in the process of breaking sharing
                 * on this block.
                 */
                build_data_key(tc->td, lookup_result.block, &key2);
                if (bio_detain(tc->pool, &key2, bio, &cell2)) {
                        cell_defer_no_holder(tc, cell);
                        break;
                }

                if (io_overlaps_block(pool, bio)) {
                        /*
                         * IO may still be going to the destination block.  We must
                         * quiesce before we can do the removal.
                         */
                        m = get_next_mapping(pool);
                        m->tc = tc;
                        m->pass_discard = pool->pf.discard_passdown;
                        m->definitely_not_shared = !lookup_result.shared;
                        m->virt_block = block;
                        m->data_block = lookup_result.block;
                        m->cell = cell;
                        m->cell2 = cell2;
                        m->bio = bio;

                        if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
                                spin_lock_irqsave(&pool->lock, flags);
                                list_add_tail(&m->list, &pool->prepared_discards);
                                spin_unlock_irqrestore(&pool->lock, flags);
                                wake_worker(pool);
                        }
                } else {
                        inc_all_io_entry(pool, bio);
                        cell_defer_no_holder(tc, cell);
                        cell_defer_no_holder(tc, cell2);

                        /*
                         * The DM core makes sure that the discard doesn't span
                         * a block boundary.
                         * So we submit the discard of a partial block
                         * appropriately.
                         */
                        if ((!lookup_result.shared) && pool->pf.discard_passdown)
                                remap_and_issue(tc, bio, lookup_result.block);
                        else
                                bio_endio(bio, 0);
                }
                break;

        case -ENODATA:
                /*
                 * It isn't provisioned, just forget it.
                 */
                cell_defer_no_holder(tc, cell);
                bio_endio(bio, 0);
                break;

        default:
                DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
                            __func__, r);
                cell_defer_no_holder(tc, cell);
                bio_io_error(bio);
                break;
        }
}

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
                          struct dm_cell_key *key,
                          struct dm_thin_lookup_result *lookup_result,
                          struct dm_bio_prison_cell *cell)
{
        int r;
        dm_block_t data_block;
        struct pool *pool = tc->pool;

        r = alloc_data_block(tc, &data_block);
        switch (r) {
        case 0:
                schedule_internal_copy(tc, block, lookup_result->block,
                                       data_block, cell, bio);
                break;

        case -ENOSPC:
                retry_bios_on_resume(pool, cell);
                break;

        default:
                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
                            __func__, r);
                cell_error(pool, cell);
                break;
        }
}

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
                               dm_block_t block,
                               struct dm_thin_lookup_result *lookup_result)
{
        struct dm_bio_prison_cell *cell;
        struct pool *pool = tc->pool;
        struct dm_cell_key key;

        /*
         * If cell is already occupied, then sharing is already in the process
         * of being broken so we have nothing further to do here.
         */
        build_data_key(tc->td, lookup_result->block, &key);
        if (bio_detain(pool, &key, bio, &cell))
                return;

        if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size)
                break_sharing(tc, bio, block, &key, lookup_result, cell);
        else {
                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

                h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
                inc_all_io_entry(pool, bio);
                cell_defer_no_holder(tc, cell);

                remap_and_issue(tc, bio, lookup_result->block);
        }
}

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
                            struct dm_bio_prison_cell *cell)
{
        int r;
        dm_block_t data_block;
        struct pool *pool = tc->pool;

        /*
         * Remap empty bios (flushes) immediately, without provisioning.
         */
        if (!bio->bi_iter.bi_size) {
                inc_all_io_entry(pool, bio);
                cell_defer_no_holder(tc, cell);

                remap_and_issue(tc, bio, 0);
                return;
        }

        /*
         * Fill read bios with zeroes and complete them immediately.
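         * The block has no backing data yet, so there is nothing to read
         * from the data device.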
         */
        if (bio_data_dir(bio) == READ) {
                zero_fill_bio(bio);
                cell_defer_no_holder(tc, cell);
                bio_endio(bio, 0);
                return;
        }

        r = alloc_data_block(tc, &data_block);
        switch (r) {
        case 0:
                if (tc->origin_dev)
                        schedule_external_copy(tc, block, data_block, cell, bio);
                else
                        schedule_zero(tc, block, data_block, cell, bio);
                break;

        case -ENOSPC:
                retry_bios_on_resume(pool, cell);
                break;

        default:
                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
                            __func__, r);
                cell_error(pool, cell);
                break;
        }
}

static void process_bio(struct thin_c *tc, struct bio *bio)
{
        int r;
        struct pool *pool = tc->pool;
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_bio_prison_cell *cell;
        struct dm_cell_key key;
        struct dm_thin_lookup_result lookup_result;

        /*
         * If cell is already occupied, then the block is already
         * being provisioned so we have nothing further to do here.
         */
        build_virtual_key(tc->td, block, &key);
        if (bio_detain(pool, &key, bio, &cell))
                return;

        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
        switch (r) {
        case 0:
                if (lookup_result.shared) {
                        process_shared_bio(tc, bio, block, &lookup_result);
                        cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */
                } else {
                        inc_all_io_entry(pool, bio);
                        cell_defer_no_holder(tc, cell);

                        remap_and_issue(tc, bio, lookup_result.block);
                }
                break;

        case -ENODATA:
                if (bio_data_dir(bio) == READ && tc->origin_dev) {
                        inc_all_io_entry(pool, bio);
                        cell_defer_no_holder(tc, cell);

                        remap_to_origin_and_issue(tc, bio);
                } else
                        provision_block(tc, bio, block, cell);
                break;

        default:
                DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
                            __func__, r);
                cell_defer_no_holder(tc, cell);
                bio_io_error(bio);
                break;
        }
}

static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
{
        int r;
        int rw = bio_data_dir(bio);
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_thin_lookup_result lookup_result;

        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
        switch (r) {
        case 0:
                if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size)
                        handle_unserviceable_bio(tc->pool, bio);
                else {
                        inc_all_io_entry(tc->pool, bio);
                        remap_and_issue(tc, bio, lookup_result.block);
                }
                break;

        case -ENODATA:
                if (rw != READ) {
                        handle_unserviceable_bio(tc->pool, bio);
                        break;
                }

                if (tc->origin_dev) {
                        inc_all_io_entry(tc->pool, bio);
                        remap_to_origin_and_issue(tc, bio);
                        break;
                }

                zero_fill_bio(bio);
                bio_endio(bio, 0);
                break;

        default:
                DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
                            __func__, r);
                bio_io_error(bio);
                break;
        }
}

static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
        bio_endio(bio, 0);
}

static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
        bio_io_error(bio);
}

/*
 * FIXME: should we also commit due to size of transaction, measured in
 * metadata blocks?
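 * For now the time-based check below is all we have; its first clause
 * also forces a commit if jiffies has wrapped around.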
 */
static int need_commit_due_to_time(struct pool *pool)
{
        return jiffies < pool->last_commit_jiffies ||
               jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
}

#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))

static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
{
        struct rb_node **rbp, *parent;
        struct dm_thin_endio_hook *pbd;
        sector_t bi_sector = bio->bi_iter.bi_sector;

        rbp = &tc->sort_bio_list.rb_node;
        parent = NULL;
        while (*rbp) {
                parent = *rbp;
                pbd = thin_pbd(parent);

                if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
                        rbp = &(*rbp)->rb_left;
                else
                        rbp = &(*rbp)->rb_right;
        }

        pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
        rb_link_node(&pbd->rb_node, parent, rbp);
        rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
}

static void __extract_sorted_bios(struct thin_c *tc)
{
        struct rb_node *node;
        struct dm_thin_endio_hook *pbd;
        struct bio *bio;

        for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
                pbd = thin_pbd(node);
                bio = thin_bio(pbd);

                bio_list_add(&tc->deferred_bio_list, bio);
                rb_erase(&pbd->rb_node, &tc->sort_bio_list);
        }

        WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
}

static void __sort_thin_deferred_bios(struct thin_c *tc)
{
        struct bio *bio;
        struct bio_list bios;

        bio_list_init(&bios);
        bio_list_merge(&bios, &tc->deferred_bio_list);
        bio_list_init(&tc->deferred_bio_list);

        /* Sort deferred_bio_list using rb-tree */
        while ((bio = bio_list_pop(&bios)))
                __thin_bio_rb_add(tc, bio);

        /*
         * Transfer the sorted bios in sort_bio_list back to
         * deferred_bio_list to allow lockless submission of
         * all bios.
         */
        __extract_sorted_bios(tc);
}

static void process_thin_deferred_bios(struct thin_c *tc)
{
        struct pool *pool = tc->pool;
        unsigned long flags;
        struct bio *bio;
        struct bio_list bios;
        struct blk_plug plug;

        if (tc->requeue_mode) {
                requeue_bio_list(tc, &tc->deferred_bio_list);
                return;
        }

        bio_list_init(&bios);

        spin_lock_irqsave(&tc->lock, flags);

        if (bio_list_empty(&tc->deferred_bio_list)) {
                spin_unlock_irqrestore(&tc->lock, flags);
                return;
        }

        __sort_thin_deferred_bios(tc);

        bio_list_merge(&bios, &tc->deferred_bio_list);
        bio_list_init(&tc->deferred_bio_list);

        spin_unlock_irqrestore(&tc->lock, flags);

        blk_start_plug(&plug);
        while ((bio = bio_list_pop(&bios))) {
                /*
                 * If we've got no free new_mapping structs, and processing
                 * this bio might require one, we pause until there are some
                 * prepared mappings to process.
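                 * The current bio and the remainder of the list are pushed
                 * back onto deferred_bio_list so nothing is dropped.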
                 */
                if (ensure_next_mapping(pool)) {
                        spin_lock_irqsave(&tc->lock, flags);
                        bio_list_add(&tc->deferred_bio_list, bio);
                        bio_list_merge(&tc->deferred_bio_list, &bios);
                        spin_unlock_irqrestore(&tc->lock, flags);
                        break;
                }

                if (bio->bi_rw & REQ_DISCARD)
                        pool->process_discard(tc, bio);
                else
                        pool->process_bio(tc, bio);
        }
        blk_finish_plug(&plug);
}

static void thin_get(struct thin_c *tc);
static void thin_put(struct thin_c *tc);

/*
 * We can't hold rcu_read_lock() around code that can block.  So we
 * find a thin with the rcu lock held; bump a refcount; then drop
 * the lock.
 */
static struct thin_c *get_first_thin(struct pool *pool)
{
        struct thin_c *tc = NULL;

        rcu_read_lock();
        if (!list_empty(&pool->active_thins)) {
                tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
                thin_get(tc);
        }
        rcu_read_unlock();

        return tc;
}

static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
{
        struct thin_c *old_tc = tc;

        rcu_read_lock();
        list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
                thin_get(tc);
                thin_put(old_tc);
                rcu_read_unlock();
                return tc;
        }
        thin_put(old_tc);
        rcu_read_unlock();

        return NULL;
}

static void process_deferred_bios(struct pool *pool)
{
        unsigned long flags;
        struct bio *bio;
        struct bio_list bios;
        struct thin_c *tc;

        tc = get_first_thin(pool);
        while (tc) {
                process_thin_deferred_bios(tc);
                tc = get_next_thin(pool, tc);
        }

        /*
         * If there are any deferred flush bios, we must commit
         * the metadata before issuing them.
         */
        bio_list_init(&bios);
        spin_lock_irqsave(&pool->lock, flags);
        bio_list_merge(&bios, &pool->deferred_flush_bios);
        bio_list_init(&pool->deferred_flush_bios);
        spin_unlock_irqrestore(&pool->lock, flags);

        if (bio_list_empty(&bios) &&
            !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
                return;

        if (commit(pool)) {
                while ((bio = bio_list_pop(&bios)))
                        bio_io_error(bio);
                return;
        }
        pool->last_commit_jiffies = jiffies;

        while ((bio = bio_list_pop(&bios)))
                generic_make_request(bio);
}

static void do_worker(struct work_struct *ws)
{
        struct pool *pool = container_of(ws, struct pool, worker);

        process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
        process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
        process_deferred_bios(pool);
}

/*
 * We want to commit periodically so that not too much
 * unwritten data builds up.
 */
static void do_waker(struct work_struct *ws)
{
        struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
        wake_worker(pool);
        queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}

/*
 * We're holding onto IO to allow userland time to react.  After the
 * timeout either the pool will have been resized (and thus back in
 * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
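 * The timeout defaults to NO_SPACE_TIMEOUT_SECS and is disabled entirely
 * when no_space_timeout_secs is set to zero.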
 */
static void do_no_space_timeout(struct work_struct *ws)
{
        struct pool *pool = container_of(to_delayed_work(ws), struct pool,
                                         no_space_timeout);

        if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
                set_pool_mode(pool, PM_READ_ONLY);
}

/*----------------------------------------------------------------*/

struct noflush_work {
        struct work_struct worker;
        struct thin_c *tc;

        atomic_t complete;
        wait_queue_head_t wait;
};

static void complete_noflush_work(struct noflush_work *w)
{
        atomic_set(&w->complete, 1);
        wake_up(&w->wait);
}

static void do_noflush_start(struct work_struct *ws)
{
        struct noflush_work *w = container_of(ws, struct noflush_work, worker);
        w->tc->requeue_mode = true;
        requeue_io(w->tc);
        complete_noflush_work(w);
}

static void do_noflush_stop(struct work_struct *ws)
{
        struct noflush_work *w = container_of(ws, struct noflush_work, worker);
        w->tc->requeue_mode = false;
        complete_noflush_work(w);
}

static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
{
        struct noflush_work w;

        INIT_WORK_ONSTACK(&w.worker, fn);
        w.tc = tc;
        atomic_set(&w.complete, 0);
        init_waitqueue_head(&w.wait);

        queue_work(tc->pool->wq, &w.worker);

        wait_event(w.wait, atomic_read(&w.complete));
}

/*----------------------------------------------------------------*/

static enum pool_mode get_pool_mode(struct pool *pool)
{
        return pool->pf.mode;
}

static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
{
        dm_table_event(pool->ti->table);
        DMINFO("%s: switching pool to %s mode",
               dm_device_name(pool->pool_md), new_mode);
}

static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
        struct pool_c *pt = pool->ti->private;
        bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
        enum pool_mode old_mode = get_pool_mode(pool);
        unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;

        /*
         * Never allow the pool to transition to PM_WRITE mode if user
         * intervention is required to verify metadata and data consistency.
         */
        if (new_mode == PM_WRITE && needs_check) {
                DMERR("%s: unable to switch pool to write mode until repaired.",
                      dm_device_name(pool->pool_md));
                if (old_mode != new_mode)
                        new_mode = old_mode;
                else
                        new_mode = PM_READ_ONLY;
        }
        /*
         * If we were in PM_FAIL mode, rollback of metadata failed.  We're
         * not going to recover without a thin_repair.  So we never let the
         * pool move out of the old mode.
         */
        if (old_mode == PM_FAIL)
                new_mode = old_mode;

        switch (new_mode) {
        case PM_FAIL:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "failure");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_fail;
                pool->process_discard = process_bio_fail;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
                pool->process_prepared_discard = process_prepared_discard_fail;

                error_retry_list(pool);
                break;

        case PM_READ_ONLY:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "read-only");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_bio_success;
                pool->process_prepared_mapping = process_prepared_mapping_fail;
                pool->process_prepared_discard = process_prepared_discard_passdown;

                error_retry_list(pool);
                break;

        case PM_OUT_OF_DATA_SPACE:
                /*
                 * Ideally we'd never hit this state; the low water mark
                 * would trigger userland to extend the pool before we
                 * completely run out of data space.  However, many small
                 * IOs to unprovisioned space can consume data space at an
                 * alarming rate.  Adjust your low water mark if you're
                 * frequently seeing this mode.
                 */
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "out-of-data-space");
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_discard;
                pool->process_prepared_mapping = process_prepared_mapping;
                pool->process_prepared_discard = process_prepared_discard_passdown;

                if (!pool->pf.error_if_no_space && no_space_timeout)
                        queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
                break;

        case PM_WRITE:
                if (old_mode != new_mode)
                        notify_of_pool_mode_change(pool, "write");
                dm_pool_metadata_read_write(pool->pmd);
                pool->process_bio = process_bio;
                pool->process_discard = process_discard;
                pool->process_prepared_mapping = process_prepared_mapping;
                pool->process_prepared_discard = process_prepared_discard;
                break;
        }

        pool->pf.mode = new_mode;
        /*
         * The pool mode may have changed, sync it so bind_control_target()
         * doesn't cause an unexpected mode transition on resume.
         */
        pt->adjusted_pf.mode = new_mode;
}

static void abort_transaction(struct pool *pool)
{
        const char *dev_name = dm_device_name(pool->pool_md);

        DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
        if (dm_pool_abort_metadata(pool->pmd)) {
                DMERR("%s: failed to abort metadata transaction", dev_name);
                set_pool_mode(pool, PM_FAIL);
        }

        if (dm_pool_metadata_set_needs_check(pool->pmd)) {
                DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
                set_pool_mode(pool, PM_FAIL);
        }
}

static void metadata_operation_failed(struct pool *pool, const char *op, int r)
{
        DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
                    dm_device_name(pool->pool_md), op, r);

        abort_transaction(pool);
        set_pool_mode(pool, PM_READ_ONLY);
}

/*----------------------------------------------------------------*/

/*
 * Mapping functions.
 */

/*
 * Called only while mapping a thin bio to hand it over to the workqueue.
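 * It simply queues the bio on the thin's deferred list and wakes the pool
 * worker, which keeps the map path non-blocking.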
 */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
        unsigned long flags;
        struct pool *pool = tc->pool;

        spin_lock_irqsave(&tc->lock, flags);
        bio_list_add(&tc->deferred_bio_list, bio);
        spin_unlock_irqrestore(&tc->lock, flags);

        wake_worker(pool);
}

static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

        h->tc = tc;
        h->shared_read_entry = NULL;
        h->all_io_entry = NULL;
        h->overwrite_mapping = NULL;
}

/*
 * Non-blocking function called from the thin target's map function.
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio)
{
        int r;
        struct thin_c *tc = ti->private;
        dm_block_t block = get_bio_block(tc, bio);
        struct dm_thin_device *td = tc->td;
        struct dm_thin_lookup_result result;
        struct dm_bio_prison_cell cell1, cell2;
        struct dm_bio_prison_cell *cell_result;
        struct dm_cell_key key;

        thin_hook_bio(tc, bio);

        if (tc->requeue_mode) {
                bio_endio(bio, DM_ENDIO_REQUEUE);
                return DM_MAPIO_SUBMITTED;
        }

        if (get_pool_mode(tc->pool) == PM_FAIL) {
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
        }

        if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;
        }

        r = dm_thin_find_block(td, block, 0, &result);

        /*
         * Note that we defer readahead too.
         */
        switch (r) {
        case 0:
                if (unlikely(result.shared)) {
                        /*
                         * We have a race condition here between the
                         * result.shared value returned by the lookup and
                         * snapshot creation, which may cause new
                         * sharing.
                         *
                         * To avoid this always quiesce the origin before
                         * taking the snap.  You want to do this anyway to
                         * ensure a consistent application view
                         * (i.e. lockfs).
                         *
                         * More distant ancestors are irrelevant. The
                         * shared flag will be set in their case.
                         */
                        thin_defer_bio(tc, bio);
                        return DM_MAPIO_SUBMITTED;
                }

                build_virtual_key(tc->td, block, &key);
                if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result))
                        return DM_MAPIO_SUBMITTED;

                build_data_key(tc->td, result.block, &key);
                if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) {
                        cell_defer_no_holder_no_free(tc, &cell1);
                        return DM_MAPIO_SUBMITTED;
                }

                inc_all_io_entry(tc->pool, bio);
                cell_defer_no_holder_no_free(tc, &cell2);
                cell_defer_no_holder_no_free(tc, &cell1);

                remap(tc, bio, result.block);
                return DM_MAPIO_REMAPPED;

        case -ENODATA:
                if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
                        /*
                         * This block isn't provisioned, and we have no way
                         * of doing so.
                         */
                        handle_unserviceable_bio(tc->pool, bio);
                        return DM_MAPIO_SUBMITTED;
                }
                /* fall through */

        case -EWOULDBLOCK:
                /*
                 * In future, the failed dm_thin_find_block above could
                 * provide the hint to load the metadata into cache.
                 */
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;

        default:
                /*
                 * Must always call bio_io_error on failure.
                 * dm_thin_find_block can fail with -EINVAL if the
                 * pool is switched to fail-io mode.
                 */
                bio_io_error(bio);
                return DM_MAPIO_SUBMITTED;
        }
}

static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
        struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
        struct request_queue *q;

        if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
                return 1;

        q = bdev_get_queue(pt->data_dev->bdev);
        return bdi_congested(&q->backing_dev_info, bdi_bits);
}

static void requeue_bios(struct pool *pool)
{
        unsigned long flags;
        struct thin_c *tc;

        rcu_read_lock();
        list_for_each_entry_rcu(tc, &pool->active_thins, list) {
                spin_lock_irqsave(&tc->lock, flags);
                bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
                bio_list_init(&tc->retry_on_resume_list);
                spin_unlock_irqrestore(&tc->lock, flags);
        }
        rcu_read_unlock();
}

/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
static bool data_dev_supports_discard(struct pool_c *pt)
{
        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

        return q && blk_queue_discard(q);
}

static bool is_factor(sector_t block_size, uint32_t n)
{
        return !sector_div(block_size, n);
}

/*
 * If discard_passdown was enabled, verify that the data device
 * supports discards.  Disable discard_passdown if not.
 */
static void disable_passdown_if_not_supported(struct pool_c *pt)
{
        struct pool *pool = pt->pool;
        struct block_device *data_bdev = pt->data_dev->bdev;
        struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
        sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
        const char *reason = NULL;
        char buf[BDEVNAME_SIZE];

        if (!pt->adjusted_pf.discard_passdown)
                return;

        if (!data_dev_supports_discard(pt))
                reason = "discard unsupported";

        else if (data_limits->max_discard_sectors < pool->sectors_per_block)
                reason = "max discard sectors smaller than a block";

        else if (data_limits->discard_granularity > block_size)
                reason = "discard granularity larger than a block";

        else if (!is_factor(block_size, data_limits->discard_granularity))
                reason = "discard granularity not a factor of block size";

        if (reason) {
                DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
                pt->adjusted_pf.discard_passdown = false;
        }
}

static int bind_control_target(struct pool *pool, struct dm_target *ti)
{
        struct pool_c *pt = ti->private;

        /*
         * We want to make sure that a pool in PM_FAIL mode is never upgraded.
         */
        enum pool_mode old_mode = get_pool_mode(pool);
        enum pool_mode new_mode = pt->adjusted_pf.mode;

        /*
         * Don't change the pool's mode until set_pool_mode() below.
         * Otherwise the pool's process_* function pointers may
         * not match the desired pool mode.
2011 */ 2012 pt->adjusted_pf.mode = old_mode; 2013 2014 pool->ti = ti; 2015 pool->pf = pt->adjusted_pf; 2016 pool->low_water_blocks = pt->low_water_blocks; 2017 2018 set_pool_mode(pool, new_mode); 2019 2020 return 0; 2021 } 2022 2023 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 2024 { 2025 if (pool->ti == ti) 2026 pool->ti = NULL; 2027 } 2028 2029 /*---------------------------------------------------------------- 2030 * Pool creation 2031 *--------------------------------------------------------------*/ 2032 /* Initialize pool features. */ 2033 static void pool_features_init(struct pool_features *pf) 2034 { 2035 pf->mode = PM_WRITE; 2036 pf->zero_new_blocks = true; 2037 pf->discard_enabled = true; 2038 pf->discard_passdown = true; 2039 pf->error_if_no_space = false; 2040 } 2041 2042 static void __pool_destroy(struct pool *pool) 2043 { 2044 __pool_table_remove(pool); 2045 2046 if (dm_pool_metadata_close(pool->pmd) < 0) 2047 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 2048 2049 dm_bio_prison_destroy(pool->prison); 2050 dm_kcopyd_client_destroy(pool->copier); 2051 2052 if (pool->wq) 2053 destroy_workqueue(pool->wq); 2054 2055 if (pool->next_mapping) 2056 mempool_free(pool->next_mapping, pool->mapping_pool); 2057 mempool_destroy(pool->mapping_pool); 2058 dm_deferred_set_destroy(pool->shared_read_ds); 2059 dm_deferred_set_destroy(pool->all_io_ds); 2060 kfree(pool); 2061 } 2062 2063 static struct kmem_cache *_new_mapping_cache; 2064 2065 static struct pool *pool_create(struct mapped_device *pool_md, 2066 struct block_device *metadata_dev, 2067 unsigned long block_size, 2068 int read_only, char **error) 2069 { 2070 int r; 2071 void *err_p; 2072 struct pool *pool; 2073 struct dm_pool_metadata *pmd; 2074 bool format_device = read_only ? false : true; 2075 2076 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); 2077 if (IS_ERR(pmd)) { 2078 *error = "Error creating metadata object"; 2079 return (struct pool *)pmd; 2080 } 2081 2082 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 2083 if (!pool) { 2084 *error = "Error allocating memory for pool"; 2085 err_p = ERR_PTR(-ENOMEM); 2086 goto bad_pool; 2087 } 2088 2089 pool->pmd = pmd; 2090 pool->sectors_per_block = block_size; 2091 if (block_size & (block_size - 1)) 2092 pool->sectors_per_block_shift = -1; 2093 else 2094 pool->sectors_per_block_shift = __ffs(block_size); 2095 pool->low_water_blocks = 0; 2096 pool_features_init(&pool->pf); 2097 pool->prison = dm_bio_prison_create(PRISON_CELLS); 2098 if (!pool->prison) { 2099 *error = "Error creating pool's bio prison"; 2100 err_p = ERR_PTR(-ENOMEM); 2101 goto bad_prison; 2102 } 2103 2104 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2105 if (IS_ERR(pool->copier)) { 2106 r = PTR_ERR(pool->copier); 2107 *error = "Error creating pool's kcopyd client"; 2108 err_p = ERR_PTR(r); 2109 goto bad_kcopyd_client; 2110 } 2111 2112 /* 2113 * Create singlethreaded workqueue that will service all devices 2114 * that use this metadata. 
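 *
 * An ordered workqueue executes at most one work item at a time,
 * so do_worker() never runs concurrently with itself for a pool.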
2115 */ 2116 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2117 if (!pool->wq) { 2118 *error = "Error creating pool's workqueue"; 2119 err_p = ERR_PTR(-ENOMEM); 2120 goto bad_wq; 2121 } 2122 2123 INIT_WORK(&pool->worker, do_worker); 2124 INIT_DELAYED_WORK(&pool->waker, do_waker); 2125 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); 2126 spin_lock_init(&pool->lock); 2127 bio_list_init(&pool->deferred_flush_bios); 2128 INIT_LIST_HEAD(&pool->prepared_mappings); 2129 INIT_LIST_HEAD(&pool->prepared_discards); 2130 INIT_LIST_HEAD(&pool->active_thins); 2131 pool->low_water_triggered = false; 2132 2133 pool->shared_read_ds = dm_deferred_set_create(); 2134 if (!pool->shared_read_ds) { 2135 *error = "Error creating pool's shared read deferred set"; 2136 err_p = ERR_PTR(-ENOMEM); 2137 goto bad_shared_read_ds; 2138 } 2139 2140 pool->all_io_ds = dm_deferred_set_create(); 2141 if (!pool->all_io_ds) { 2142 *error = "Error creating pool's all io deferred set"; 2143 err_p = ERR_PTR(-ENOMEM); 2144 goto bad_all_io_ds; 2145 } 2146 2147 pool->next_mapping = NULL; 2148 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, 2149 _new_mapping_cache); 2150 if (!pool->mapping_pool) { 2151 *error = "Error creating pool's mapping mempool"; 2152 err_p = ERR_PTR(-ENOMEM); 2153 goto bad_mapping_pool; 2154 } 2155 2156 pool->ref_count = 1; 2157 pool->last_commit_jiffies = jiffies; 2158 pool->pool_md = pool_md; 2159 pool->md_dev = metadata_dev; 2160 __pool_table_insert(pool); 2161 2162 return pool; 2163 2164 bad_mapping_pool: 2165 dm_deferred_set_destroy(pool->all_io_ds); 2166 bad_all_io_ds: 2167 dm_deferred_set_destroy(pool->shared_read_ds); 2168 bad_shared_read_ds: 2169 destroy_workqueue(pool->wq); 2170 bad_wq: 2171 dm_kcopyd_client_destroy(pool->copier); 2172 bad_kcopyd_client: 2173 dm_bio_prison_destroy(pool->prison); 2174 bad_prison: 2175 kfree(pool); 2176 bad_pool: 2177 if (dm_pool_metadata_close(pmd)) 2178 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 2179 2180 return err_p; 2181 } 2182 2183 static void __pool_inc(struct pool *pool) 2184 { 2185 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2186 pool->ref_count++; 2187 } 2188 2189 static void __pool_dec(struct pool *pool) 2190 { 2191 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2192 BUG_ON(!pool->ref_count); 2193 if (!--pool->ref_count) 2194 __pool_destroy(pool); 2195 } 2196 2197 static struct pool *__pool_find(struct mapped_device *pool_md, 2198 struct block_device *metadata_dev, 2199 unsigned long block_size, int read_only, 2200 char **error, int *created) 2201 { 2202 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 2203 2204 if (pool) { 2205 if (pool->pool_md != pool_md) { 2206 *error = "metadata device already in use by a pool"; 2207 return ERR_PTR(-EBUSY); 2208 } 2209 __pool_inc(pool); 2210 2211 } else { 2212 pool = __pool_table_lookup(pool_md); 2213 if (pool) { 2214 if (pool->md_dev != metadata_dev) { 2215 *error = "different pool cannot replace a pool"; 2216 return ERR_PTR(-EINVAL); 2217 } 2218 __pool_inc(pool); 2219 2220 } else { 2221 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); 2222 *created = 1; 2223 } 2224 } 2225 2226 return pool; 2227 } 2228 2229 /*---------------------------------------------------------------- 2230 * Pool target methods 2231 *--------------------------------------------------------------*/ 2232 static void pool_dtr(struct dm_target *ti) 2233 { 2234 struct pool_c *pt = ti->private; 2235 2236 
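	/*
	 * The pool table mutex serialises this against pool_ctr() and is
	 * asserted held by __pool_dec() below.
	 */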
mutex_lock(&dm_thin_pool_table.mutex); 2237 2238 unbind_control_target(pt->pool, ti); 2239 __pool_dec(pt->pool); 2240 dm_put_device(ti, pt->metadata_dev); 2241 dm_put_device(ti, pt->data_dev); 2242 kfree(pt); 2243 2244 mutex_unlock(&dm_thin_pool_table.mutex); 2245 } 2246 2247 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 2248 struct dm_target *ti) 2249 { 2250 int r; 2251 unsigned argc; 2252 const char *arg_name; 2253 2254 static struct dm_arg _args[] = { 2255 {0, 4, "Invalid number of pool feature arguments"}, 2256 }; 2257 2258 /* 2259 * No feature arguments supplied. 2260 */ 2261 if (!as->argc) 2262 return 0; 2263 2264 r = dm_read_arg_group(_args, as, &argc, &ti->error); 2265 if (r) 2266 return -EINVAL; 2267 2268 while (argc && !r) { 2269 arg_name = dm_shift_arg(as); 2270 argc--; 2271 2272 if (!strcasecmp(arg_name, "skip_block_zeroing")) 2273 pf->zero_new_blocks = false; 2274 2275 else if (!strcasecmp(arg_name, "ignore_discard")) 2276 pf->discard_enabled = false; 2277 2278 else if (!strcasecmp(arg_name, "no_discard_passdown")) 2279 pf->discard_passdown = false; 2280 2281 else if (!strcasecmp(arg_name, "read_only")) 2282 pf->mode = PM_READ_ONLY; 2283 2284 else if (!strcasecmp(arg_name, "error_if_no_space")) 2285 pf->error_if_no_space = true; 2286 2287 else { 2288 ti->error = "Unrecognised pool feature requested"; 2289 r = -EINVAL; 2290 break; 2291 } 2292 } 2293 2294 return r; 2295 } 2296 2297 static void metadata_low_callback(void *context) 2298 { 2299 struct pool *pool = context; 2300 2301 DMWARN("%s: reached low water mark for metadata device: sending event.", 2302 dm_device_name(pool->pool_md)); 2303 2304 dm_table_event(pool->ti->table); 2305 } 2306 2307 static sector_t get_dev_size(struct block_device *bdev) 2308 { 2309 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 2310 } 2311 2312 static void warn_if_metadata_device_too_big(struct block_device *bdev) 2313 { 2314 sector_t metadata_dev_size = get_dev_size(bdev); 2315 char buffer[BDEVNAME_SIZE]; 2316 2317 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 2318 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2319 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); 2320 } 2321 2322 static sector_t get_metadata_dev_size(struct block_device *bdev) 2323 { 2324 sector_t metadata_dev_size = get_dev_size(bdev); 2325 2326 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS) 2327 metadata_dev_size = THIN_METADATA_MAX_SECTORS; 2328 2329 return metadata_dev_size; 2330 } 2331 2332 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) 2333 { 2334 sector_t metadata_dev_size = get_metadata_dev_size(bdev); 2335 2336 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE); 2337 2338 return metadata_dev_size; 2339 } 2340 2341 /* 2342 * When a metadata threshold is crossed a dm event is triggered, and 2343 * userland should respond by growing the metadata device. We could let 2344 * userland set the threshold, like we do with the data threshold, but I'm 2345 * not sure they know enough to do this well. 2346 */ 2347 static dm_block_t calc_metadata_threshold(struct pool_c *pt) 2348 { 2349 /* 2350 * 4M is ample for all ops with the possible exception of thin 2351 * device deletion which is harmless if it fails (just retry the 2352 * delete after you've grown the device). 
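 *
 * The threshold below therefore works out to min(1024 metadata
 * blocks, a quarter of the metadata device).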
2353 */ 2354 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4; 2355 return min((dm_block_t)1024ULL /* 4M */, quarter); 2356 } 2357 2358 /* 2359 * thin-pool <metadata dev> <data dev> 2360 * <data block size (sectors)> 2361 * <low water mark (blocks)> 2362 * [<#feature args> [<arg>]*] 2363 * 2364 * Optional feature arguments are: 2365 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 2366 * ignore_discard: disable discard 2367 * no_discard_passdown: don't pass discards down to the data device 2368 * read_only: Don't allow any changes to be made to the pool metadata. 2369 * error_if_no_space: error IOs, instead of queueing, if no space. 2370 */ 2371 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 2372 { 2373 int r, pool_created = 0; 2374 struct pool_c *pt; 2375 struct pool *pool; 2376 struct pool_features pf; 2377 struct dm_arg_set as; 2378 struct dm_dev *data_dev; 2379 unsigned long block_size; 2380 dm_block_t low_water_blocks; 2381 struct dm_dev *metadata_dev; 2382 fmode_t metadata_mode; 2383 2384 /* 2385 * FIXME Remove validation from scope of lock. 2386 */ 2387 mutex_lock(&dm_thin_pool_table.mutex); 2388 2389 if (argc < 4) { 2390 ti->error = "Invalid argument count"; 2391 r = -EINVAL; 2392 goto out_unlock; 2393 } 2394 2395 as.argc = argc; 2396 as.argv = argv; 2397 2398 /* 2399 * Set default pool features. 2400 */ 2401 pool_features_init(&pf); 2402 2403 dm_consume_args(&as, 4); 2404 r = parse_pool_features(&as, &pf, ti); 2405 if (r) 2406 goto out_unlock; 2407 2408 metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); 2409 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); 2410 if (r) { 2411 ti->error = "Error opening metadata block device"; 2412 goto out_unlock; 2413 } 2414 warn_if_metadata_device_too_big(metadata_dev->bdev); 2415 2416 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 2417 if (r) { 2418 ti->error = "Error getting data device"; 2419 goto out_metadata; 2420 } 2421 2422 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 2423 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2424 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2425 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2426 ti->error = "Invalid block size"; 2427 r = -EINVAL; 2428 goto out; 2429 } 2430 2431 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 2432 ti->error = "Invalid low water mark"; 2433 r = -EINVAL; 2434 goto out; 2435 } 2436 2437 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 2438 if (!pt) { 2439 r = -ENOMEM; 2440 goto out; 2441 } 2442 2443 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 2444 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); 2445 if (IS_ERR(pool)) { 2446 r = PTR_ERR(pool); 2447 goto out_free_pt; 2448 } 2449 2450 /* 2451 * 'pool_created' reflects whether this is the first table load. 2452 * Top level discard support is not allowed to be changed after 2453 * initial load. This would require a pool reload to trigger thin 2454 * device changes. 
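 *
 * The check below rejects any change to discard_enabled between
 * table loads, in either direction.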
2455 */ 2456 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 2457 ti->error = "Discard support cannot be disabled once enabled"; 2458 r = -EINVAL; 2459 goto out_flags_changed; 2460 } 2461 2462 pt->pool = pool; 2463 pt->ti = ti; 2464 pt->metadata_dev = metadata_dev; 2465 pt->data_dev = data_dev; 2466 pt->low_water_blocks = low_water_blocks; 2467 pt->adjusted_pf = pt->requested_pf = pf; 2468 ti->num_flush_bios = 1; 2469 2470 /* 2471 * Only need to enable discards if the pool should pass 2472 * them down to the data device. The thin device's discard 2473 * processing will cause mappings to be removed from the btree. 2474 */ 2475 ti->discard_zeroes_data_unsupported = true; 2476 if (pf.discard_enabled && pf.discard_passdown) { 2477 ti->num_discard_bios = 1; 2478 2479 /* 2480 * Setting 'discards_supported' circumvents the normal 2481 * stacking of discard limits (this keeps the pool and 2482 * thin devices' discard limits consistent). 2483 */ 2484 ti->discards_supported = true; 2485 } 2486 ti->private = pt; 2487 2488 r = dm_pool_register_metadata_threshold(pt->pool->pmd, 2489 calc_metadata_threshold(pt), 2490 metadata_low_callback, 2491 pool); 2492 if (r) 2493 goto out_free_pt; 2494 2495 pt->callbacks.congested_fn = pool_is_congested; 2496 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2497 2498 mutex_unlock(&dm_thin_pool_table.mutex); 2499 2500 return 0; 2501 2502 out_flags_changed: 2503 __pool_dec(pool); 2504 out_free_pt: 2505 kfree(pt); 2506 out: 2507 dm_put_device(ti, data_dev); 2508 out_metadata: 2509 dm_put_device(ti, metadata_dev); 2510 out_unlock: 2511 mutex_unlock(&dm_thin_pool_table.mutex); 2512 2513 return r; 2514 } 2515 2516 static int pool_map(struct dm_target *ti, struct bio *bio) 2517 { 2518 int r; 2519 struct pool_c *pt = ti->private; 2520 struct pool *pool = pt->pool; 2521 unsigned long flags; 2522 2523 /* 2524 * As this is a singleton target, ti->begin is always zero. 
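 * Bios are simply redirected to the data device with no sector
 * remapping.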
2525 */ 2526 spin_lock_irqsave(&pool->lock, flags); 2527 bio->bi_bdev = pt->data_dev->bdev; 2528 r = DM_MAPIO_REMAPPED; 2529 spin_unlock_irqrestore(&pool->lock, flags); 2530 2531 return r; 2532 } 2533 2534 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) 2535 { 2536 int r; 2537 struct pool_c *pt = ti->private; 2538 struct pool *pool = pt->pool; 2539 sector_t data_size = ti->len; 2540 dm_block_t sb_data_size; 2541 2542 *need_commit = false; 2543 2544 (void) sector_div(data_size, pool->sectors_per_block); 2545 2546 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 2547 if (r) { 2548 DMERR("%s: failed to retrieve data device size", 2549 dm_device_name(pool->pool_md)); 2550 return r; 2551 } 2552 2553 if (data_size < sb_data_size) { 2554 DMERR("%s: pool target (%llu blocks) too small: expected %llu", 2555 dm_device_name(pool->pool_md), 2556 (unsigned long long)data_size, sb_data_size); 2557 return -EINVAL; 2558 2559 } else if (data_size > sb_data_size) { 2560 if (dm_pool_metadata_needs_check(pool->pmd)) { 2561 DMERR("%s: unable to grow the data device until repaired.", 2562 dm_device_name(pool->pool_md)); 2563 return 0; 2564 } 2565 2566 if (sb_data_size) 2567 DMINFO("%s: growing the data device from %llu to %llu blocks", 2568 dm_device_name(pool->pool_md), 2569 sb_data_size, (unsigned long long)data_size); 2570 r = dm_pool_resize_data_dev(pool->pmd, data_size); 2571 if (r) { 2572 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); 2573 return r; 2574 } 2575 2576 *need_commit = true; 2577 } 2578 2579 return 0; 2580 } 2581 2582 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) 2583 { 2584 int r; 2585 struct pool_c *pt = ti->private; 2586 struct pool *pool = pt->pool; 2587 dm_block_t metadata_dev_size, sb_metadata_dev_size; 2588 2589 *need_commit = false; 2590 2591 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev); 2592 2593 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); 2594 if (r) { 2595 DMERR("%s: failed to retrieve metadata device size", 2596 dm_device_name(pool->pool_md)); 2597 return r; 2598 } 2599 2600 if (metadata_dev_size < sb_metadata_dev_size) { 2601 DMERR("%s: metadata device (%llu blocks) too small: expected %llu", 2602 dm_device_name(pool->pool_md), 2603 metadata_dev_size, sb_metadata_dev_size); 2604 return -EINVAL; 2605 2606 } else if (metadata_dev_size > sb_metadata_dev_size) { 2607 if (dm_pool_metadata_needs_check(pool->pmd)) { 2608 DMERR("%s: unable to grow the metadata device until repaired.", 2609 dm_device_name(pool->pool_md)); 2610 return 0; 2611 } 2612 2613 warn_if_metadata_device_too_big(pool->md_dev); 2614 DMINFO("%s: growing the metadata device from %llu to %llu blocks", 2615 dm_device_name(pool->pool_md), 2616 sb_metadata_dev_size, metadata_dev_size); 2617 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 2618 if (r) { 2619 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); 2620 return r; 2621 } 2622 2623 *need_commit = true; 2624 } 2625 2626 return 0; 2627 } 2628 2629 /* 2630 * Retrieves the number of blocks of the data device from 2631 * the superblock and compares it to the actual device size, 2632 * thus resizing the data device in case it has grown. 2633 * 2634 * This both copes with opening preallocated data devices in the ctr 2635 * being followed by a resume 2636 * -and- 2637 * calling the resume method individually after userspace has 2638 * grown the data device in reaction to a table event. 
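 *
 * pool_preresume() below runs both the data and the metadata resize
 * checks and commits if either device has grown.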
2639 */ 2640 static int pool_preresume(struct dm_target *ti) 2641 { 2642 int r; 2643 bool need_commit1, need_commit2; 2644 struct pool_c *pt = ti->private; 2645 struct pool *pool = pt->pool; 2646 2647 /* 2648 * Take control of the pool object. 2649 */ 2650 r = bind_control_target(pool, ti); 2651 if (r) 2652 return r; 2653 2654 r = maybe_resize_data_dev(ti, &need_commit1); 2655 if (r) 2656 return r; 2657 2658 r = maybe_resize_metadata_dev(ti, &need_commit2); 2659 if (r) 2660 return r; 2661 2662 if (need_commit1 || need_commit2) 2663 (void) commit(pool); 2664 2665 return 0; 2666 } 2667 2668 static void pool_resume(struct dm_target *ti) 2669 { 2670 struct pool_c *pt = ti->private; 2671 struct pool *pool = pt->pool; 2672 unsigned long flags; 2673 2674 spin_lock_irqsave(&pool->lock, flags); 2675 pool->low_water_triggered = false; 2676 spin_unlock_irqrestore(&pool->lock, flags); 2677 requeue_bios(pool); 2678 2679 do_waker(&pool->waker.work); 2680 } 2681 2682 static void pool_postsuspend(struct dm_target *ti) 2683 { 2684 struct pool_c *pt = ti->private; 2685 struct pool *pool = pt->pool; 2686 2687 cancel_delayed_work(&pool->waker); 2688 cancel_delayed_work(&pool->no_space_timeout); 2689 flush_workqueue(pool->wq); 2690 (void) commit(pool); 2691 } 2692 2693 static int check_arg_count(unsigned argc, unsigned args_required) 2694 { 2695 if (argc != args_required) { 2696 DMWARN("Message received with %u arguments instead of %u.", 2697 argc, args_required); 2698 return -EINVAL; 2699 } 2700 2701 return 0; 2702 } 2703 2704 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) 2705 { 2706 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && 2707 *dev_id <= MAX_DEV_ID) 2708 return 0; 2709 2710 if (warning) 2711 DMWARN("Message received with invalid device id: %s", arg); 2712 2713 return -EINVAL; 2714 } 2715 2716 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) 2717 { 2718 dm_thin_id dev_id; 2719 int r; 2720 2721 r = check_arg_count(argc, 2); 2722 if (r) 2723 return r; 2724 2725 r = read_dev_id(argv[1], &dev_id, 1); 2726 if (r) 2727 return r; 2728 2729 r = dm_pool_create_thin(pool->pmd, dev_id); 2730 if (r) { 2731 DMWARN("Creation of new thinly-provisioned device with id %s failed.", 2732 argv[1]); 2733 return r; 2734 } 2735 2736 return 0; 2737 } 2738 2739 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2740 { 2741 dm_thin_id dev_id; 2742 dm_thin_id origin_dev_id; 2743 int r; 2744 2745 r = check_arg_count(argc, 3); 2746 if (r) 2747 return r; 2748 2749 r = read_dev_id(argv[1], &dev_id, 1); 2750 if (r) 2751 return r; 2752 2753 r = read_dev_id(argv[2], &origin_dev_id, 1); 2754 if (r) 2755 return r; 2756 2757 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); 2758 if (r) { 2759 DMWARN("Creation of new snapshot %s of device %s failed.", 2760 argv[1], argv[2]); 2761 return r; 2762 } 2763 2764 return 0; 2765 } 2766 2767 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) 2768 { 2769 dm_thin_id dev_id; 2770 int r; 2771 2772 r = check_arg_count(argc, 2); 2773 if (r) 2774 return r; 2775 2776 r = read_dev_id(argv[1], &dev_id, 1); 2777 if (r) 2778 return r; 2779 2780 r = dm_pool_delete_thin_device(pool->pmd, dev_id); 2781 if (r) 2782 DMWARN("Deletion of thin device %s failed.", argv[1]); 2783 2784 return r; 2785 } 2786 2787 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) 2788 { 2789 dm_thin_id old_id, new_id; 2790 int r; 2791 2792 r = check_arg_count(argc, 3); 
2793 if (r) 2794 return r; 2795 2796 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { 2797 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); 2798 return -EINVAL; 2799 } 2800 2801 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { 2802 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); 2803 return -EINVAL; 2804 } 2805 2806 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); 2807 if (r) { 2808 DMWARN("Failed to change transaction id from %s to %s.", 2809 argv[1], argv[2]); 2810 return r; 2811 } 2812 2813 return 0; 2814 } 2815 2816 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2817 { 2818 int r; 2819 2820 r = check_arg_count(argc, 1); 2821 if (r) 2822 return r; 2823 2824 (void) commit(pool); 2825 2826 r = dm_pool_reserve_metadata_snap(pool->pmd); 2827 if (r) 2828 DMWARN("reserve_metadata_snap message failed."); 2829 2830 return r; 2831 } 2832 2833 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 2834 { 2835 int r; 2836 2837 r = check_arg_count(argc, 1); 2838 if (r) 2839 return r; 2840 2841 r = dm_pool_release_metadata_snap(pool->pmd); 2842 if (r) 2843 DMWARN("release_metadata_snap message failed."); 2844 2845 return r; 2846 } 2847 2848 /* 2849 * Messages supported: 2850 * create_thin <dev_id> 2851 * create_snap <dev_id> <origin_id> 2852 * delete <dev_id> 2853 * trim <dev_id> <new_size_in_sectors> 2854 * set_transaction_id <current_trans_id> <new_trans_id> 2855 * reserve_metadata_snap 2856 * release_metadata_snap 2857 */ 2858 static int pool_message(struct dm_target *ti, unsigned argc, char **argv) 2859 { 2860 int r = -EINVAL; 2861 struct pool_c *pt = ti->private; 2862 struct pool *pool = pt->pool; 2863 2864 if (!strcasecmp(argv[0], "create_thin")) 2865 r = process_create_thin_mesg(argc, argv, pool); 2866 2867 else if (!strcasecmp(argv[0], "create_snap")) 2868 r = process_create_snap_mesg(argc, argv, pool); 2869 2870 else if (!strcasecmp(argv[0], "delete")) 2871 r = process_delete_mesg(argc, argv, pool); 2872 2873 else if (!strcasecmp(argv[0], "set_transaction_id")) 2874 r = process_set_transaction_id_mesg(argc, argv, pool); 2875 2876 else if (!strcasecmp(argv[0], "reserve_metadata_snap")) 2877 r = process_reserve_metadata_snap_mesg(argc, argv, pool); 2878 2879 else if (!strcasecmp(argv[0], "release_metadata_snap")) 2880 r = process_release_metadata_snap_mesg(argc, argv, pool); 2881 2882 else 2883 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 2884 2885 if (!r) 2886 (void) commit(pool); 2887 2888 return r; 2889 } 2890 2891 static void emit_flags(struct pool_features *pf, char *result, 2892 unsigned sz, unsigned maxlen) 2893 { 2894 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + 2895 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) + 2896 pf->error_if_no_space; 2897 DMEMIT("%u ", count); 2898 2899 if (!pf->zero_new_blocks) 2900 DMEMIT("skip_block_zeroing "); 2901 2902 if (!pf->discard_enabled) 2903 DMEMIT("ignore_discard "); 2904 2905 if (!pf->discard_passdown) 2906 DMEMIT("no_discard_passdown "); 2907 2908 if (pf->mode == PM_READ_ONLY) 2909 DMEMIT("read_only "); 2910 2911 if (pf->error_if_no_space) 2912 DMEMIT("error_if_no_space "); 2913 } 2914 2915 /* 2916 * Status line is: 2917 * <transaction id> <used metadata sectors>/<total metadata sectors> 2918 * <used data sectors>/<total data sectors> <held metadata root> 2919 */ 2920 static void pool_status(struct dm_target *ti, status_type_t 
type, 2921 unsigned status_flags, char *result, unsigned maxlen) 2922 { 2923 int r; 2924 unsigned sz = 0; 2925 uint64_t transaction_id; 2926 dm_block_t nr_free_blocks_data; 2927 dm_block_t nr_free_blocks_metadata; 2928 dm_block_t nr_blocks_data; 2929 dm_block_t nr_blocks_metadata; 2930 dm_block_t held_root; 2931 char buf[BDEVNAME_SIZE]; 2932 char buf2[BDEVNAME_SIZE]; 2933 struct pool_c *pt = ti->private; 2934 struct pool *pool = pt->pool; 2935 2936 switch (type) { 2937 case STATUSTYPE_INFO: 2938 if (get_pool_mode(pool) == PM_FAIL) { 2939 DMEMIT("Fail"); 2940 break; 2941 } 2942 2943 /* Commit to ensure statistics aren't out-of-date */ 2944 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 2945 (void) commit(pool); 2946 2947 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); 2948 if (r) { 2949 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d", 2950 dm_device_name(pool->pool_md), r); 2951 goto err; 2952 } 2953 2954 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); 2955 if (r) { 2956 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d", 2957 dm_device_name(pool->pool_md), r); 2958 goto err; 2959 } 2960 2961 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 2962 if (r) { 2963 DMERR("%s: dm_pool_get_metadata_dev_size returned %d", 2964 dm_device_name(pool->pool_md), r); 2965 goto err; 2966 } 2967 2968 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); 2969 if (r) { 2970 DMERR("%s: dm_pool_get_free_block_count returned %d", 2971 dm_device_name(pool->pool_md), r); 2972 goto err; 2973 } 2974 2975 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 2976 if (r) { 2977 DMERR("%s: dm_pool_get_data_dev_size returned %d", 2978 dm_device_name(pool->pool_md), r); 2979 goto err; 2980 } 2981 2982 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 2983 if (r) { 2984 DMERR("%s: dm_pool_get_metadata_snap returned %d", 2985 dm_device_name(pool->pool_md), r); 2986 goto err; 2987 } 2988 2989 DMEMIT("%llu %llu/%llu %llu/%llu ", 2990 (unsigned long long)transaction_id, 2991 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 2992 (unsigned long long)nr_blocks_metadata, 2993 (unsigned long long)(nr_blocks_data - nr_free_blocks_data), 2994 (unsigned long long)nr_blocks_data); 2995 2996 if (held_root) 2997 DMEMIT("%llu ", held_root); 2998 else 2999 DMEMIT("- "); 3000 3001 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) 3002 DMEMIT("out_of_data_space "); 3003 else if (pool->pf.mode == PM_READ_ONLY) 3004 DMEMIT("ro "); 3005 else 3006 DMEMIT("rw "); 3007 3008 if (!pool->pf.discard_enabled) 3009 DMEMIT("ignore_discard "); 3010 else if (pool->pf.discard_passdown) 3011 DMEMIT("discard_passdown "); 3012 else 3013 DMEMIT("no_discard_passdown "); 3014 3015 if (pool->pf.error_if_no_space) 3016 DMEMIT("error_if_no_space "); 3017 else 3018 DMEMIT("queue_if_no_space "); 3019 3020 break; 3021 3022 case STATUSTYPE_TABLE: 3023 DMEMIT("%s %s %lu %llu ", 3024 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), 3025 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 3026 (unsigned long)pool->sectors_per_block, 3027 (unsigned long long)pt->low_water_blocks); 3028 emit_flags(&pt->requested_pf, result, sz, maxlen); 3029 break; 3030 } 3031 return; 3032 3033 err: 3034 DMEMIT("Error"); 3035 } 3036 3037 static int pool_iterate_devices(struct dm_target *ti, 3038 iterate_devices_callout_fn fn, void *data) 3039 { 3040 struct pool_c *pt = ti->private; 3041 3042 return fn(ti, pt->data_dev, 0, ti->len, data); 3043 } 
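/*
 * Bio sizing is kept within the data device's own merge constraints
 * by forwarding merge_bvec queries to its request queue.
 */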
3044 3045 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 3046 struct bio_vec *biovec, int max_size) 3047 { 3048 struct pool_c *pt = ti->private; 3049 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 3050 3051 if (!q->merge_bvec_fn) 3052 return max_size; 3053 3054 bvm->bi_bdev = pt->data_dev->bdev; 3055 3056 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 3057 } 3058 3059 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) 3060 { 3061 struct pool *pool = pt->pool; 3062 struct queue_limits *data_limits; 3063 3064 limits->max_discard_sectors = pool->sectors_per_block; 3065 3066 /* 3067 * discard_granularity is just a hint, and not enforced. 3068 */ 3069 if (pt->adjusted_pf.discard_passdown) { 3070 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; 3071 limits->discard_granularity = data_limits->discard_granularity; 3072 } else 3073 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 3074 } 3075 3076 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 3077 { 3078 struct pool_c *pt = ti->private; 3079 struct pool *pool = pt->pool; 3080 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3081 3082 /* 3083 * If the system-determined stacked limits are compatible with the 3084 * pool's blocksize (io_opt is a factor) do not override them. 3085 */ 3086 if (io_opt_sectors < pool->sectors_per_block || 3087 do_div(io_opt_sectors, pool->sectors_per_block)) { 3088 blk_limits_io_min(limits, 0); 3089 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 3090 } 3091 3092 /* 3093 * pt->adjusted_pf is a staging area for the actual features to use. 3094 * They get transferred to the live pool in bind_control_target() 3095 * called from pool_preresume(). 3096 */ 3097 if (!pt->adjusted_pf.discard_enabled) { 3098 /* 3099 * Must explicitly disallow stacking discard limits otherwise the 3100 * block layer will stack them if pool's data device has support. 3101 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the 3102 * user to see that, so make sure to set all discard limits to 0. 
3103 */ 3104 limits->discard_granularity = 0; 3105 return; 3106 } 3107 3108 disable_passdown_if_not_supported(pt); 3109 3110 set_discard_limits(pt, limits); 3111 } 3112 3113 static struct target_type pool_target = { 3114 .name = "thin-pool", 3115 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3116 DM_TARGET_IMMUTABLE, 3117 .version = {1, 12, 0}, 3118 .module = THIS_MODULE, 3119 .ctr = pool_ctr, 3120 .dtr = pool_dtr, 3121 .map = pool_map, 3122 .postsuspend = pool_postsuspend, 3123 .preresume = pool_preresume, 3124 .resume = pool_resume, 3125 .message = pool_message, 3126 .status = pool_status, 3127 .merge = pool_merge, 3128 .iterate_devices = pool_iterate_devices, 3129 .io_hints = pool_io_hints, 3130 }; 3131 3132 /*---------------------------------------------------------------- 3133 * Thin target methods 3134 *--------------------------------------------------------------*/ 3135 static void thin_get(struct thin_c *tc) 3136 { 3137 atomic_inc(&tc->refcount); 3138 } 3139 3140 static void thin_put(struct thin_c *tc) 3141 { 3142 if (atomic_dec_and_test(&tc->refcount)) 3143 complete(&tc->can_destroy); 3144 } 3145 3146 static void thin_dtr(struct dm_target *ti) 3147 { 3148 struct thin_c *tc = ti->private; 3149 unsigned long flags; 3150 3151 thin_put(tc); 3152 wait_for_completion(&tc->can_destroy); 3153 3154 spin_lock_irqsave(&tc->pool->lock, flags); 3155 list_del_rcu(&tc->list); 3156 spin_unlock_irqrestore(&tc->pool->lock, flags); 3157 synchronize_rcu(); 3158 3159 mutex_lock(&dm_thin_pool_table.mutex); 3160 3161 __pool_dec(tc->pool); 3162 dm_pool_close_thin_device(tc->td); 3163 dm_put_device(ti, tc->pool_dev); 3164 if (tc->origin_dev) 3165 dm_put_device(ti, tc->origin_dev); 3166 kfree(tc); 3167 3168 mutex_unlock(&dm_thin_pool_table.mutex); 3169 } 3170 3171 /* 3172 * Thin target parameters: 3173 * 3174 * <pool_dev> <dev_id> [origin_dev] 3175 * 3176 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 3177 * dev_id: the internal device identifier 3178 * origin_dev: a device external to the pool that should act as the origin 3179 * 3180 * If the pool device has discards disabled, they get disabled for the thin 3181 * device as well. 
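 *
 * Example parameter strings (device names are illustrative only):
 *
 *   /dev/mapper/my_pool 1
 *   /dev/mapper/my_pool 1 /dev/vg/external_origin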
3182 */ 3183 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 3184 { 3185 int r; 3186 struct thin_c *tc; 3187 struct dm_dev *pool_dev, *origin_dev; 3188 struct mapped_device *pool_md; 3189 unsigned long flags; 3190 3191 mutex_lock(&dm_thin_pool_table.mutex); 3192 3193 if (argc != 2 && argc != 3) { 3194 ti->error = "Invalid argument count"; 3195 r = -EINVAL; 3196 goto out_unlock; 3197 } 3198 3199 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); 3200 if (!tc) { 3201 ti->error = "Out of memory"; 3202 r = -ENOMEM; 3203 goto out_unlock; 3204 } 3205 spin_lock_init(&tc->lock); 3206 bio_list_init(&tc->deferred_bio_list); 3207 bio_list_init(&tc->retry_on_resume_list); 3208 tc->sort_bio_list = RB_ROOT; 3209 3210 if (argc == 3) { 3211 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 3212 if (r) { 3213 ti->error = "Error opening origin device"; 3214 goto bad_origin_dev; 3215 } 3216 tc->origin_dev = origin_dev; 3217 } 3218 3219 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 3220 if (r) { 3221 ti->error = "Error opening pool device"; 3222 goto bad_pool_dev; 3223 } 3224 tc->pool_dev = pool_dev; 3225 3226 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { 3227 ti->error = "Invalid device id"; 3228 r = -EINVAL; 3229 goto bad_common; 3230 } 3231 3232 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); 3233 if (!pool_md) { 3234 ti->error = "Couldn't get pool mapped device"; 3235 r = -EINVAL; 3236 goto bad_common; 3237 } 3238 3239 tc->pool = __pool_table_lookup(pool_md); 3240 if (!tc->pool) { 3241 ti->error = "Couldn't find pool object"; 3242 r = -EINVAL; 3243 goto bad_pool_lookup; 3244 } 3245 __pool_inc(tc->pool); 3246 3247 if (get_pool_mode(tc->pool) == PM_FAIL) { 3248 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3249 r = -EINVAL; 3250 goto bad_thin_open; 3251 } 3252 3253 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 3254 if (r) { 3255 ti->error = "Couldn't open thin internal device"; 3256 goto bad_thin_open; 3257 } 3258 3259 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3260 if (r) 3261 goto bad_target_max_io_len; 3262 3263 ti->num_flush_bios = 1; 3264 ti->flush_supported = true; 3265 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); 3266 3267 /* In case the pool supports discards, pass them on. */ 3268 ti->discard_zeroes_data_unsupported = true; 3269 if (tc->pool->pf.discard_enabled) { 3270 ti->discards_supported = true; 3271 ti->num_discard_bios = 1; 3272 /* Discard bios must be split on a block boundary */ 3273 ti->split_discard_bios = true; 3274 } 3275 3276 dm_put(pool_md); 3277 3278 mutex_unlock(&dm_thin_pool_table.mutex); 3279 3280 atomic_set(&tc->refcount, 1); 3281 init_completion(&tc->can_destroy); 3282 3283 spin_lock_irqsave(&tc->pool->lock, flags); 3284 list_add_tail_rcu(&tc->list, &tc->pool->active_thins); 3285 spin_unlock_irqrestore(&tc->pool->lock, flags); 3286 /* 3287 * This synchronize_rcu() call is needed here otherwise we risk a 3288 * wake_worker() call finding no bios to process (because the newly 3289 * added tc isn't yet visible). So this reduces latency since we 3290 * aren't then dependent on the periodic commit to wake_worker(). 
3291 */ 3292 synchronize_rcu(); 3293 3294 return 0; 3295 3296 bad_target_max_io_len: 3297 dm_pool_close_thin_device(tc->td); 3298 bad_thin_open: 3299 __pool_dec(tc->pool); 3300 bad_pool_lookup: 3301 dm_put(pool_md); 3302 bad_common: 3303 dm_put_device(ti, tc->pool_dev); 3304 bad_pool_dev: 3305 if (tc->origin_dev) 3306 dm_put_device(ti, tc->origin_dev); 3307 bad_origin_dev: 3308 kfree(tc); 3309 out_unlock: 3310 mutex_unlock(&dm_thin_pool_table.mutex); 3311 3312 return r; 3313 } 3314 3315 static int thin_map(struct dm_target *ti, struct bio *bio) 3316 { 3317 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); 3318 3319 return thin_bio_map(ti, bio); 3320 } 3321 3322 static int thin_endio(struct dm_target *ti, struct bio *bio, int err) 3323 { 3324 unsigned long flags; 3325 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 3326 struct list_head work; 3327 struct dm_thin_new_mapping *m, *tmp; 3328 struct pool *pool = h->tc->pool; 3329 3330 if (h->shared_read_entry) { 3331 INIT_LIST_HEAD(&work); 3332 dm_deferred_entry_dec(h->shared_read_entry, &work); 3333 3334 spin_lock_irqsave(&pool->lock, flags); 3335 list_for_each_entry_safe(m, tmp, &work, list) { 3336 list_del(&m->list); 3337 m->quiesced = true; 3338 __maybe_add_mapping(m); 3339 } 3340 spin_unlock_irqrestore(&pool->lock, flags); 3341 } 3342 3343 if (h->all_io_entry) { 3344 INIT_LIST_HEAD(&work); 3345 dm_deferred_entry_dec(h->all_io_entry, &work); 3346 if (!list_empty(&work)) { 3347 spin_lock_irqsave(&pool->lock, flags); 3348 list_for_each_entry_safe(m, tmp, &work, list) 3349 list_add_tail(&m->list, &pool->prepared_discards); 3350 spin_unlock_irqrestore(&pool->lock, flags); 3351 wake_worker(pool); 3352 } 3353 } 3354 3355 return 0; 3356 } 3357 3358 static void thin_presuspend(struct dm_target *ti) 3359 { 3360 struct thin_c *tc = ti->private; 3361 3362 if (dm_noflush_suspending(ti)) 3363 noflush_work(tc, do_noflush_start); 3364 } 3365 3366 static void thin_postsuspend(struct dm_target *ti) 3367 { 3368 struct thin_c *tc = ti->private; 3369 3370 /* 3371 * The dm_noflush_suspending flag has been cleared by now, so 3372 * unfortunately we must always run this. 
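 * Running do_noflush_stop() without a preceding do_noflush_start()
 * is assumed to be harmless: it simply leaves the device out of
 * requeue mode.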
3373 */ 3374 noflush_work(tc, do_noflush_stop); 3375 } 3376 3377 /* 3378 * <nr mapped sectors> <highest mapped sector> 3379 */ 3380 static void thin_status(struct dm_target *ti, status_type_t type, 3381 unsigned status_flags, char *result, unsigned maxlen) 3382 { 3383 int r; 3384 ssize_t sz = 0; 3385 dm_block_t mapped, highest; 3386 char buf[BDEVNAME_SIZE]; 3387 struct thin_c *tc = ti->private; 3388 3389 if (get_pool_mode(tc->pool) == PM_FAIL) { 3390 DMEMIT("Fail"); 3391 return; 3392 } 3393 3394 if (!tc->td) 3395 DMEMIT("-"); 3396 else { 3397 switch (type) { 3398 case STATUSTYPE_INFO: 3399 r = dm_thin_get_mapped_count(tc->td, &mapped); 3400 if (r) { 3401 DMERR("dm_thin_get_mapped_count returned %d", r); 3402 goto err; 3403 } 3404 3405 r = dm_thin_get_highest_mapped_block(tc->td, &highest); 3406 if (r < 0) { 3407 DMERR("dm_thin_get_highest_mapped_block returned %d", r); 3408 goto err; 3409 } 3410 3411 DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); 3412 if (r) 3413 DMEMIT("%llu", ((highest + 1) * 3414 tc->pool->sectors_per_block) - 1); 3415 else 3416 DMEMIT("-"); 3417 break; 3418 3419 case STATUSTYPE_TABLE: 3420 DMEMIT("%s %lu", 3421 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 3422 (unsigned long) tc->dev_id); 3423 if (tc->origin_dev) 3424 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); 3425 break; 3426 } 3427 } 3428 3429 return; 3430 3431 err: 3432 DMEMIT("Error"); 3433 } 3434 3435 static int thin_iterate_devices(struct dm_target *ti, 3436 iterate_devices_callout_fn fn, void *data) 3437 { 3438 sector_t blocks; 3439 struct thin_c *tc = ti->private; 3440 struct pool *pool = tc->pool; 3441 3442 /* 3443 * We can't call dm_pool_get_data_dev_size() since that blocks. So 3444 * we follow a more convoluted path through to the pool's target. 
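 * The data size is derived from the bound pool target's length,
 * rounded down to a whole number of pool blocks.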
3445 */ 3446 if (!pool->ti) 3447 return 0; /* nothing is bound */ 3448 3449 blocks = pool->ti->len; 3450 (void) sector_div(blocks, pool->sectors_per_block); 3451 if (blocks) 3452 return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); 3453 3454 return 0; 3455 } 3456 3457 static struct target_type thin_target = { 3458 .name = "thin", 3459 .version = {1, 12, 0}, 3460 .module = THIS_MODULE, 3461 .ctr = thin_ctr, 3462 .dtr = thin_dtr, 3463 .map = thin_map, 3464 .end_io = thin_endio, 3465 .presuspend = thin_presuspend, 3466 .postsuspend = thin_postsuspend, 3467 .status = thin_status, 3468 .iterate_devices = thin_iterate_devices, 3469 }; 3470 3471 /*----------------------------------------------------------------*/ 3472 3473 static int __init dm_thin_init(void) 3474 { 3475 int r; 3476 3477 pool_table_init(); 3478 3479 r = dm_register_target(&thin_target); 3480 if (r) 3481 return r; 3482 3483 r = dm_register_target(&pool_target); 3484 if (r) 3485 goto bad_pool_target; 3486 3487 r = -ENOMEM; 3488 3489 _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); 3490 if (!_new_mapping_cache) 3491 goto bad_new_mapping_cache; 3492 3493 return 0; 3494 3495 bad_new_mapping_cache: 3496 dm_unregister_target(&pool_target); 3497 bad_pool_target: 3498 dm_unregister_target(&thin_target); 3499 3500 return r; 3501 } 3502 3503 static void dm_thin_exit(void) 3504 { 3505 dm_unregister_target(&thin_target); 3506 dm_unregister_target(&pool_target); 3507 3508 kmem_cache_destroy(_new_mapping_cache); 3509 } 3510 3511 module_init(dm_thin_init); 3512 module_exit(dm_thin_exit); 3513 3514 module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR); 3515 MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds"); 3516 3517 MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); 3518 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3519 MODULE_LICENSE("GPL"); 3520
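/*
 * Illustrative usage note (module name assumed to be the upstream
 * dm_thin_pool): no_space_timeout is expected to be tunable at
 * runtime via /sys/module/dm_thin_pool/parameters/no_space_timeout.
 */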