/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * affects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing.
 * I suspect data blocks will typically be shared by many different
 * devices, so we're breaking sharing n + 1 times, rather than n, where n
 * is the number of devices that reference this data block. At the moment
 * I think the benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block = b;
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 3 modes. Ordered in degraded order for comparisons.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	unsigned low_water_triggered:1;	/* A dm event has been sent */
	unsigned no_free_space:1;	/* A -ENOSPC warning has been issued */

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;

	struct bio_list retry_on_resume_list;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;
	mempool_t *endio_hook_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void set_pool_mode(struct pool *pool, enum pool_mode mode);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
};

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
};

static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	bio_list_merge(&bios, master);
	bio_list_init(master);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		if (h->tc == tc)
			bio_endio(bio, DM_ENDIO_REQUEUE);
		else
			bio_list_add(master, bio);
	}
}

static void requeue_io(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	__requeue_bio_list(tc, &pool->deferred_bios);
	__requeue_bio_list(tc, &pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	sector_t block_nr = bio->bi_sector;

	if (tc->pool->sectors_per_block_shift < 0)
		(void) sector_div(block_nr, tc->pool->sectors_per_block);
	else
		block_nr >>= tc->pool->sectors_per_block_shift;

	return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (tc->pool->sectors_per_block_shift < 0)
		bio->bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
	else
		bio->bi_sector = (block << pool->sectors_per_block_shift) |
				 (bi_sector & (pool->sectors_per_block - 1));
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed, e.g. due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	unsigned quiesced:1;
	unsigned prepared:1;
	unsigned pass_discard:1;

	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;
	int err;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (m->quiesced && m->prepared) {
		list_add(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	unsigned long flags;
	struct dm_thin_new_mapping *m = context;
	struct pool *pool = m->tc->pool;

	m->err = read_err || write_err ? -EIO : 0;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void overwrite_endio(struct bio *bio, int err)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct dm_thin_new_mapping *m = h->overwrite_mapping;
	struct pool *pool = m->tc->pool;

	m->err = err;

	spin_lock_irqsave(&pool->lock, flags);
	m->prepared = 1;
	__maybe_add_mapping(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell back to the deferred_bios list.
 */
static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
		       dm_block_t data_block)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	dm_cell_release(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

/*
 * Same as cell_defer above, except it omits one particular detainee,
 * a write bio that covers the block and has already been processed.
 */
static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct bio_list bios;
	struct pool *pool = tc->pool;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	dm_cell_release_no_holder(cell, &pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	if (m->bio)
		m->bio->bi_end_io = m->saved_bi_end_io;
	dm_cell_error(m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio)
		bio->bi_end_io = m->saved_bi_end_io;

	if (m->err) {
		dm_cell_error(m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		DMERR("dm_thin_insert_block() failed");
		dm_cell_error(m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		cell_defer_except(tc, m->cell);
		bio_endio(bio, 0);
	} else
		cell_defer(tc, m->cell, m->data_block);

out:
	list_del(&m->list);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	bio_io_error(m->bio);
	cell_defer_except(tc, m->cell);
	cell_defer_except(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	if (m->pass_discard)
		remap_and_issue(tc, m->bio, m->data_block);
	else
		bio_endio(m->bio, 0);

	cell_defer_except(tc, m->cell);
	cell_defer_except(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR("dm_thin_remove_block() failed");

	process_prepared_discard_passdown(m);
}

static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *r = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	pool->next_mapping = NULL;

	return r;
}

static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 0;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		m->quiesced = 1;

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		remap_and_issue(tc, bio, data_dest);
	} else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = pool->sectors_per_block;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_copy() failed");
			dm_cell_error(cell);
		}
	}
}

static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_origin, dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->pool_dev,
		      data_origin, data_dest, cell, bio);
}

static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
				   dm_block_t data_dest,
				   struct dm_bio_prison_cell *cell, struct bio *bio)
{
	schedule_copy(tc, virt_block, tc->origin_dev,
		      virt_block, data_dest, cell, bio);
}

static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
			  struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	INIT_LIST_HEAD(&m->list);
	m->quiesced = 1;
	m->prepared = 0;
	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_block;
	m->cell = cell;
	m->err = 0;
	m->bio = NULL;

	/*
	 * If the whole block of data is being overwritten or we are not
	 * zeroing pre-existing data, we can issue the bio immediately.
	 * Otherwise we use kcopyd to zero the data first.
	 */
	if (!pool->pf.zero_new_blocks)
		process_prepared_mapping(m);

	else if (io_overwrites_block(pool, bio)) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->overwrite_mapping = m;
		m->bio = bio;
		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
		remap_and_issue(tc, bio, data_block);
	} else {
		int r;
		struct dm_io_region to;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_block * pool->sectors_per_block;
		to.count = pool->sectors_per_block;

		r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
		if (r < 0) {
			mempool_free(m, pool->mapping_pool);
			DMERR("dm_kcopyd_zero() failed");
			dm_cell_error(cell);
		}
	}
}

static int commit(struct pool *pool)
{
	int r;

	r = dm_pool_commit_metadata(pool->pmd);
	if (r)
		DMERR("commit failed, error = %d", r);

	return r;
}

/*
 * A non-zero return indicates read_only or fail_io mode.
 * Many callers don't care about the return value.
 */
static int commit_or_fallback(struct pool *pool)
{
	int r;

	if (get_pool_mode(pool) != PM_WRITE)
		return -EINVAL;

	r = commit(pool);
	if (r)
		set_pool_mode(pool, PM_READ_ONLY);

	return r;
}

static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
{
	int r;
	dm_block_t free_blocks;
	unsigned long flags;
	struct pool *pool = tc->pool;

	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
	if (r)
		return r;

	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
		DMWARN("%s: reached low water mark, sending event.",
		       dm_device_name(pool->pool_md));
		spin_lock_irqsave(&pool->lock, flags);
		pool->low_water_triggered = 1;
		spin_unlock_irqrestore(&pool->lock, flags);
		dm_table_event(pool->ti->table);
	}

	if (!free_blocks) {
		if (pool->no_free_space)
			return -ENOSPC;
		else {
			/*
			 * Try to commit to see if that will free up some
			 * more space.
			 */
			(void) commit_or_fallback(pool);

			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
			if (r)
				return r;

			/*
			 * If we still have no space we set a flag to avoid
			 * doing all this checking and return -ENOSPC.
			 */
			if (!free_blocks) {
				DMWARN("%s: no free space available.",
				       dm_device_name(pool->pool_md));
				spin_lock_irqsave(&pool->lock, flags);
				pool->no_free_space = 1;
				spin_unlock_irqrestore(&pool->lock, flags);
				return -ENOSPC;
			}
		}
	}

	r = dm_pool_alloc_data_block(pool->pmd, result);
	if (r)
		return r;

	return 0;
}

/*
 * If we have run out of space, queue bios until the device is
 * resumed, presumably after having been reloaded with more space.
 */
static void retry_on_resume(struct bio *bio)
{
	struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
	struct thin_c *tc = h->tc;
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->retry_on_resume_list, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void no_space(struct dm_bio_prison_cell *cell)
{
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);
	dm_cell_release(cell, &bios);

	while ((bio = bio_list_pop(&bios)))
		retry_on_resume(bio);
}

static void process_discard(struct thin_c *tc, struct bio *bio)
{
	int r;
	unsigned long flags;
	struct pool *pool = tc->pool;
	struct dm_bio_prison_cell *cell, *cell2;
	struct dm_cell_key key, key2;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;
	struct dm_thin_new_mapping *m;

	build_virtual_key(tc->td, block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * Check nobody is fiddling with this pool block. This can
		 * happen if someone's in the process of breaking sharing
		 * on this block.
		 */
		build_data_key(tc->td, lookup_result.block, &key2);
		if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
			dm_cell_release_singleton(cell, bio);
			break;
		}

		if (io_overlaps_block(pool, bio)) {
			/*
			 * IO may still be going to the destination block. We must
			 * quiesce before we can do the removal.
			 */
			m = get_next_mapping(pool);
			m->tc = tc;
			m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
			m->virt_block = block;
			m->data_block = lookup_result.block;
			m->cell = cell;
			m->cell2 = cell2;
			m->err = 0;
			m->bio = bio;

			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
				spin_lock_irqsave(&pool->lock, flags);
				list_add(&m->list, &pool->prepared_discards);
				spin_unlock_irqrestore(&pool->lock, flags);
				wake_worker(pool);
			}
		} else {
			/*
			 * The DM core makes sure that the discard doesn't span
			 * a block boundary. So we submit the discard of a
			 * partial block appropriately.
			 */
			dm_cell_release_singleton(cell, bio);
			dm_cell_release_singleton(cell2, bio);
			if ((!lookup_result.shared) && pool->pf.discard_passdown)
				remap_and_issue(tc, bio, lookup_result.block);
			else
				bio_endio(bio, 0);
		}
		break;

	case -ENODATA:
		/*
		 * It isn't provisioned, just forget it.
		 */
		dm_cell_release_singleton(cell, bio);
		bio_endio(bio, 0);
		break;

	default:
		DMERR("discard: find block unexpectedly returned %d", r);
		dm_cell_release_singleton(cell, bio);
		bio_io_error(bio);
		break;
	}
}

static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
			  struct dm_cell_key *key,
			  struct dm_thin_lookup_result *lookup_result,
			  struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		schedule_internal_copy(tc, block, lookup_result->block,
				       data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		dm_cell_error(cell);
		break;
	}
}

static void process_shared_bio(struct thin_c *tc, struct bio *bio,
			       dm_block_t block,
			       struct dm_thin_lookup_result *lookup_result)
{
	struct dm_bio_prison_cell *cell;
	struct pool *pool = tc->pool;
	struct dm_cell_key key;

	/*
	 * If cell is already occupied, then sharing is already in the process
	 * of being broken so we have nothing further to do here.
	 */
	build_data_key(tc->td, lookup_result->block, &key);
	if (dm_bio_detain(pool->prison, &key, bio, &cell))
		return;

	if (bio_data_dir(bio) == WRITE && bio->bi_size)
		break_sharing(tc, bio, block, &key, lookup_result, cell);
	else {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;

		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);

		dm_cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, lookup_result->block);
	}
}

static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
			    struct dm_bio_prison_cell *cell)
{
	int r;
	dm_block_t data_block;

	/*
	 * Remap empty bios (flushes) immediately, without provisioning.
	 */
	if (!bio->bi_size) {
		dm_cell_release_singleton(cell, bio);
		remap_and_issue(tc, bio, 0);
		return;
	}

	/*
	 * Fill read bios with zeroes and complete them immediately.
	 */
	if (bio_data_dir(bio) == READ) {
		zero_fill_bio(bio);
		dm_cell_release_singleton(cell, bio);
		bio_endio(bio, 0);
		return;
	}

	r = alloc_data_block(tc, &data_block);
	switch (r) {
	case 0:
		if (tc->origin_dev)
			schedule_external_copy(tc, block, data_block, cell, bio);
		else
			schedule_zero(tc, block, data_block, cell, bio);
		break;

	case -ENOSPC:
		no_space(cell);
		break;

	default:
		DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
		set_pool_mode(tc->pool, PM_READ_ONLY);
		dm_cell_error(cell);
		break;
	}
}

static void process_bio(struct thin_c *tc, struct bio *bio)
{
	int r;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_bio_prison_cell *cell;
	struct dm_cell_key key;
	struct dm_thin_lookup_result lookup_result;

	/*
	 * If cell is already occupied, then the block is already
	 * being provisioned so we have nothing further to do here.
	 */
	build_virtual_key(tc->td, block, &key);
	if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
		return;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		/*
		 * We can release this cell now. This thread is the only
		 * one that puts bios into a cell, and we know there were
		 * no preceding bios.
		 */
		/*
		 * TODO: this will probably have to change when discard goes
		 * back in.
		 */
		dm_cell_release_singleton(cell, bio);

		if (lookup_result.shared)
			process_shared_bio(tc, bio, block, &lookup_result);
		else
			remap_and_issue(tc, bio, lookup_result.block);
		break;

	case -ENODATA:
		if (bio_data_dir(bio) == READ && tc->origin_dev) {
			dm_cell_release_singleton(cell, bio);
			remap_to_origin_and_issue(tc, bio);
		} else
			provision_block(tc, bio, block, cell);
		break;

	default:
		DMERR("dm_thin_find_block() failed, error = %d", r);
		dm_cell_release_singleton(cell, bio);
		bio_io_error(bio);
		break;
	}
}

static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
{
	int r;
	int rw = bio_data_dir(bio);
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_lookup_result lookup_result;

	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
	switch (r) {
	case 0:
		if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
			bio_io_error(bio);
		else
			remap_and_issue(tc, bio, lookup_result.block);
		break;

	case -ENODATA:
		if (rw != READ) {
			bio_io_error(bio);
			break;
		}

		if (tc->origin_dev) {
			remap_to_origin_and_issue(tc, bio);
			break;
		}

		zero_fill_bio(bio);
		bio_endio(bio, 0);
		break;

	default:
		DMERR("dm_thin_find_block() failed, error = %d", r);
		bio_io_error(bio);
		break;
	}
}

static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
	bio_io_error(bio);
}

static int need_commit_due_to_time(struct pool *pool)
{
	return jiffies < pool->last_commit_jiffies ||
	       jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
}

static void process_deferred_bios(struct pool *pool)
{
	unsigned long flags;
	struct bio *bio;
	struct bio_list bios;

	bio_list_init(&bios);

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios,
		       &pool->deferred_bios);
	bio_list_init(&pool->deferred_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	while ((bio = bio_list_pop(&bios))) {
		struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
		struct thin_c *tc = h->tc;

		/*
		 * If we've got no free new_mapping structs, and processing
		 * this bio might require one, we pause until there are some
		 * prepared mappings to process.
		 */
		if (ensure_next_mapping(pool)) {
			spin_lock_irqsave(&pool->lock, flags);
			bio_list_merge(&pool->deferred_bios, &bios);
			spin_unlock_irqrestore(&pool->lock, flags);

			break;
		}

		if (bio->bi_rw & REQ_DISCARD)
			pool->process_discard(tc, bio);
		else
			pool->process_bio(tc, bio);
	}

	/*
	 * If there are any deferred flush bios, we must commit
	 * the metadata before issuing them.
	 */
	bio_list_init(&bios);
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_merge(&bios, &pool->deferred_flush_bios);
	bio_list_init(&pool->deferred_flush_bios);
	spin_unlock_irqrestore(&pool->lock, flags);

	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
		return;

	if (commit_or_fallback(pool)) {
		while ((bio = bio_list_pop(&bios)))
			bio_io_error(bio);
		return;
	}
	pool->last_commit_jiffies = jiffies;

	while ((bio = bio_list_pop(&bios)))
		generic_make_request(bio);
}

static void do_worker(struct work_struct *ws)
{
	struct pool *pool = container_of(ws, struct pool, worker);

	process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
	process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
	process_deferred_bios(pool);
}

/*
 * We want to commit periodically so that not too much
 * unwritten data builds up.
 */
static void do_waker(struct work_struct *ws)
{
	struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
	wake_worker(pool);
	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}

/*----------------------------------------------------------------*/

static enum pool_mode get_pool_mode(struct pool *pool)
{
	return pool->pf.mode;
}

static void set_pool_mode(struct pool *pool, enum pool_mode mode)
{
	int r;

	pool->pf.mode = mode;

	switch (mode) {
	case PM_FAIL:
		DMERR("switching pool to failure mode");
		pool->process_bio = process_bio_fail;
		pool->process_discard = process_bio_fail;
		pool->process_prepared_mapping = process_prepared_mapping_fail;
		pool->process_prepared_discard = process_prepared_discard_fail;
		break;

	case PM_READ_ONLY:
		DMERR("switching pool to read-only mode");
		r = dm_pool_abort_metadata(pool->pmd);
		if (r) {
			DMERR("aborting transaction failed");
			set_pool_mode(pool, PM_FAIL);
		} else {
			dm_pool_metadata_read_only(pool->pmd);
			pool->process_bio = process_bio_read_only;
			pool->process_discard = process_discard;
			pool->process_prepared_mapping = process_prepared_mapping_fail;
			pool->process_prepared_discard = process_prepared_discard_passdown;
		}
		break;

	case PM_WRITE:
		pool->process_bio = process_bio;
		pool->process_discard = process_discard;
		pool->process_prepared_mapping = process_prepared_mapping;
		pool->process_prepared_discard = process_prepared_discard;
		break;
	}
}

/*----------------------------------------------------------------*/

/*
 * Mapping functions.
 */

/*
 * Called only while mapping a thin bio to hand it over to the workqueue.
 */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
	unsigned long flags;
	struct pool *pool = tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);

	wake_worker(pool);
}

static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);

	h->tc = tc;
	h->shared_read_entry = NULL;
	h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
	h->overwrite_mapping = NULL;

	return h;
}

/*
 * Non-blocking function called from the thin target's map function.
 */
static int thin_bio_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	int r;
	struct thin_c *tc = ti->private;
	dm_block_t block = get_bio_block(tc, bio);
	struct dm_thin_device *td = tc->td;
	struct dm_thin_lookup_result result;

	map_context->ptr = thin_hook_bio(tc, bio);

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
		thin_defer_bio(tc, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = dm_thin_find_block(td, block, 0, &result);

	/*
	 * Note that we defer readahead too.
	 */
	switch (r) {
	case 0:
		if (unlikely(result.shared)) {
			/*
			 * We have a race condition here between the
			 * result.shared value returned by the lookup and
			 * snapshot creation, which may cause new
			 * sharing.
			 *
			 * To avoid this always quiesce the origin before
			 * taking the snap. You want to do this anyway to
			 * ensure a consistent application view
			 * (i.e. lockfs).
			 *
			 * More distant ancestors are irrelevant. The
			 * shared flag will be set in their case.
			 */
			thin_defer_bio(tc, bio);
			r = DM_MAPIO_SUBMITTED;
		} else {
			remap(tc, bio, result.block);
			r = DM_MAPIO_REMAPPED;
		}
		break;

	case -ENODATA:
		if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
			/*
			 * This block isn't provisioned, and we have no way
			 * of doing so. Just error it.
			 */
			bio_io_error(bio);
			r = DM_MAPIO_SUBMITTED;
			break;
		}
		/* fall through */

	case -EWOULDBLOCK:
		/*
		 * In future, the failed dm_thin_find_block above could
		 * provide the hint to load the metadata into cache.
		 */
		thin_defer_bio(tc, bio);
		r = DM_MAPIO_SUBMITTED;
		break;

	default:
		/*
		 * Must always call bio_io_error on failure.
		 * dm_thin_find_block can fail with -EINVAL if the
		 * pool is switched to fail-io mode.
		 */
		bio_io_error(bio);
		r = DM_MAPIO_SUBMITTED;
		break;
	}

	return r;
}

static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
{
	int r;
	unsigned long flags;
	struct pool_c *pt = container_of(cb, struct pool_c, callbacks);

	spin_lock_irqsave(&pt->pool->lock, flags);
	r = !bio_list_empty(&pt->pool->retry_on_resume_list);
	spin_unlock_irqrestore(&pt->pool->lock, flags);

	if (!r) {
		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
		r = bdi_congested(&q->backing_dev_info, bdi_bits);
	}

	return r;
}

static void __requeue_bios(struct pool *pool)
{
	bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
	bio_list_init(&pool->retry_on_resume_list);
}

/*----------------------------------------------------------------
 * Binding of control targets to a pool object
 *--------------------------------------------------------------*/
static bool data_dev_supports_discard(struct pool_c *pt)
{
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	return q && blk_queue_discard(q);
}

/*
 * If discard_passdown was enabled verify that the data device
 * supports discards. Disable discard_passdown if not.
 */
static void disable_passdown_if_not_supported(struct pool_c *pt)
{
	struct pool *pool = pt->pool;
	struct block_device *data_bdev = pt->data_dev->bdev;
	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
	const char *reason = NULL;
	char buf[BDEVNAME_SIZE];

	if (!pt->adjusted_pf.discard_passdown)
		return;

	if (!data_dev_supports_discard(pt))
		reason = "discard unsupported";

	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
		reason = "max discard sectors smaller than a block";

	else if (data_limits->discard_granularity > block_size)
		reason = "discard granularity larger than a block";

	else if (block_size & (data_limits->discard_granularity - 1))
		reason = "discard granularity not a factor of block size";

	if (reason) {
		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
		pt->adjusted_pf.discard_passdown = false;
	}
}

static int bind_control_target(struct pool *pool, struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	/*
	 * We want to make sure that degraded pools are never upgraded.
	 */
	enum pool_mode old_mode = pool->pf.mode;
	enum pool_mode new_mode = pt->adjusted_pf.mode;

	if (old_mode > new_mode)
		new_mode = old_mode;

	pool->ti = ti;
	pool->low_water_blocks = pt->low_water_blocks;
	pool->pf = pt->adjusted_pf;

	set_pool_mode(pool, new_mode);

	return 0;
}

static void unbind_control_target(struct pool *pool, struct dm_target *ti)
{
	if (pool->ti == ti)
		pool->ti = NULL;
}

/*----------------------------------------------------------------
 * Pool creation
 *--------------------------------------------------------------*/
/* Initialize pool features. */
static void pool_features_init(struct pool_features *pf)
{
	pf->mode = PM_WRITE;
	pf->zero_new_blocks = true;
	pf->discard_enabled = true;
	pf->discard_passdown = true;
}

static void __pool_destroy(struct pool *pool)
{
	__pool_table_remove(pool);

	if (dm_pool_metadata_close(pool->pmd) < 0)
		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);

	dm_bio_prison_destroy(pool->prison);
	dm_kcopyd_client_destroy(pool->copier);

	if (pool->wq)
		destroy_workqueue(pool->wq);

	if (pool->next_mapping)
		mempool_free(pool->next_mapping, pool->mapping_pool);
	mempool_destroy(pool->mapping_pool);
	mempool_destroy(pool->endio_hook_pool);
	dm_deferred_set_destroy(pool->shared_read_ds);
	dm_deferred_set_destroy(pool->all_io_ds);
	kfree(pool);
}

static struct kmem_cache *_new_mapping_cache;
static struct kmem_cache *_endio_hook_cache;

static struct pool *pool_create(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size,
				int read_only, char **error)
{
	int r;
	void *err_p;
	struct pool *pool;
	struct dm_pool_metadata *pmd;
	bool format_device = read_only ?
			     false : true;

	pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
	if (IS_ERR(pmd)) {
		*error = "Error creating metadata object";
		return (struct pool *)pmd;
	}

	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool) {
		*error = "Error allocating memory for pool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_pool;
	}

	pool->pmd = pmd;
	pool->sectors_per_block = block_size;
	if (block_size & (block_size - 1))
		pool->sectors_per_block_shift = -1;
	else
		pool->sectors_per_block_shift = __ffs(block_size);
	pool->low_water_blocks = 0;
	pool_features_init(&pool->pf);
	pool->prison = dm_bio_prison_create(PRISON_CELLS);
	if (!pool->prison) {
		*error = "Error creating pool's bio prison";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_prison;
	}

	pool->copier = dm_kcopyd_client_create();
	if (IS_ERR(pool->copier)) {
		r = PTR_ERR(pool->copier);
		*error = "Error creating pool's kcopyd client";
		err_p = ERR_PTR(r);
		goto bad_kcopyd_client;
	}

	/*
	 * Create singlethreaded workqueue that will service all devices
	 * that use this metadata.
	 */
	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!pool->wq) {
		*error = "Error creating pool's workqueue";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_wq;
	}

	INIT_WORK(&pool->worker, do_worker);
	INIT_DELAYED_WORK(&pool->waker, do_waker);
	spin_lock_init(&pool->lock);
	bio_list_init(&pool->deferred_bios);
	bio_list_init(&pool->deferred_flush_bios);
	INIT_LIST_HEAD(&pool->prepared_mappings);
	INIT_LIST_HEAD(&pool->prepared_discards);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	bio_list_init(&pool->retry_on_resume_list);

	pool->shared_read_ds = dm_deferred_set_create();
	if (!pool->shared_read_ds) {
		*error = "Error creating pool's shared read deferred set";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_shared_read_ds;
	}

	pool->all_io_ds = dm_deferred_set_create();
	if (!pool->all_io_ds) {
		*error = "Error creating pool's all io deferred set";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_all_io_ds;
	}

	pool->next_mapping = NULL;
	pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
						      _new_mapping_cache);
	if (!pool->mapping_pool) {
		*error = "Error creating pool's mapping mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_mapping_pool;
	}

	pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
							 _endio_hook_cache);
	if (!pool->endio_hook_pool) {
		*error = "Error creating pool's endio_hook mempool";
		err_p = ERR_PTR(-ENOMEM);
		goto bad_endio_hook_pool;
	}
	pool->ref_count = 1;
	pool->last_commit_jiffies = jiffies;
	pool->pool_md = pool_md;
	pool->md_dev = metadata_dev;
	__pool_table_insert(pool);

	return pool;

bad_endio_hook_pool:
	mempool_destroy(pool->mapping_pool);
bad_mapping_pool:
	dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
	dm_deferred_set_destroy(pool->shared_read_ds);
bad_shared_read_ds:
	destroy_workqueue(pool->wq);
bad_wq:
	dm_kcopyd_client_destroy(pool->copier);
bad_kcopyd_client:
	dm_bio_prison_destroy(pool->prison);
bad_prison:
	kfree(pool);
bad_pool:
	if (dm_pool_metadata_close(pmd))
		DMWARN("%s: dm_pool_metadata_close() "
		       "failed.", __func__);

	return err_p;
}

static void __pool_inc(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	pool->ref_count++;
}

static void __pool_dec(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	BUG_ON(!pool->ref_count);
	if (!--pool->ref_count)
		__pool_destroy(pool);
}

static struct pool *__pool_find(struct mapped_device *pool_md,
				struct block_device *metadata_dev,
				unsigned long block_size, int read_only,
				char **error, int *created)
{
	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);

	if (pool) {
		if (pool->pool_md != pool_md) {
			*error = "metadata device already in use by a pool";
			return ERR_PTR(-EBUSY);
		}
		__pool_inc(pool);

	} else {
		pool = __pool_table_lookup(pool_md);
		if (pool) {
			if (pool->md_dev != metadata_dev) {
				*error = "different pool cannot replace a pool";
				return ERR_PTR(-EINVAL);
			}
			__pool_inc(pool);

		} else {
			pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
			*created = 1;
		}
	}

	return pool;
}

/*----------------------------------------------------------------
 * Pool target methods
 *--------------------------------------------------------------*/
static void pool_dtr(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	unbind_control_target(pt->pool, ti);
	__pool_dec(pt->pool);
	dm_put_device(ti, pt->metadata_dev);
	dm_put_device(ti, pt->data_dev);
	kfree(pt);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
			       struct dm_target *ti)
{
	int r;
	unsigned argc;
	const char *arg_name;

	static struct dm_arg _args[] = {
		{0, 3, "Invalid number of pool feature arguments"},
	};

	/*
	 * No feature arguments supplied.
	 */
	if (!as->argc)
		return 0;

	r = dm_read_arg_group(_args, as, &argc, &ti->error);
	if (r)
		return -EINVAL;

	while (argc && !r) {
		arg_name = dm_shift_arg(as);
		argc--;

		if (!strcasecmp(arg_name, "skip_block_zeroing"))
			pf->zero_new_blocks = false;

		else if (!strcasecmp(arg_name, "ignore_discard"))
			pf->discard_enabled = false;

		else if (!strcasecmp(arg_name, "no_discard_passdown"))
			pf->discard_passdown = false;

		else if (!strcasecmp(arg_name, "read_only"))
			pf->mode = PM_READ_ONLY;

		else {
			ti->error = "Unrecognised pool feature requested";
			r = -EINVAL;
			break;
		}
	}

	return r;
}

/*
 * thin-pool <metadata dev> <data dev>
 *	     <data block size (sectors)>
 *	     <low water mark (blocks)>
 *	     [<#feature args> [<arg>]*]
 *
 * Optional feature arguments are:
 *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
 *	     ignore_discard: disable discard
 *	     no_discard_passdown: don't pass discards down to the data device
 */
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r, pool_created = 0;
	struct pool_c *pt;
	struct pool *pool;
	struct pool_features pf;
	struct dm_arg_set as;
	struct dm_dev *data_dev;
	unsigned long block_size;
	dm_block_t low_water_blocks;
	struct dm_dev *metadata_dev;
	sector_t metadata_dev_size;
	char b[BDEVNAME_SIZE];

	/*
	 * FIXME Remove validation from scope of lock.
	 */
	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc < 4) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}
	as.argc = argc;
	as.argv = argv;

	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
	if (r) {
		ti->error = "Error opening metadata block device";
		goto out_unlock;
	}

	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
		       bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);

	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
	if (r) {
		ti->error = "Error getting data device";
		goto out_metadata;
	}

	if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
		ti->error = "Invalid block size";
		r = -EINVAL;
		goto out;
	}

	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
		ti->error = "Invalid low water mark";
		r = -EINVAL;
		goto out;
	}

	/*
	 * Set default pool features.
	 */
	pool_features_init(&pf);

	dm_consume_args(&as, 4);
	r = parse_pool_features(&as, &pf, ti);
	if (r)
		goto out;

	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
	if (!pt) {
		r = -ENOMEM;
		goto out;
	}

	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
			   block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
	if (IS_ERR(pool)) {
		r = PTR_ERR(pool);
		goto out_free_pt;
	}

	/*
	 * 'pool_created' reflects whether this is the first table load.
	 * Top level discard support is not allowed to be changed after
	 * initial load. This would require a pool reload to trigger thin
	 * device changes.
	 */
	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
		ti->error = "Discard support cannot be disabled once enabled";
		r = -EINVAL;
		goto out_flags_changed;
	}

	pt->pool = pool;
	pt->ti = ti;
	pt->metadata_dev = metadata_dev;
	pt->data_dev = data_dev;
	pt->low_water_blocks = low_water_blocks;
	pt->adjusted_pf = pt->requested_pf = pf;
	ti->num_flush_requests = 1;

	/*
	 * Only need to enable discards if the pool should pass
	 * them down to the data device. The thin device's discard
	 * processing will cause mappings to be removed from the btree.
	 */
	if (pf.discard_enabled && pf.discard_passdown) {
		ti->num_discard_requests = 1;

		/*
		 * Setting 'discards_supported' circumvents the normal
		 * stacking of discard limits (this keeps the pool and
		 * thin devices' discard limits consistent).
		 */
		ti->discards_supported = true;
		ti->discard_zeroes_data_unsupported = true;
	}
	ti->private = pt;

	pt->callbacks.congested_fn = pool_is_congested;
	dm_table_add_target_callbacks(ti->table, &pt->callbacks);

	mutex_unlock(&dm_thin_pool_table.mutex);

	return 0;

out_flags_changed:
	__pool_dec(pool);
out_free_pt:
	kfree(pt);
out:
	dm_put_device(ti, data_dev);
out_metadata:
	dm_put_device(ti, metadata_dev);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}

static int pool_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	/*
	 * As this is a singleton target, ti->begin is always zero.
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio->bi_bdev = pt->data_dev->bdev;
	r = DM_MAPIO_REMAPPED;
	spin_unlock_irqrestore(&pool->lock, flags);

	return r;
}

/*
 * Retrieves the number of blocks of the data device from
 * the superblock and compares it to the actual device size,
 * thus resizing the data device in case it has grown.
 *
 * This both copes with opening preallocated data devices in the ctr
 * being followed by a resume
 * -and-
 * calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
 */
static int pool_preresume(struct dm_target *ti)
{
	int r;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	sector_t data_size = ti->len;
	dm_block_t sb_data_size;

	/*
	 * Take control of the pool object.
	 */
	r = bind_control_target(pool, ti);
	if (r)
		return r;

	(void) sector_div(data_size, pool->sectors_per_block);

	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
	if (r) {
		DMERR("failed to retrieve data device size");
		return r;
	}

	if (data_size < sb_data_size) {
		DMERR("pool target too small, is %llu blocks (expected %llu)",
		      (unsigned long long)data_size, sb_data_size);
		return -EINVAL;

	} else if (data_size > sb_data_size) {
		r = dm_pool_resize_data_dev(pool->pmd, data_size);
		if (r) {
			DMERR("failed to resize data device");
			/* FIXME Stricter than necessary: Rollback transaction instead here */
			set_pool_mode(pool, PM_READ_ONLY);
			return r;
		}

		(void) commit_or_fallback(pool);
	}

	return 0;
}

static void pool_resume(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	unsigned long flags;

	spin_lock_irqsave(&pool->lock, flags);
	pool->low_water_triggered = 0;
	pool->no_free_space = 0;
	__requeue_bios(pool);
	spin_unlock_irqrestore(&pool->lock, flags);

	do_waker(&pool->waker.work);
}

static void pool_postsuspend(struct dm_target *ti)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	cancel_delayed_work(&pool->waker);
	flush_workqueue(pool->wq);
	(void) commit_or_fallback(pool);
}

static int check_arg_count(unsigned argc, unsigned args_required)
{
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);
		return -EINVAL;
	}

	return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	    *dev_id <= MAX_DEV_ID)
		return 0;

	if (warning)
		DMWARN("Message received with invalid device id: %s", arg);

	return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_thin(pool->pmd, dev_id);
	if (r) {
		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
		       argv[1]);
		return r;
	}

	return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	dm_thin_id origin_dev_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = read_dev_id(argv[2], &origin_dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
	if (r) {
		DMWARN("Creation of new snapshot %s of device %s failed.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	if (r)
		DMWARN("Deletion "

static int check_arg_count(unsigned argc, unsigned args_required)
{
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);
		return -EINVAL;
	}

	return 0;
}

static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
{
	if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
	    *dev_id <= MAX_DEV_ID)
		return 0;

	if (warning)
		DMWARN("Message received with invalid device id: %s", arg);

	return -EINVAL;
}

static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_thin(pool->pmd, dev_id);
	if (r) {
		DMWARN("Creation of new thinly-provisioned device with id %s failed.",
		       argv[1]);
		return r;
	}

	return 0;
}

static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	dm_thin_id origin_dev_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = read_dev_id(argv[2], &origin_dev_id, 1);
	if (r)
		return r;

	r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
	if (r) {
		DMWARN("Creation of new snapshot %s of device %s failed.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id dev_id;
	int r;

	r = check_arg_count(argc, 2);
	if (r)
		return r;

	r = read_dev_id(argv[1], &dev_id, 1);
	if (r)
		return r;

	r = dm_pool_delete_thin_device(pool->pmd, dev_id);
	if (r)
		DMWARN("Deletion of thin device %s failed.", argv[1]);

	return r;
}

static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
{
	dm_thin_id old_id, new_id;
	int r;

	r = check_arg_count(argc, 3);
	if (r)
		return r;

	if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
		DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
		return -EINVAL;
	}

	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
		DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
		return -EINVAL;
	}

	r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
	if (r) {
		DMWARN("Failed to change transaction id from %s to %s.",
		       argv[1], argv[2]);
		return r;
	}

	return 0;
}

static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	int r;

	r = check_arg_count(argc, 1);
	if (r)
		return r;

	(void) commit_or_fallback(pool);

	r = dm_pool_reserve_metadata_snap(pool->pmd);
	if (r)
		DMWARN("reserve_metadata_snap message failed.");

	return r;
}

static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
{
	int r;

	r = check_arg_count(argc, 1);
	if (r)
		return r;

	r = dm_pool_release_metadata_snap(pool->pmd);
	if (r)
		DMWARN("release_metadata_snap message failed.");

	return r;
}

/*
 * Messages supported:
 *   create_thin	<dev_id>
 *   create_snap	<dev_id> <origin_id>
 *   delete		<dev_id>
 *   set_transaction_id <current_trans_id> <new_trans_id>
 *   reserve_metadata_snap
 *   release_metadata_snap
 */
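
/*
 * Illustrative examples only (device name is hypothetical): these messages
 * are sent to an active pool through the standard dmsetup message
 * interface, e.g.
 *
 *	dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *	dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *	dmsetup message /dev/mapper/pool 0 "delete 1"
 */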
static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
		r = process_reserve_metadata_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "release_metadata_snap"))
		r = process_release_metadata_snap_mesg(argc, argv, pool);

	else
		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	if (!r)
		(void) commit_or_fallback(pool);

	return r;
}

static void emit_flags(struct pool_features *pf, char *result,
		       unsigned sz, unsigned maxlen)
{
	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
		!pf->discard_passdown + (pf->mode == PM_READ_ONLY);
	DMEMIT("%u ", count);

	if (!pf->zero_new_blocks)
		DMEMIT("skip_block_zeroing ");

	if (!pf->discard_enabled)
		DMEMIT("ignore_discard ");

	if (!pf->discard_passdown)
		DMEMIT("no_discard_passdown ");

	if (pf->mode == PM_READ_ONLY)
		DMEMIT("read_only ");
}

/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 *    <ro|rw> <discard_passdown|no_discard_passdown>
 */
static int pool_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	int r;
	unsigned sz = 0;
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	switch (type) {
	case STATUSTYPE_INFO:
		if (get_pool_mode(pool) == PM_FAIL) {
			DMEMIT("Fail");
			break;
		}

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit_or_fallback(pool);

		r = dm_pool_get_metadata_transaction_id(pool->pmd,
							&transaction_id);
		if (r)
			return r;

		r = dm_pool_get_free_metadata_block_count(pool->pmd,
							  &nr_free_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
		if (r)
			return r;

		r = dm_pool_get_free_block_count(pool->pmd,
						 &nr_free_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
		if (r)
			return r;

		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
		if (r)
			return r;

		DMEMIT("%llu %llu/%llu %llu/%llu ",
		       (unsigned long long)transaction_id,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
		       (unsigned long long)nr_blocks_data);

		if (held_root)
			DMEMIT("%llu ", held_root);
		else
			DMEMIT("- ");

		if (pool->pf.mode == PM_READ_ONLY)
			DMEMIT("ro ");
		else
			DMEMIT("rw ");

		if (pool->pf.discard_enabled && pool->pf.discard_passdown)
			DMEMIT("discard_passdown");
		else
			DMEMIT("no_discard_passdown");

		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s %lu %llu ",
		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
		       (unsigned long)pool->sectors_per_block,
		       (unsigned long long)pt->low_water_blocks);
		emit_flags(&pt->requested_pf, result, sz, maxlen);
		break;
	}

	return 0;
}
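
/*
 * Illustrative STATUSTYPE_INFO output (the numbers are made up):
 *
 *	1 94/4096 1026/1048576 - rw discard_passdown
 *
 * i.e. transaction id 1, 94 of 4096 metadata blocks used, 1026 of 1048576
 * data blocks used, no held metadata root, pool writeable, and discards
 * passed down to the data device.
 */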

static int pool_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct pool_c *pt = ti->private;

	return fn(ti, pt->data_dev, 0, ti->len, data);
}

static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
		      struct bio_vec *biovec, int max_size)
{
	struct pool_c *pt = ti->private;
	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = pt->data_dev->bdev;

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
{
	struct pool *pool = pt->pool;
	struct queue_limits *data_limits;

	limits->max_discard_sectors = pool->sectors_per_block;

	/*
	 * discard_granularity is just a hint, and not enforced.
	 */
	if (pt->adjusted_pf.discard_passdown) {
		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
		limits->discard_granularity = data_limits->discard_granularity;
	} else if (block_size_is_power_of_two(pool))
		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
	else
		/*
		 * Use largest power of 2 that is a factor of sectors_per_block
		 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
		 */
		limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
						  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
}

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);

	/*
	 * pt->adjusted_pf is a staging area for the actual features to use.
	 * They get transferred to the live pool in bind_control_target()
	 * called from pool_preresume().
	 */
	if (!pt->adjusted_pf.discard_enabled)
		return;

	disable_passdown_if_not_supported(pt);

	set_discard_limits(pt, limits);
}

static struct target_type pool_target = {
	.name = "thin-pool",
	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
		    DM_TARGET_IMMUTABLE,
	.version = {1, 5, 0},
	.module = THIS_MODULE,
	.ctr = pool_ctr,
	.dtr = pool_dtr,
	.map = pool_map,
	.postsuspend = pool_postsuspend,
	.preresume = pool_preresume,
	.resume = pool_resume,
	.message = pool_message,
	.status = pool_status,
	.merge = pool_merge,
	.iterate_devices = pool_iterate_devices,
	.io_hints = pool_io_hints,
};

/*----------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------*/
static void thin_dtr(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	mutex_lock(&dm_thin_pool_table.mutex);

	__pool_dec(tc->pool);
	dm_pool_close_thin_device(tc->td);
	dm_put_device(ti, tc->pool_dev);
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
	kfree(tc);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
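
/*
 * Illustrative example (paths are hypothetical): once "create_thin 0" has
 * been sent to the pool, the thin volume can be activated with a table line
 * such as
 *
 *	dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 *
 * An optional third argument names an external origin device, as described
 * above.
 */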
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct thin_c *tc;
	struct dm_dev *pool_dev, *origin_dev;
	struct mapped_device *pool_md;

	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc != 2 && argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}

	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
	if (!tc) {
		ti->error = "Out of memory";
		r = -ENOMEM;
		goto out_unlock;
	}

	if (argc == 3) {
		r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
		if (r) {
			ti->error = "Error opening origin device";
			goto bad_origin_dev;
		}
		tc->origin_dev = origin_dev;
	}

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
	if (r) {
		ti->error = "Error opening pool device";
		goto bad_pool_dev;
	}
	tc->pool_dev = pool_dev;

	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
		ti->error = "Invalid device id";
		r = -EINVAL;
		goto bad_common;
	}

	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
	if (!pool_md) {
		ti->error = "Couldn't get pool mapped device";
		r = -EINVAL;
		goto bad_common;
	}

	tc->pool = __pool_table_lookup(pool_md);
	if (!tc->pool) {
		ti->error = "Couldn't find pool object";
		r = -EINVAL;
		goto bad_pool_lookup;
	}
	__pool_inc(tc->pool);

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		ti->error = "Couldn't open thin device, Pool is in fail mode";
		r = -EINVAL;
		goto bad_thin_open;
	}

	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
	if (r) {
		ti->error = "Couldn't open thin internal device";
		goto bad_thin_open;
	}

	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
	if (r)
		goto bad_thin_open;

	ti->num_flush_requests = 1;
	ti->flush_supported = true;

	/* In case the pool supports discards, pass them on. */
	if (tc->pool->pf.discard_enabled) {
		ti->discards_supported = true;
		ti->num_discard_requests = 1;
		ti->discard_zeroes_data_unsupported = true;
		/* Discard requests must be split on a block boundary */
		ti->split_discard_requests = true;
	}

	dm_put(pool_md);

	mutex_unlock(&dm_thin_pool_table.mutex);

	return 0;

bad_thin_open:
	__pool_dec(tc->pool);
bad_pool_lookup:
	dm_put(pool_md);
bad_common:
	dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
bad_origin_dev:
	kfree(tc);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}

static int thin_map(struct dm_target *ti, struct bio *bio,
		    union map_info *map_context)
{
	bio->bi_sector = dm_target_offset(ti, bio->bi_sector);

	return thin_bio_map(ti, bio, map_context);
}

static int thin_endio(struct dm_target *ti,
		      struct bio *bio, int err,
		      union map_info *map_context)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = map_context->ptr;
	struct list_head work;
	struct dm_thin_new_mapping *m, *tmp;
	struct pool *pool = h->tc->pool;

	if (h->shared_read_entry) {
		INIT_LIST_HEAD(&work);
		dm_deferred_entry_dec(h->shared_read_entry, &work);

		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list) {
			list_del(&m->list);
			m->quiesced = 1;
			__maybe_add_mapping(m);
		}
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	if (h->all_io_entry) {
		INIT_LIST_HEAD(&work);
		dm_deferred_entry_dec(h->all_io_entry, &work);
		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list)
			list_add(&m->list, &pool->prepared_discards);
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	mempool_free(h, pool->endio_hook_pool);

	return 0;
}

static void thin_postsuspend(struct dm_target *ti)
{
	if (dm_noflush_suspending(ti))
		requeue_io((struct thin_c *)ti->private);
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
static int thin_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		DMEMIT("Fail");
		return 0;
	}

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r)
				return r;

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0)
				return r;

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			if (tc->origin_dev)
				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
			break;
		}
	}

	return 0;
}
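
/*
 * Illustrative STATUSTYPE_INFO output for a thin device (made-up numbers):
 * with a 128-sector block size and a single block mapped at virtual block 0
 * the line reads "128 127", i.e. 128 sectors are mapped and the highest
 * mapped sector is 127.  A device with no mappings reports "0 -".
 */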

static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	sector_t blocks;
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
	 * we follow a more convoluted path through to the pool's target.
	 */
	if (!pool->ti)
		return 0;	/* nothing is bound */

	blocks = pool->ti->len;
	(void) sector_div(blocks, pool->sectors_per_block);
	if (blocks)
		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);

	return 0;
}

/*
 * A thin device always inherits its queue limits from its pool.
 */
static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct thin_c *tc = ti->private;

	*limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
}

static struct target_type thin_target = {
	.name = "thin",
	.version = {1, 5, 0},
	.module = THIS_MODULE,
	.ctr = thin_ctr,
	.dtr = thin_dtr,
	.map = thin_map,
	.end_io = thin_endio,
	.postsuspend = thin_postsuspend,
	.status = thin_status,
	.iterate_devices = thin_iterate_devices,
	.io_hints = thin_io_hints,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
	int r;

	pool_table_init();

	r = dm_register_target(&thin_target);
	if (r)
		return r;

	r = dm_register_target(&pool_target);
	if (r)
		goto bad_pool_target;

	r = -ENOMEM;

	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
	if (!_new_mapping_cache)
		goto bad_new_mapping_cache;

	_endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
	if (!_endio_hook_cache)
		goto bad_endio_hook_cache;

	return 0;

bad_endio_hook_cache:
	kmem_cache_destroy(_new_mapping_cache);
bad_new_mapping_cache:
	dm_unregister_target(&pool_target);
bad_pool_target:
	dm_unregister_target(&thin_target);

	return r;
}

static void dm_thin_exit(void)
{
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);

	kmem_cache_destroy(_new_mapping_cache);
	kmem_cache_destroy(_endio_hook_cache);
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");