1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/mempool.h> 18 #include <linux/slab.h> 19 #include <linux/idr.h> 20 #include <linux/hdreg.h> 21 #include <linux/delay.h> 22 #include <linux/wait.h> 23 #include <linux/kthread.h> 24 #include <linux/ktime.h> 25 #include <linux/elevator.h> /* for rq_end_sector() */ 26 #include <linux/blk-mq.h> 27 28 #include <trace/events/block.h> 29 30 #define DM_MSG_PREFIX "core" 31 32 #ifdef CONFIG_PRINTK 33 /* 34 * ratelimit state to be used in DMXXX_LIMIT(). 35 */ 36 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 37 DEFAULT_RATELIMIT_INTERVAL, 38 DEFAULT_RATELIMIT_BURST); 39 EXPORT_SYMBOL(dm_ratelimit_state); 40 #endif 41 42 /* 43 * Cookies are numeric values sent with CHANGE and REMOVE 44 * uevents while resuming, removing or renaming the device. 45 */ 46 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 47 #define DM_COOKIE_LENGTH 24 48 49 static const char *_name = DM_NAME; 50 51 static unsigned int major = 0; 52 static unsigned int _major = 0; 53 54 static DEFINE_IDR(_minor_idr); 55 56 static DEFINE_SPINLOCK(_minor_lock); 57 58 static void do_deferred_remove(struct work_struct *w); 59 60 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 61 62 static struct workqueue_struct *deferred_remove_workqueue; 63 64 /* 65 * For bio-based dm. 66 * One of these is allocated per bio. 67 */ 68 struct dm_io { 69 struct mapped_device *md; 70 int error; 71 atomic_t io_count; 72 struct bio *bio; 73 unsigned long start_time; 74 spinlock_t endio_lock; 75 struct dm_stats_aux stats_aux; 76 }; 77 78 /* 79 * For request-based dm. 80 * One of these is allocated per request. 81 */ 82 struct dm_rq_target_io { 83 struct mapped_device *md; 84 struct dm_target *ti; 85 struct request *orig, *clone; 86 struct kthread_work work; 87 int error; 88 union map_info info; 89 }; 90 91 /* 92 * For request-based dm - the bio clones we allocate are embedded in these 93 * structs. 94 * 95 * We allocate these with bio_alloc_bioset, using the front_pad parameter when 96 * the bioset is created - this means the bio has to come at the end of the 97 * struct. 98 */ 99 struct dm_rq_clone_bio_info { 100 struct bio *orig; 101 struct dm_rq_target_io *tio; 102 struct bio clone; 103 }; 104 105 union map_info *dm_get_rq_mapinfo(struct request *rq) 106 { 107 if (rq && rq->end_io_data) 108 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 109 return NULL; 110 } 111 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 112 113 #define MINOR_ALLOCED ((void *)-1) 114 115 /* 116 * Bits for the md->flags field. 117 */ 118 #define DMF_BLOCK_IO_FOR_SUSPEND 0 119 #define DMF_SUSPENDED 1 120 #define DMF_FROZEN 2 121 #define DMF_FREEING 3 122 #define DMF_DELETING 4 123 #define DMF_NOFLUSH_SUSPENDING 5 124 #define DMF_MERGE_IS_OPTIONAL 6 125 #define DMF_DEFERRED_REMOVE 7 126 #define DMF_SUSPENDED_INTERNALLY 8 127 128 /* 129 * A dummy definition to make RCU happy. 130 * struct dm_table should never be dereferenced in this file. 131 */ 132 struct dm_table { 133 int undefined__; 134 }; 135 136 /* 137 * Work processed by per-device workqueue. 
138 */ 139 struct mapped_device { 140 struct srcu_struct io_barrier; 141 struct mutex suspend_lock; 142 atomic_t holders; 143 atomic_t open_count; 144 145 /* 146 * The current mapping. 147 * Use dm_get_live_table{_fast} or take suspend_lock for 148 * dereference. 149 */ 150 struct dm_table __rcu *map; 151 152 struct list_head table_devices; 153 struct mutex table_devices_lock; 154 155 unsigned long flags; 156 157 struct request_queue *queue; 158 unsigned type; 159 /* Protect queue and type against concurrent access. */ 160 struct mutex type_lock; 161 162 struct target_type *immutable_target_type; 163 164 struct gendisk *disk; 165 char name[16]; 166 167 void *interface_ptr; 168 169 /* 170 * A list of ios that arrived while we were suspended. 171 */ 172 atomic_t pending[2]; 173 wait_queue_head_t wait; 174 struct work_struct work; 175 struct bio_list deferred; 176 spinlock_t deferred_lock; 177 178 /* 179 * Processing queue (flush) 180 */ 181 struct workqueue_struct *wq; 182 183 /* 184 * io objects are allocated from here. 185 */ 186 mempool_t *io_pool; 187 mempool_t *rq_pool; 188 189 struct bio_set *bs; 190 191 /* 192 * Event handling. 193 */ 194 atomic_t event_nr; 195 wait_queue_head_t eventq; 196 atomic_t uevent_seq; 197 struct list_head uevent_list; 198 spinlock_t uevent_lock; /* Protect access to uevent_list */ 199 200 /* 201 * freeze/thaw support require holding onto a super block 202 */ 203 struct super_block *frozen_sb; 204 struct block_device *bdev; 205 206 /* forced geometry settings */ 207 struct hd_geometry geometry; 208 209 /* kobject and completion */ 210 struct dm_kobject_holder kobj_holder; 211 212 /* zero-length flush that will be cloned and submitted to targets */ 213 struct bio flush_bio; 214 215 /* the number of internal suspends */ 216 unsigned internal_suspend_count; 217 218 struct dm_stats stats; 219 220 struct kthread_worker kworker; 221 struct task_struct *kworker_task; 222 223 /* for request-based merge heuristic in dm_request_fn() */ 224 unsigned seq_rq_merge_deadline_usecs; 225 int last_rq_rw; 226 sector_t last_rq_pos; 227 ktime_t last_rq_start_time; 228 229 /* for blk-mq request-based DM support */ 230 struct blk_mq_tag_set tag_set; 231 bool use_blk_mq; 232 }; 233 234 #ifdef CONFIG_DM_MQ_DEFAULT 235 static bool use_blk_mq = true; 236 #else 237 static bool use_blk_mq = false; 238 #endif 239 240 bool dm_use_blk_mq(struct mapped_device *md) 241 { 242 return md->use_blk_mq; 243 } 244 245 /* 246 * For mempools pre-allocation at the table loading time. 247 */ 248 struct dm_md_mempools { 249 mempool_t *io_pool; 250 mempool_t *rq_pool; 251 struct bio_set *bs; 252 }; 253 254 struct table_device { 255 struct list_head list; 256 atomic_t count; 257 struct dm_dev dm_dev; 258 }; 259 260 #define RESERVED_BIO_BASED_IOS 16 261 #define RESERVED_REQUEST_BASED_IOS 256 262 #define RESERVED_MAX_IOS 1024 263 static struct kmem_cache *_io_cache; 264 static struct kmem_cache *_rq_tio_cache; 265 static struct kmem_cache *_rq_cache; 266 267 /* 268 * Bio-based DM's mempools' reserved IOs set by the user. 269 */ 270 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 271 272 /* 273 * Request-based DM's mempools' reserved IOs set by the user. 
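 * (Illustrative example of the clamping done by __dm_get_module_param()
 * below: a value of 0 falls back to the default of RESERVED_REQUEST_BASED_IOS
 * (256), and a value above RESERVED_MAX_IOS (1024) is clamped to 1024.)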
274 */ 275 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 276 277 static unsigned __dm_get_module_param(unsigned *module_param, 278 unsigned def, unsigned max) 279 { 280 unsigned param = ACCESS_ONCE(*module_param); 281 unsigned modified_param = 0; 282 283 if (!param) 284 modified_param = def; 285 else if (param > max) 286 modified_param = max; 287 288 if (modified_param) { 289 (void)cmpxchg(module_param, param, modified_param); 290 param = modified_param; 291 } 292 293 return param; 294 } 295 296 unsigned dm_get_reserved_bio_based_ios(void) 297 { 298 return __dm_get_module_param(&reserved_bio_based_ios, 299 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 300 } 301 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 302 303 unsigned dm_get_reserved_rq_based_ios(void) 304 { 305 return __dm_get_module_param(&reserved_rq_based_ios, 306 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 307 } 308 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 309 310 static int __init local_init(void) 311 { 312 int r = -ENOMEM; 313 314 /* allocate a slab for the dm_ios */ 315 _io_cache = KMEM_CACHE(dm_io, 0); 316 if (!_io_cache) 317 return r; 318 319 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 320 if (!_rq_tio_cache) 321 goto out_free_io_cache; 322 323 _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), 324 __alignof__(struct request), 0, NULL); 325 if (!_rq_cache) 326 goto out_free_rq_tio_cache; 327 328 r = dm_uevent_init(); 329 if (r) 330 goto out_free_rq_cache; 331 332 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 333 if (!deferred_remove_workqueue) { 334 r = -ENOMEM; 335 goto out_uevent_exit; 336 } 337 338 _major = major; 339 r = register_blkdev(_major, _name); 340 if (r < 0) 341 goto out_free_workqueue; 342 343 if (!_major) 344 _major = r; 345 346 return 0; 347 348 out_free_workqueue: 349 destroy_workqueue(deferred_remove_workqueue); 350 out_uevent_exit: 351 dm_uevent_exit(); 352 out_free_rq_cache: 353 kmem_cache_destroy(_rq_cache); 354 out_free_rq_tio_cache: 355 kmem_cache_destroy(_rq_tio_cache); 356 out_free_io_cache: 357 kmem_cache_destroy(_io_cache); 358 359 return r; 360 } 361 362 static void local_exit(void) 363 { 364 flush_scheduled_work(); 365 destroy_workqueue(deferred_remove_workqueue); 366 367 kmem_cache_destroy(_rq_cache); 368 kmem_cache_destroy(_rq_tio_cache); 369 kmem_cache_destroy(_io_cache); 370 unregister_blkdev(_major, _name); 371 dm_uevent_exit(); 372 373 _major = 0; 374 375 DMINFO("cleaned up"); 376 } 377 378 static int (*_inits[])(void) __initdata = { 379 local_init, 380 dm_target_init, 381 dm_linear_init, 382 dm_stripe_init, 383 dm_io_init, 384 dm_kcopyd_init, 385 dm_interface_init, 386 dm_statistics_init, 387 }; 388 389 static void (*_exits[])(void) = { 390 local_exit, 391 dm_target_exit, 392 dm_linear_exit, 393 dm_stripe_exit, 394 dm_io_exit, 395 dm_kcopyd_exit, 396 dm_interface_exit, 397 dm_statistics_exit, 398 }; 399 400 static int __init dm_init(void) 401 { 402 const int count = ARRAY_SIZE(_inits); 403 404 int r, i; 405 406 for (i = 0; i < count; i++) { 407 r = _inits[i](); 408 if (r) 409 goto bad; 410 } 411 412 return 0; 413 414 bad: 415 while (i--) 416 _exits[i](); 417 418 return r; 419 } 420 421 static void __exit dm_exit(void) 422 { 423 int i = ARRAY_SIZE(_exits); 424 425 while (i--) 426 _exits[i](); 427 428 /* 429 * Should be empty by this point. 
430 */ 431 idr_destroy(&_minor_idr); 432 } 433 434 /* 435 * Block device functions 436 */ 437 int dm_deleting_md(struct mapped_device *md) 438 { 439 return test_bit(DMF_DELETING, &md->flags); 440 } 441 442 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 443 { 444 struct mapped_device *md; 445 446 spin_lock(&_minor_lock); 447 448 md = bdev->bd_disk->private_data; 449 if (!md) 450 goto out; 451 452 if (test_bit(DMF_FREEING, &md->flags) || 453 dm_deleting_md(md)) { 454 md = NULL; 455 goto out; 456 } 457 458 dm_get(md); 459 atomic_inc(&md->open_count); 460 out: 461 spin_unlock(&_minor_lock); 462 463 return md ? 0 : -ENXIO; 464 } 465 466 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 467 { 468 struct mapped_device *md; 469 470 spin_lock(&_minor_lock); 471 472 md = disk->private_data; 473 if (WARN_ON(!md)) 474 goto out; 475 476 if (atomic_dec_and_test(&md->open_count) && 477 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 478 queue_work(deferred_remove_workqueue, &deferred_remove_work); 479 480 dm_put(md); 481 out: 482 spin_unlock(&_minor_lock); 483 } 484 485 int dm_open_count(struct mapped_device *md) 486 { 487 return atomic_read(&md->open_count); 488 } 489 490 /* 491 * Guarantees nothing is using the device before it's deleted. 492 */ 493 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 494 { 495 int r = 0; 496 497 spin_lock(&_minor_lock); 498 499 if (dm_open_count(md)) { 500 r = -EBUSY; 501 if (mark_deferred) 502 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 503 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 504 r = -EEXIST; 505 else 506 set_bit(DMF_DELETING, &md->flags); 507 508 spin_unlock(&_minor_lock); 509 510 return r; 511 } 512 513 int dm_cancel_deferred_remove(struct mapped_device *md) 514 { 515 int r = 0; 516 517 spin_lock(&_minor_lock); 518 519 if (test_bit(DMF_DELETING, &md->flags)) 520 r = -EBUSY; 521 else 522 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 523 524 spin_unlock(&_minor_lock); 525 526 return r; 527 } 528 529 static void do_deferred_remove(struct work_struct *w) 530 { 531 dm_deferred_remove(); 532 } 533 534 sector_t dm_get_size(struct mapped_device *md) 535 { 536 return get_capacity(md->disk); 537 } 538 539 struct request_queue *dm_get_md_queue(struct mapped_device *md) 540 { 541 return md->queue; 542 } 543 544 struct dm_stats *dm_get_stats(struct mapped_device *md) 545 { 546 return &md->stats; 547 } 548 549 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 550 { 551 struct mapped_device *md = bdev->bd_disk->private_data; 552 553 return dm_get_geometry(md, geo); 554 } 555 556 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 557 unsigned int cmd, unsigned long arg) 558 { 559 struct mapped_device *md = bdev->bd_disk->private_data; 560 int srcu_idx; 561 struct dm_table *map; 562 struct dm_target *tgt; 563 int r = -ENOTTY; 564 565 retry: 566 map = dm_get_live_table(md, &srcu_idx); 567 568 if (!map || !dm_table_get_size(map)) 569 goto out; 570 571 /* We only support devices that have a single target */ 572 if (dm_table_get_num_targets(map) != 1) 573 goto out; 574 575 tgt = dm_table_get_target(map, 0); 576 if (!tgt->type->ioctl) 577 goto out; 578 579 if (dm_suspended_md(md)) { 580 r = -EAGAIN; 581 goto out; 582 } 583 584 r = tgt->type->ioctl(tgt, cmd, arg); 585 586 out: 587 dm_put_live_table(md, srcu_idx); 588 589 if (r == -ENOTCONN) { 590 msleep(10); 591 goto retry; 592 } 593 594 return r; 595 } 596 597 static struct dm_io *alloc_io(struct 
mapped_device *md) 598 { 599 return mempool_alloc(md->io_pool, GFP_NOIO); 600 } 601 602 static void free_io(struct mapped_device *md, struct dm_io *io) 603 { 604 mempool_free(io, md->io_pool); 605 } 606 607 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 608 { 609 bio_put(&tio->clone); 610 } 611 612 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 613 gfp_t gfp_mask) 614 { 615 return mempool_alloc(md->io_pool, gfp_mask); 616 } 617 618 static void free_rq_tio(struct dm_rq_target_io *tio) 619 { 620 mempool_free(tio, tio->md->io_pool); 621 } 622 623 static struct request *alloc_clone_request(struct mapped_device *md, 624 gfp_t gfp_mask) 625 { 626 return mempool_alloc(md->rq_pool, gfp_mask); 627 } 628 629 static void free_clone_request(struct mapped_device *md, struct request *rq) 630 { 631 mempool_free(rq, md->rq_pool); 632 } 633 634 static int md_in_flight(struct mapped_device *md) 635 { 636 return atomic_read(&md->pending[READ]) + 637 atomic_read(&md->pending[WRITE]); 638 } 639 640 static void start_io_acct(struct dm_io *io) 641 { 642 struct mapped_device *md = io->md; 643 struct bio *bio = io->bio; 644 int cpu; 645 int rw = bio_data_dir(bio); 646 647 io->start_time = jiffies; 648 649 cpu = part_stat_lock(); 650 part_round_stats(cpu, &dm_disk(md)->part0); 651 part_stat_unlock(); 652 atomic_set(&dm_disk(md)->part0.in_flight[rw], 653 atomic_inc_return(&md->pending[rw])); 654 655 if (unlikely(dm_stats_used(&md->stats))) 656 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 657 bio_sectors(bio), false, 0, &io->stats_aux); 658 } 659 660 static void end_io_acct(struct dm_io *io) 661 { 662 struct mapped_device *md = io->md; 663 struct bio *bio = io->bio; 664 unsigned long duration = jiffies - io->start_time; 665 int pending; 666 int rw = bio_data_dir(bio); 667 668 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); 669 670 if (unlikely(dm_stats_used(&md->stats))) 671 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 672 bio_sectors(bio), true, duration, &io->stats_aux); 673 674 /* 675 * After this is decremented the bio must not be touched if it is 676 * a flush. 677 */ 678 pending = atomic_dec_return(&md->pending[rw]); 679 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 680 pending += atomic_read(&md->pending[rw^0x1]); 681 682 /* nudge anyone waiting on suspend queue */ 683 if (!pending) 684 wake_up(&md->wait); 685 } 686 687 /* 688 * Add the bio to the list of deferred io. 689 */ 690 static void queue_io(struct mapped_device *md, struct bio *bio) 691 { 692 unsigned long flags; 693 694 spin_lock_irqsave(&md->deferred_lock, flags); 695 bio_list_add(&md->deferred, bio); 696 spin_unlock_irqrestore(&md->deferred_lock, flags); 697 queue_work(md->wq, &md->work); 698 } 699 700 /* 701 * Everyone (including functions in this file), should use this 702 * function to access the md->map field, and make sure they call 703 * dm_put_live_table() when finished. 
704 */ 705 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 706 { 707 *srcu_idx = srcu_read_lock(&md->io_barrier); 708 709 return srcu_dereference(md->map, &md->io_barrier); 710 } 711 712 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 713 { 714 srcu_read_unlock(&md->io_barrier, srcu_idx); 715 } 716 717 void dm_sync_table(struct mapped_device *md) 718 { 719 synchronize_srcu(&md->io_barrier); 720 synchronize_rcu_expedited(); 721 } 722 723 /* 724 * A fast alternative to dm_get_live_table/dm_put_live_table. 725 * The caller must not block between these two functions. 726 */ 727 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 728 { 729 rcu_read_lock(); 730 return rcu_dereference(md->map); 731 } 732 733 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 734 { 735 rcu_read_unlock(); 736 } 737 738 /* 739 * Open a table device so we can use it as a map destination. 740 */ 741 static int open_table_device(struct table_device *td, dev_t dev, 742 struct mapped_device *md) 743 { 744 static char *_claim_ptr = "I belong to device-mapper"; 745 struct block_device *bdev; 746 747 int r; 748 749 BUG_ON(td->dm_dev.bdev); 750 751 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); 752 if (IS_ERR(bdev)) 753 return PTR_ERR(bdev); 754 755 r = bd_link_disk_holder(bdev, dm_disk(md)); 756 if (r) { 757 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 758 return r; 759 } 760 761 td->dm_dev.bdev = bdev; 762 return 0; 763 } 764 765 /* 766 * Close a table device that we've been using. 767 */ 768 static void close_table_device(struct table_device *td, struct mapped_device *md) 769 { 770 if (!td->dm_dev.bdev) 771 return; 772 773 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 774 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 775 td->dm_dev.bdev = NULL; 776 } 777 778 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 779 fmode_t mode) { 780 struct table_device *td; 781 782 list_for_each_entry(td, l, list) 783 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 784 return td; 785 786 return NULL; 787 } 788 789 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 790 struct dm_dev **result) { 791 int r; 792 struct table_device *td; 793 794 mutex_lock(&md->table_devices_lock); 795 td = find_table_device(&md->table_devices, dev, mode); 796 if (!td) { 797 td = kmalloc(sizeof(*td), GFP_KERNEL); 798 if (!td) { 799 mutex_unlock(&md->table_devices_lock); 800 return -ENOMEM; 801 } 802 803 td->dm_dev.mode = mode; 804 td->dm_dev.bdev = NULL; 805 806 if ((r = open_table_device(td, dev, md))) { 807 mutex_unlock(&md->table_devices_lock); 808 kfree(td); 809 return r; 810 } 811 812 format_dev_t(td->dm_dev.name, dev); 813 814 atomic_set(&td->count, 0); 815 list_add(&td->list, &md->table_devices); 816 } 817 atomic_inc(&td->count); 818 mutex_unlock(&md->table_devices_lock); 819 820 *result = &td->dm_dev; 821 return 0; 822 } 823 EXPORT_SYMBOL_GPL(dm_get_table_device); 824 825 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 826 { 827 struct table_device *td = container_of(d, struct table_device, dm_dev); 828 829 mutex_lock(&md->table_devices_lock); 830 if (atomic_dec_and_test(&td->count)) { 831 close_table_device(td, md); 832 list_del(&td->list); 833 kfree(td); 834 } 835 mutex_unlock(&md->table_devices_lock); 836 } 837 EXPORT_SYMBOL(dm_put_table_device); 838 839 
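/*
 * Illustrative sketch only (not part of the original driver): the usage
 * pattern that the dm_get_live_table() comment above asks every caller to
 * follow. The helper is hypothetical; dm_table_get_size() just stands in
 * for whatever work is done while the SRCU reference is held.
 */
static inline sector_t __example_live_table_usage(struct mapped_device *md)
{
        int srcu_idx;
        sector_t size = 0;
        struct dm_table *map = dm_get_live_table(md, &srcu_idx);

        if (map)
                size = dm_table_get_size(map);

        /* The table must not be used once the SRCU read lock is dropped. */
        dm_put_live_table(md, srcu_idx);

        return size;
}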
static void free_table_devices(struct list_head *devices) 840 { 841 struct list_head *tmp, *next; 842 843 list_for_each_safe(tmp, next, devices) { 844 struct table_device *td = list_entry(tmp, struct table_device, list); 845 846 DMWARN("dm_destroy: %s still exists with %d references", 847 td->dm_dev.name, atomic_read(&td->count)); 848 kfree(td); 849 } 850 } 851 852 /* 853 * Get the geometry associated with a dm device 854 */ 855 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 856 { 857 *geo = md->geometry; 858 859 return 0; 860 } 861 862 /* 863 * Set the geometry of a device. 864 */ 865 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 866 { 867 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 868 869 if (geo->start > sz) { 870 DMWARN("Start sector is beyond the geometry limits."); 871 return -EINVAL; 872 } 873 874 md->geometry = *geo; 875 876 return 0; 877 } 878 879 /*----------------------------------------------------------------- 880 * CRUD START: 881 * A more elegant soln is in the works that uses the queue 882 * merge fn, unfortunately there are a couple of changes to 883 * the block layer that I want to make for this. So in the 884 * interests of getting something for people to use I give 885 * you this clearly demarcated crap. 886 *---------------------------------------------------------------*/ 887 888 static int __noflush_suspending(struct mapped_device *md) 889 { 890 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 891 } 892 893 /* 894 * Decrements the number of outstanding ios that a bio has been 895 * cloned into, completing the original io if necc. 896 */ 897 static void dec_pending(struct dm_io *io, int error) 898 { 899 unsigned long flags; 900 int io_error; 901 struct bio *bio; 902 struct mapped_device *md = io->md; 903 904 /* Push-back supersedes any I/O errors */ 905 if (unlikely(error)) { 906 spin_lock_irqsave(&io->endio_lock, flags); 907 if (!(io->error > 0 && __noflush_suspending(md))) 908 io->error = error; 909 spin_unlock_irqrestore(&io->endio_lock, flags); 910 } 911 912 if (atomic_dec_and_test(&io->io_count)) { 913 if (io->error == DM_ENDIO_REQUEUE) { 914 /* 915 * Target requested pushing back the I/O. 916 */ 917 spin_lock_irqsave(&md->deferred_lock, flags); 918 if (__noflush_suspending(md)) 919 bio_list_add_head(&md->deferred, io->bio); 920 else 921 /* noflush suspend was interrupted. */ 922 io->error = -EIO; 923 spin_unlock_irqrestore(&md->deferred_lock, flags); 924 } 925 926 io_error = io->error; 927 bio = io->bio; 928 end_io_acct(io); 929 free_io(md, io); 930 931 if (io_error == DM_ENDIO_REQUEUE) 932 return; 933 934 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { 935 /* 936 * Preflush done for flush with data, reissue 937 * without REQ_FLUSH. 
                         */
                        bio->bi_rw &= ~REQ_FLUSH;
                        queue_io(md, bio);
                } else {
                        /* done with normal IO or empty flush */
                        trace_block_bio_complete(md->queue, bio, io_error);
                        bio_endio(bio, io_error);
                }
        }
}

static void disable_write_same(struct mapped_device *md)
{
        struct queue_limits *limits = dm_get_queue_limits(md);

        /* device doesn't really support WRITE SAME, disable it */
        limits->max_write_same_sectors = 0;
}

static void clone_endio(struct bio *bio, int error)
{
        int r = error;
        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
        struct dm_io *io = tio->io;
        struct mapped_device *md = tio->io->md;
        dm_endio_fn endio = tio->ti->type->end_io;

        if (!bio_flagged(bio, BIO_UPTODATE) && !error)
                error = -EIO;

        if (endio) {
                r = endio(tio->ti, bio, error);
                if (r < 0 || r == DM_ENDIO_REQUEUE)
                        /*
                         * error and requeue request are handled
                         * in dec_pending().
                         */
                        error = r;
                else if (r == DM_ENDIO_INCOMPLETE)
                        /* The target will handle the io */
                        return;
                else if (r) {
                        DMWARN("unimplemented target endio return value: %d", r);
                        BUG();
                }
        }

        if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
                     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
                disable_write_same(md);

        free_tio(md, tio);
        dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
        struct dm_rq_clone_bio_info *info =
                container_of(clone, struct dm_rq_clone_bio_info, clone);
        struct dm_rq_target_io *tio = info->tio;
        struct bio *bio = info->orig;
        unsigned int nr_bytes = info->orig->bi_iter.bi_size;

        bio_put(clone);

        if (tio->error)
                /*
                 * An error has already been detected on the request, so
                 * just let clone->end_io() handle the remainder.
                 */
                return;
        else if (error) {
                /*
                 * Don't report the error to the upper layer yet.
                 * The error handling decision is made by the target driver
                 * when the request is completed.
                 */
                tio->error = error;
                return;
        }

        /*
         * I/O for the bio successfully completed.
         * Report the data completion to the upper layer.
         */

        /*
         * bios are processed from the head of the list.
         * So the completing bio should always be rq->bio.
         * If it's not, something is wrong.
         */
        if (tio->orig->bio != bio)
                DMERR("bio completion is going in the middle of the request");

        /*
         * Update the original request.
         * Do not use blk_end_request() here, because it may complete
         * the original request before the clone, and break the ordering.
         */
        blk_update_request(tio->orig, 0, nr_bytes);
}

static struct dm_rq_target_io *tio_from_request(struct request *rq)
{
        return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
}

/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Alternatively, take a reference with dm_get() before calling this
 * function and drop it with dm_put() later.
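 *
 * A minimal sketch of the intended pairing (illustrative only, mirroring
 * dm_start_request() and rq_completed() in this file):
 *
 *      dm_get(md);                       - taken when the request is started
 *      ... the clone request completes ...
 *      rq_completed(md, rw, run_queue);  - last step; drops that reference
 *                                          via dm_put()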
1053 */ 1054 static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 1055 { 1056 int nr_requests_pending; 1057 1058 atomic_dec(&md->pending[rw]); 1059 1060 /* nudge anyone waiting on suspend queue */ 1061 nr_requests_pending = md_in_flight(md); 1062 if (!nr_requests_pending) 1063 wake_up(&md->wait); 1064 1065 /* 1066 * Run this off this callpath, as drivers could invoke end_io while 1067 * inside their request_fn (and holding the queue lock). Calling 1068 * back into ->request_fn() could deadlock attempting to grab the 1069 * queue lock again. 1070 */ 1071 if (run_queue) { 1072 if (md->queue->mq_ops) 1073 blk_mq_run_hw_queues(md->queue, true); 1074 else if (!nr_requests_pending || 1075 (nr_requests_pending >= md->queue->nr_congestion_on)) 1076 blk_run_queue_async(md->queue); 1077 } 1078 1079 /* 1080 * dm_put() must be at the end of this function. See the comment above 1081 */ 1082 dm_put(md); 1083 } 1084 1085 static void free_rq_clone(struct request *clone) 1086 { 1087 struct dm_rq_target_io *tio = clone->end_io_data; 1088 struct mapped_device *md = tio->md; 1089 1090 blk_rq_unprep_clone(clone); 1091 1092 if (md->type == DM_TYPE_MQ_REQUEST_BASED) 1093 /* stacked on blk-mq queue(s) */ 1094 tio->ti->type->release_clone_rq(clone); 1095 else if (!md->queue->mq_ops) 1096 /* request_fn queue stacked on request_fn queue(s) */ 1097 free_clone_request(md, clone); 1098 /* 1099 * NOTE: for the blk-mq queue stacked on request_fn queue(s) case: 1100 * no need to call free_clone_request() because we leverage blk-mq by 1101 * allocating the clone at the end of the blk-mq pdu (see: clone_rq) 1102 */ 1103 1104 if (!md->queue->mq_ops) 1105 free_rq_tio(tio); 1106 } 1107 1108 /* 1109 * Complete the clone and the original request. 1110 * Must be called without clone's queue lock held, 1111 * see end_clone_request() for more details. 1112 */ 1113 static void dm_end_request(struct request *clone, int error) 1114 { 1115 int rw = rq_data_dir(clone); 1116 struct dm_rq_target_io *tio = clone->end_io_data; 1117 struct mapped_device *md = tio->md; 1118 struct request *rq = tio->orig; 1119 1120 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 1121 rq->errors = clone->errors; 1122 rq->resid_len = clone->resid_len; 1123 1124 if (rq->sense) 1125 /* 1126 * We are using the sense buffer of the original 1127 * request. 1128 * So setting the length of the sense data is enough. 1129 */ 1130 rq->sense_len = clone->sense_len; 1131 } 1132 1133 free_rq_clone(clone); 1134 if (!rq->q->mq_ops) 1135 blk_end_request_all(rq, error); 1136 else 1137 blk_mq_end_request(rq, error); 1138 rq_completed(md, rw, true); 1139 } 1140 1141 static void dm_unprep_request(struct request *rq) 1142 { 1143 struct dm_rq_target_io *tio = tio_from_request(rq); 1144 struct request *clone = tio->clone; 1145 1146 if (!rq->q->mq_ops) { 1147 rq->special = NULL; 1148 rq->cmd_flags &= ~REQ_DONTPREP; 1149 } 1150 1151 if (clone) 1152 free_rq_clone(clone); 1153 } 1154 1155 /* 1156 * Requeue the original request of a clone. 
1157 */ 1158 static void old_requeue_request(struct request *rq) 1159 { 1160 struct request_queue *q = rq->q; 1161 unsigned long flags; 1162 1163 spin_lock_irqsave(q->queue_lock, flags); 1164 blk_requeue_request(q, rq); 1165 blk_run_queue_async(q); 1166 spin_unlock_irqrestore(q->queue_lock, flags); 1167 } 1168 1169 static void dm_requeue_unmapped_original_request(struct mapped_device *md, 1170 struct request *rq) 1171 { 1172 int rw = rq_data_dir(rq); 1173 1174 dm_unprep_request(rq); 1175 1176 if (!rq->q->mq_ops) 1177 old_requeue_request(rq); 1178 else { 1179 blk_mq_requeue_request(rq); 1180 blk_mq_kick_requeue_list(rq->q); 1181 } 1182 1183 rq_completed(md, rw, false); 1184 } 1185 1186 static void dm_requeue_unmapped_request(struct request *clone) 1187 { 1188 struct dm_rq_target_io *tio = clone->end_io_data; 1189 1190 dm_requeue_unmapped_original_request(tio->md, tio->orig); 1191 } 1192 1193 static void old_stop_queue(struct request_queue *q) 1194 { 1195 unsigned long flags; 1196 1197 if (blk_queue_stopped(q)) 1198 return; 1199 1200 spin_lock_irqsave(q->queue_lock, flags); 1201 blk_stop_queue(q); 1202 spin_unlock_irqrestore(q->queue_lock, flags); 1203 } 1204 1205 static void stop_queue(struct request_queue *q) 1206 { 1207 if (!q->mq_ops) 1208 old_stop_queue(q); 1209 else 1210 blk_mq_stop_hw_queues(q); 1211 } 1212 1213 static void old_start_queue(struct request_queue *q) 1214 { 1215 unsigned long flags; 1216 1217 spin_lock_irqsave(q->queue_lock, flags); 1218 if (blk_queue_stopped(q)) 1219 blk_start_queue(q); 1220 spin_unlock_irqrestore(q->queue_lock, flags); 1221 } 1222 1223 static void start_queue(struct request_queue *q) 1224 { 1225 if (!q->mq_ops) 1226 old_start_queue(q); 1227 else 1228 blk_mq_start_stopped_hw_queues(q, true); 1229 } 1230 1231 static void dm_done(struct request *clone, int error, bool mapped) 1232 { 1233 int r = error; 1234 struct dm_rq_target_io *tio = clone->end_io_data; 1235 dm_request_endio_fn rq_end_io = NULL; 1236 1237 if (tio->ti) { 1238 rq_end_io = tio->ti->type->rq_end_io; 1239 1240 if (mapped && rq_end_io) 1241 r = rq_end_io(tio->ti, clone, error, &tio->info); 1242 } 1243 1244 if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && 1245 !clone->q->limits.max_write_same_sectors)) 1246 disable_write_same(tio->md); 1247 1248 if (r <= 0) 1249 /* The target wants to complete the I/O */ 1250 dm_end_request(clone, r); 1251 else if (r == DM_ENDIO_INCOMPLETE) 1252 /* The target will handle the I/O */ 1253 return; 1254 else if (r == DM_ENDIO_REQUEUE) 1255 /* The target wants to requeue the I/O */ 1256 dm_requeue_unmapped_request(clone); 1257 else { 1258 DMWARN("unimplemented target endio return value: %d", r); 1259 BUG(); 1260 } 1261 } 1262 1263 /* 1264 * Request completion handler for request-based dm 1265 */ 1266 static void dm_softirq_done(struct request *rq) 1267 { 1268 bool mapped = true; 1269 struct dm_rq_target_io *tio = tio_from_request(rq); 1270 struct request *clone = tio->clone; 1271 int rw; 1272 1273 if (!clone) { 1274 rw = rq_data_dir(rq); 1275 if (!rq->q->mq_ops) { 1276 blk_end_request_all(rq, tio->error); 1277 rq_completed(tio->md, rw, false); 1278 free_rq_tio(tio); 1279 } else { 1280 blk_mq_end_request(rq, tio->error); 1281 rq_completed(tio->md, rw, false); 1282 } 1283 return; 1284 } 1285 1286 if (rq->cmd_flags & REQ_FAILED) 1287 mapped = false; 1288 1289 dm_done(clone, tio->error, mapped); 1290 } 1291 1292 /* 1293 * Complete the clone and the original request with the error status 1294 * through softirq context. 
 */
static void dm_complete_request(struct request *rq, int error)
{
        struct dm_rq_target_io *tio = tio_from_request(rq);

        tio->error = error;
        blk_complete_request(rq);
}

/*
 * Complete the unmapped clone and the original request with the error status
 * through softirq context.
 * The target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
 */
static void dm_kill_unmapped_request(struct request *rq, int error)
{
        rq->cmd_flags |= REQ_FAILED;
        dm_complete_request(rq, error);
}

/*
 * Called with the clone's queue lock held (for non-blk-mq)
 */
static void end_clone_request(struct request *clone, int error)
{
        struct dm_rq_target_io *tio = clone->end_io_data;

        if (!clone->q->mq_ops) {
                /*
                 * This only cleans up the bookkeeping of the queue in which
                 * the clone was dispatched.
                 * The clone is *NOT* actually freed here because it was
                 * allocated from dm's own mempool (REQ_ALLOCED isn't set).
                 */
                __blk_put_request(clone->q, clone);
        }

        /*
         * Actual request completion is done in a softirq context which doesn't
         * hold the clone's queue lock. Otherwise, deadlock could occur because:
         *     - another request may be submitted by the upper level driver
         *       of the stack during the completion
         *     - the submission, which requires the queue lock, may be done
         *       against this clone's queue
         */
        dm_complete_request(tio->orig, error);
}

/*
 * Return the maximum size of I/O possible at the supplied sector up to the
 * current target boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
        sector_t target_offset = dm_target_offset(ti, sector);

        return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
        sector_t len = max_io_len_target_boundary(sector, ti);
        sector_t offset, max_len;

        /*
         * Does the target need to split even further?
         */
        if (ti->max_io_len) {
                offset = dm_target_offset(ti, sector);
                if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
                        max_len = sector_div(offset, ti->max_io_len);
                else
                        max_len = offset & (ti->max_io_len - 1);
                max_len = ti->max_io_len - max_len;

                if (len > max_len)
                        len = max_len;
        }

        return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
        if (len > UINT_MAX) {
                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
                      (unsigned long long)len, UINT_MAX);
                ti->error = "Maximum size of target IO is too large";
                return -EINVAL;
        }

        ti->max_io_len = (uint32_t) len;

        return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

/*
 * A target may call dm_accept_partial_bio only from the map routine. It is
 * allowed for all bio types except REQ_FLUSH.
 *
 * dm_accept_partial_bio informs dm that the target only wants to process
 * n_sectors additional sectors of the bio and that the rest of the data should
 * be sent in the next bio.
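 *
 * A hypothetical target map function might use it like this (sketch only;
 * the 64-sector cap and the names are examples, not part of this file):
 *
 *      static int example_map(struct dm_target *ti, struct bio *bio)
 *      {
 *              if (bio_sectors(bio) > 64)
 *                      dm_accept_partial_bio(bio, 64);
 *              bio->bi_bdev = <underlying device>;
 *              return DM_MAPIO_REMAPPED;
 *      }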
1400 * 1401 * A diagram that explains the arithmetics: 1402 * +--------------------+---------------+-------+ 1403 * | 1 | 2 | 3 | 1404 * +--------------------+---------------+-------+ 1405 * 1406 * <-------------- *tio->len_ptr ---------------> 1407 * <------- bi_size -------> 1408 * <-- n_sectors --> 1409 * 1410 * Region 1 was already iterated over with bio_advance or similar function. 1411 * (it may be empty if the target doesn't use bio_advance) 1412 * Region 2 is the remaining bio size that the target wants to process. 1413 * (it may be empty if region 1 is non-empty, although there is no reason 1414 * to make it empty) 1415 * The target requires that region 3 is to be sent in the next bio. 1416 * 1417 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1418 * the partially processed part (the sum of regions 1+2) must be the same for all 1419 * copies of the bio. 1420 */ 1421 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1422 { 1423 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1424 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1425 BUG_ON(bio->bi_rw & REQ_FLUSH); 1426 BUG_ON(bi_size > *tio->len_ptr); 1427 BUG_ON(n_sectors > bi_size); 1428 *tio->len_ptr -= bi_size - n_sectors; 1429 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1430 } 1431 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1432 1433 static void __map_bio(struct dm_target_io *tio) 1434 { 1435 int r; 1436 sector_t sector; 1437 struct mapped_device *md; 1438 struct bio *clone = &tio->clone; 1439 struct dm_target *ti = tio->ti; 1440 1441 clone->bi_end_io = clone_endio; 1442 1443 /* 1444 * Map the clone. If r == 0 we don't need to do 1445 * anything, the target has assumed ownership of 1446 * this io. 1447 */ 1448 atomic_inc(&tio->io->io_count); 1449 sector = clone->bi_iter.bi_sector; 1450 r = ti->type->map(ti, clone); 1451 if (r == DM_MAPIO_REMAPPED) { 1452 /* the bio has been remapped so dispatch it */ 1453 1454 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1455 tio->io->bio->bi_bdev->bd_dev, sector); 1456 1457 generic_make_request(clone); 1458 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1459 /* error the io and bail out, or requeue it if needed */ 1460 md = tio->io->md; 1461 dec_pending(tio->io, r); 1462 free_tio(md, tio); 1463 } else if (r) { 1464 DMWARN("unimplemented target map return value: %d", r); 1465 BUG(); 1466 } 1467 } 1468 1469 struct clone_info { 1470 struct mapped_device *md; 1471 struct dm_table *map; 1472 struct bio *bio; 1473 struct dm_io *io; 1474 sector_t sector; 1475 unsigned sector_count; 1476 }; 1477 1478 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1479 { 1480 bio->bi_iter.bi_sector = sector; 1481 bio->bi_iter.bi_size = to_bytes(len); 1482 } 1483 1484 /* 1485 * Creates a bio that consists of range of complete bvecs. 
1486 */ 1487 static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1488 sector_t sector, unsigned len) 1489 { 1490 struct bio *clone = &tio->clone; 1491 1492 __bio_clone_fast(clone, bio); 1493 1494 if (bio_integrity(bio)) 1495 bio_integrity_clone(clone, bio, GFP_NOIO); 1496 1497 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1498 clone->bi_iter.bi_size = to_bytes(len); 1499 1500 if (bio_integrity(bio)) 1501 bio_integrity_trim(clone, 0, len); 1502 } 1503 1504 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1505 struct dm_target *ti, 1506 unsigned target_bio_nr) 1507 { 1508 struct dm_target_io *tio; 1509 struct bio *clone; 1510 1511 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1512 tio = container_of(clone, struct dm_target_io, clone); 1513 1514 tio->io = ci->io; 1515 tio->ti = ti; 1516 tio->target_bio_nr = target_bio_nr; 1517 1518 return tio; 1519 } 1520 1521 static void __clone_and_map_simple_bio(struct clone_info *ci, 1522 struct dm_target *ti, 1523 unsigned target_bio_nr, unsigned *len) 1524 { 1525 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); 1526 struct bio *clone = &tio->clone; 1527 1528 tio->len_ptr = len; 1529 1530 __bio_clone_fast(clone, ci->bio); 1531 if (len) 1532 bio_setup_sector(clone, ci->sector, *len); 1533 1534 __map_bio(tio); 1535 } 1536 1537 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1538 unsigned num_bios, unsigned *len) 1539 { 1540 unsigned target_bio_nr; 1541 1542 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1543 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1544 } 1545 1546 static int __send_empty_flush(struct clone_info *ci) 1547 { 1548 unsigned target_nr = 0; 1549 struct dm_target *ti; 1550 1551 BUG_ON(bio_has_data(ci->bio)); 1552 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1553 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1554 1555 return 0; 1556 } 1557 1558 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1559 sector_t sector, unsigned *len) 1560 { 1561 struct bio *bio = ci->bio; 1562 struct dm_target_io *tio; 1563 unsigned target_bio_nr; 1564 unsigned num_target_bios = 1; 1565 1566 /* 1567 * Does the target want to receive duplicate copies of the bio? 
1568 */ 1569 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1570 num_target_bios = ti->num_write_bios(ti, bio); 1571 1572 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1573 tio = alloc_tio(ci, ti, target_bio_nr); 1574 tio->len_ptr = len; 1575 clone_bio(tio, bio, sector, *len); 1576 __map_bio(tio); 1577 } 1578 } 1579 1580 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1581 1582 static unsigned get_num_discard_bios(struct dm_target *ti) 1583 { 1584 return ti->num_discard_bios; 1585 } 1586 1587 static unsigned get_num_write_same_bios(struct dm_target *ti) 1588 { 1589 return ti->num_write_same_bios; 1590 } 1591 1592 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1593 1594 static bool is_split_required_for_discard(struct dm_target *ti) 1595 { 1596 return ti->split_discard_bios; 1597 } 1598 1599 static int __send_changing_extent_only(struct clone_info *ci, 1600 get_num_bios_fn get_num_bios, 1601 is_split_required_fn is_split_required) 1602 { 1603 struct dm_target *ti; 1604 unsigned len; 1605 unsigned num_bios; 1606 1607 do { 1608 ti = dm_table_find_target(ci->map, ci->sector); 1609 if (!dm_target_is_valid(ti)) 1610 return -EIO; 1611 1612 /* 1613 * Even though the device advertised support for this type of 1614 * request, that does not mean every target supports it, and 1615 * reconfiguration might also have changed that since the 1616 * check was performed. 1617 */ 1618 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1619 if (!num_bios) 1620 return -EOPNOTSUPP; 1621 1622 if (is_split_required && !is_split_required(ti)) 1623 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1624 else 1625 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); 1626 1627 __send_duplicate_bios(ci, ti, num_bios, &len); 1628 1629 ci->sector += len; 1630 } while (ci->sector_count -= len); 1631 1632 return 0; 1633 } 1634 1635 static int __send_discard(struct clone_info *ci) 1636 { 1637 return __send_changing_extent_only(ci, get_num_discard_bios, 1638 is_split_required_for_discard); 1639 } 1640 1641 static int __send_write_same(struct clone_info *ci) 1642 { 1643 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1644 } 1645 1646 /* 1647 * Select the correct strategy for processing a non-flush bio. 1648 */ 1649 static int __split_and_process_non_flush(struct clone_info *ci) 1650 { 1651 struct bio *bio = ci->bio; 1652 struct dm_target *ti; 1653 unsigned len; 1654 1655 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1656 return __send_discard(ci); 1657 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) 1658 return __send_write_same(ci); 1659 1660 ti = dm_table_find_target(ci->map, ci->sector); 1661 if (!dm_target_is_valid(ti)) 1662 return -EIO; 1663 1664 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); 1665 1666 __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1667 1668 ci->sector += len; 1669 ci->sector_count -= len; 1670 1671 return 0; 1672 } 1673 1674 /* 1675 * Entry point to split a bio into clones and submit them to the targets. 
1676 */ 1677 static void __split_and_process_bio(struct mapped_device *md, 1678 struct dm_table *map, struct bio *bio) 1679 { 1680 struct clone_info ci; 1681 int error = 0; 1682 1683 if (unlikely(!map)) { 1684 bio_io_error(bio); 1685 return; 1686 } 1687 1688 ci.map = map; 1689 ci.md = md; 1690 ci.io = alloc_io(md); 1691 ci.io->error = 0; 1692 atomic_set(&ci.io->io_count, 1); 1693 ci.io->bio = bio; 1694 ci.io->md = md; 1695 spin_lock_init(&ci.io->endio_lock); 1696 ci.sector = bio->bi_iter.bi_sector; 1697 1698 start_io_acct(ci.io); 1699 1700 if (bio->bi_rw & REQ_FLUSH) { 1701 ci.bio = &ci.md->flush_bio; 1702 ci.sector_count = 0; 1703 error = __send_empty_flush(&ci); 1704 /* dec_pending submits any data associated with flush */ 1705 } else { 1706 ci.bio = bio; 1707 ci.sector_count = bio_sectors(bio); 1708 while (ci.sector_count && !error) 1709 error = __split_and_process_non_flush(&ci); 1710 } 1711 1712 /* drop the extra reference count */ 1713 dec_pending(ci.io, error); 1714 } 1715 /*----------------------------------------------------------------- 1716 * CRUD END 1717 *---------------------------------------------------------------*/ 1718 1719 static int dm_merge_bvec(struct request_queue *q, 1720 struct bvec_merge_data *bvm, 1721 struct bio_vec *biovec) 1722 { 1723 struct mapped_device *md = q->queuedata; 1724 struct dm_table *map = dm_get_live_table_fast(md); 1725 struct dm_target *ti; 1726 sector_t max_sectors, max_size = 0; 1727 1728 if (unlikely(!map)) 1729 goto out; 1730 1731 ti = dm_table_find_target(map, bvm->bi_sector); 1732 if (!dm_target_is_valid(ti)) 1733 goto out; 1734 1735 /* 1736 * Find maximum amount of I/O that won't need splitting 1737 */ 1738 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1739 (sector_t) queue_max_sectors(q)); 1740 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1741 1742 /* 1743 * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t 1744 * to the targets' merge function since it holds sectors not bytes). 1745 * Just doing this as an interim fix for stable@ because the more 1746 * comprehensive cleanup of switching to sector_t will impact every 1747 * DM target that implements a ->merge hook. 1748 */ 1749 if (max_size > INT_MAX) 1750 max_size = INT_MAX; 1751 1752 /* 1753 * merge_bvec_fn() returns number of bytes 1754 * it can accept at this offset 1755 * max is precomputed maximal io size 1756 */ 1757 if (max_size && ti->type->merge) 1758 max_size = ti->type->merge(ti, bvm, biovec, (int) max_size); 1759 /* 1760 * If the target doesn't support merge method and some of the devices 1761 * provided their merge_bvec method (we know this by looking for the 1762 * max_hw_sectors that dm_set_device_limits may set), then we can't 1763 * allow bios with multiple vector entries. So always set max_size 1764 * to 0, and the code below allows just one page. 1765 */ 1766 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1767 max_size = 0; 1768 1769 out: 1770 dm_put_live_table_fast(md); 1771 /* 1772 * Always allow an entire first page 1773 */ 1774 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1775 max_size = biovec->bv_len; 1776 1777 return max_size; 1778 } 1779 1780 /* 1781 * The request function that just remaps the bio built up by 1782 * dm_merge_bvec. 
1783 */ 1784 static void dm_make_request(struct request_queue *q, struct bio *bio) 1785 { 1786 int rw = bio_data_dir(bio); 1787 struct mapped_device *md = q->queuedata; 1788 int srcu_idx; 1789 struct dm_table *map; 1790 1791 map = dm_get_live_table(md, &srcu_idx); 1792 1793 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); 1794 1795 /* if we're suspended, we have to queue this io for later */ 1796 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1797 dm_put_live_table(md, srcu_idx); 1798 1799 if (bio_rw(bio) != READA) 1800 queue_io(md, bio); 1801 else 1802 bio_io_error(bio); 1803 return; 1804 } 1805 1806 __split_and_process_bio(md, map, bio); 1807 dm_put_live_table(md, srcu_idx); 1808 return; 1809 } 1810 1811 int dm_request_based(struct mapped_device *md) 1812 { 1813 return blk_queue_stackable(md->queue); 1814 } 1815 1816 static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 1817 { 1818 int r; 1819 1820 if (blk_queue_io_stat(clone->q)) 1821 clone->cmd_flags |= REQ_IO_STAT; 1822 1823 clone->start_time = jiffies; 1824 r = blk_insert_cloned_request(clone->q, clone); 1825 if (r) 1826 /* must complete clone in terms of original request */ 1827 dm_complete_request(rq, r); 1828 } 1829 1830 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1831 void *data) 1832 { 1833 struct dm_rq_target_io *tio = data; 1834 struct dm_rq_clone_bio_info *info = 1835 container_of(bio, struct dm_rq_clone_bio_info, clone); 1836 1837 info->orig = bio_orig; 1838 info->tio = tio; 1839 bio->bi_end_io = end_clone_bio; 1840 1841 return 0; 1842 } 1843 1844 static int setup_clone(struct request *clone, struct request *rq, 1845 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1846 { 1847 int r; 1848 1849 r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, 1850 dm_rq_bio_constructor, tio); 1851 if (r) 1852 return r; 1853 1854 clone->cmd = rq->cmd; 1855 clone->cmd_len = rq->cmd_len; 1856 clone->sense = rq->sense; 1857 clone->end_io = end_clone_request; 1858 clone->end_io_data = tio; 1859 1860 tio->clone = clone; 1861 1862 return 0; 1863 } 1864 1865 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1866 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1867 { 1868 /* 1869 * Do not allocate a clone if tio->clone was already set 1870 * (see: dm_mq_queue_rq). 
1871 */ 1872 bool alloc_clone = !tio->clone; 1873 struct request *clone; 1874 1875 if (alloc_clone) { 1876 clone = alloc_clone_request(md, gfp_mask); 1877 if (!clone) 1878 return NULL; 1879 } else 1880 clone = tio->clone; 1881 1882 blk_rq_init(NULL, clone); 1883 if (setup_clone(clone, rq, tio, gfp_mask)) { 1884 /* -ENOMEM */ 1885 if (alloc_clone) 1886 free_clone_request(md, clone); 1887 return NULL; 1888 } 1889 1890 return clone; 1891 } 1892 1893 static void map_tio_request(struct kthread_work *work); 1894 1895 static void init_tio(struct dm_rq_target_io *tio, struct request *rq, 1896 struct mapped_device *md) 1897 { 1898 tio->md = md; 1899 tio->ti = NULL; 1900 tio->clone = NULL; 1901 tio->orig = rq; 1902 tio->error = 0; 1903 memset(&tio->info, 0, sizeof(tio->info)); 1904 if (md->kworker_task) 1905 init_kthread_work(&tio->work, map_tio_request); 1906 } 1907 1908 static struct dm_rq_target_io *prep_tio(struct request *rq, 1909 struct mapped_device *md, gfp_t gfp_mask) 1910 { 1911 struct dm_rq_target_io *tio; 1912 int srcu_idx; 1913 struct dm_table *table; 1914 1915 tio = alloc_rq_tio(md, gfp_mask); 1916 if (!tio) 1917 return NULL; 1918 1919 init_tio(tio, rq, md); 1920 1921 table = dm_get_live_table(md, &srcu_idx); 1922 if (!dm_table_mq_request_based(table)) { 1923 if (!clone_rq(rq, md, tio, gfp_mask)) { 1924 dm_put_live_table(md, srcu_idx); 1925 free_rq_tio(tio); 1926 return NULL; 1927 } 1928 } 1929 dm_put_live_table(md, srcu_idx); 1930 1931 return tio; 1932 } 1933 1934 /* 1935 * Called with the queue lock held. 1936 */ 1937 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1938 { 1939 struct mapped_device *md = q->queuedata; 1940 struct dm_rq_target_io *tio; 1941 1942 if (unlikely(rq->special)) { 1943 DMWARN("Already has something in rq->special."); 1944 return BLKPREP_KILL; 1945 } 1946 1947 tio = prep_tio(rq, md, GFP_ATOMIC); 1948 if (!tio) 1949 return BLKPREP_DEFER; 1950 1951 rq->special = tio; 1952 rq->cmd_flags |= REQ_DONTPREP; 1953 1954 return BLKPREP_OK; 1955 } 1956 1957 /* 1958 * Returns: 1959 * 0 : the request has been processed 1960 * DM_MAPIO_REQUEUE : the original request needs to be requeued 1961 * < 0 : the request was completed due to failure 1962 */ 1963 static int map_request(struct dm_rq_target_io *tio, struct request *rq, 1964 struct mapped_device *md) 1965 { 1966 int r; 1967 struct dm_target *ti = tio->ti; 1968 struct request *clone = NULL; 1969 1970 if (tio->clone) { 1971 clone = tio->clone; 1972 r = ti->type->map_rq(ti, clone, &tio->info); 1973 } else { 1974 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 1975 if (r < 0) { 1976 /* The target wants to complete the I/O */ 1977 dm_kill_unmapped_request(rq, r); 1978 return r; 1979 } 1980 if (r != DM_MAPIO_REMAPPED) 1981 return r; 1982 if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { 1983 /* -ENOMEM */ 1984 ti->type->release_clone_rq(clone); 1985 return DM_MAPIO_REQUEUE; 1986 } 1987 } 1988 1989 switch (r) { 1990 case DM_MAPIO_SUBMITTED: 1991 /* The target has taken the I/O to submit by itself later */ 1992 break; 1993 case DM_MAPIO_REMAPPED: 1994 /* The target has remapped the I/O so dispatch it */ 1995 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1996 blk_rq_pos(rq)); 1997 dm_dispatch_clone_request(clone, rq); 1998 break; 1999 case DM_MAPIO_REQUEUE: 2000 /* The target wants to requeue the I/O */ 2001 dm_requeue_unmapped_request(clone); 2002 break; 2003 default: 2004 if (r > 0) { 2005 DMWARN("unimplemented target map return value: %d", r); 2006 BUG(); 2007 } 2008 2009 /* The 
target wants to complete the I/O */ 2010 dm_kill_unmapped_request(rq, r); 2011 return r; 2012 } 2013 2014 return 0; 2015 } 2016 2017 static void map_tio_request(struct kthread_work *work) 2018 { 2019 struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 2020 struct request *rq = tio->orig; 2021 struct mapped_device *md = tio->md; 2022 2023 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) 2024 dm_requeue_unmapped_original_request(md, rq); 2025 } 2026 2027 static void dm_start_request(struct mapped_device *md, struct request *orig) 2028 { 2029 if (!orig->q->mq_ops) 2030 blk_start_request(orig); 2031 else 2032 blk_mq_start_request(orig); 2033 atomic_inc(&md->pending[rq_data_dir(orig)]); 2034 2035 if (md->seq_rq_merge_deadline_usecs) { 2036 md->last_rq_pos = rq_end_sector(orig); 2037 md->last_rq_rw = rq_data_dir(orig); 2038 md->last_rq_start_time = ktime_get(); 2039 } 2040 2041 /* 2042 * Hold the md reference here for the in-flight I/O. 2043 * We can't rely on the reference count by device opener, 2044 * because the device may be closed during the request completion 2045 * when all bios are completed. 2046 * See the comment in rq_completed() too. 2047 */ 2048 dm_get(md); 2049 } 2050 2051 #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 2052 2053 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) 2054 { 2055 return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); 2056 } 2057 2058 ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, 2059 const char *buf, size_t count) 2060 { 2061 unsigned deadline; 2062 2063 if (!dm_request_based(md) || md->use_blk_mq) 2064 return count; 2065 2066 if (kstrtouint(buf, 10, &deadline)) 2067 return -EINVAL; 2068 2069 if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) 2070 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; 2071 2072 md->seq_rq_merge_deadline_usecs = deadline; 2073 2074 return count; 2075 } 2076 2077 static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) 2078 { 2079 ktime_t kt_deadline; 2080 2081 if (!md->seq_rq_merge_deadline_usecs) 2082 return false; 2083 2084 kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); 2085 kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); 2086 2087 return !ktime_after(ktime_get(), kt_deadline); 2088 } 2089 2090 /* 2091 * q->request_fn for request-based dm. 2092 * Called with the queue lock held. 2093 */ 2094 static void dm_request_fn(struct request_queue *q) 2095 { 2096 struct mapped_device *md = q->queuedata; 2097 int srcu_idx; 2098 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 2099 struct dm_target *ti; 2100 struct request *rq; 2101 struct dm_rq_target_io *tio; 2102 sector_t pos; 2103 2104 /* 2105 * For suspend, check blk_queue_stopped() and increment 2106 * ->pending within a single queue_lock not to increment the 2107 * number of in-flight I/Os after the queue is stopped in 2108 * dm_suspend(). 
2109 */ 2110 while (!blk_queue_stopped(q)) { 2111 rq = blk_peek_request(q); 2112 if (!rq) 2113 goto out; 2114 2115 /* always use block 0 to find the target for flushes for now */ 2116 pos = 0; 2117 if (!(rq->cmd_flags & REQ_FLUSH)) 2118 pos = blk_rq_pos(rq); 2119 2120 ti = dm_table_find_target(map, pos); 2121 if (!dm_target_is_valid(ti)) { 2122 /* 2123 * Must perform setup, that rq_completed() requires, 2124 * before calling dm_kill_unmapped_request 2125 */ 2126 DMERR_LIMIT("request attempted access beyond the end of device"); 2127 dm_start_request(md, rq); 2128 dm_kill_unmapped_request(rq, -EIO); 2129 continue; 2130 } 2131 2132 if (dm_request_peeked_before_merge_deadline(md) && 2133 md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && 2134 md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) 2135 goto delay_and_out; 2136 2137 if (ti->type->busy && ti->type->busy(ti)) 2138 goto delay_and_out; 2139 2140 dm_start_request(md, rq); 2141 2142 tio = tio_from_request(rq); 2143 /* Establish tio->ti before queuing work (map_tio_request) */ 2144 tio->ti = ti; 2145 queue_kthread_work(&md->kworker, &tio->work); 2146 BUG_ON(!irqs_disabled()); 2147 } 2148 2149 goto out; 2150 2151 delay_and_out: 2152 blk_delay_queue(q, HZ / 100); 2153 out: 2154 dm_put_live_table(md, srcu_idx); 2155 } 2156 2157 static int dm_any_congested(void *congested_data, int bdi_bits) 2158 { 2159 int r = bdi_bits; 2160 struct mapped_device *md = congested_data; 2161 struct dm_table *map; 2162 2163 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2164 map = dm_get_live_table_fast(md); 2165 if (map) { 2166 /* 2167 * Request-based dm cares about only own queue for 2168 * the query about congestion status of request_queue 2169 */ 2170 if (dm_request_based(md)) 2171 r = md->queue->backing_dev_info.state & 2172 bdi_bits; 2173 else 2174 r = dm_table_any_congested(map, bdi_bits); 2175 } 2176 dm_put_live_table_fast(md); 2177 } 2178 2179 return r; 2180 } 2181 2182 /*----------------------------------------------------------------- 2183 * An IDR is used to keep track of allocated minor numbers. 2184 *---------------------------------------------------------------*/ 2185 static void free_minor(int minor) 2186 { 2187 spin_lock(&_minor_lock); 2188 idr_remove(&_minor_idr, minor); 2189 spin_unlock(&_minor_lock); 2190 } 2191 2192 /* 2193 * See if the device with a specific minor # is free. 2194 */ 2195 static int specific_minor(int minor) 2196 { 2197 int r; 2198 2199 if (minor >= (1 << MINORBITS)) 2200 return -EINVAL; 2201 2202 idr_preload(GFP_KERNEL); 2203 spin_lock(&_minor_lock); 2204 2205 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 2206 2207 spin_unlock(&_minor_lock); 2208 idr_preload_end(); 2209 if (r < 0) 2210 return r == -ENOSPC ? -EBUSY : r; 2211 return 0; 2212 } 2213 2214 static int next_free_minor(int *minor) 2215 { 2216 int r; 2217 2218 idr_preload(GFP_KERNEL); 2219 spin_lock(&_minor_lock); 2220 2221 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 2222 2223 spin_unlock(&_minor_lock); 2224 idr_preload_end(); 2225 if (r < 0) 2226 return r; 2227 *minor = r; 2228 return 0; 2229 } 2230 2231 static const struct block_device_operations dm_blk_dops; 2232 2233 static void dm_wq_work(struct work_struct *work); 2234 2235 static void dm_init_md_queue(struct mapped_device *md) 2236 { 2237 /* 2238 * Request-based dm devices cannot be stacked on top of bio-based dm 2239 * devices. The type of this dm device may not have been decided yet. 
2240 * The type is decided at the first table loading time. 2241 * To prevent problematic device stacking, clear the queue flag 2242 * for request stacking support until then. 2243 * 2244 * This queue is new, so no concurrency on the queue_flags. 2245 */ 2246 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2247 } 2248 2249 static void dm_init_old_md_queue(struct mapped_device *md) 2250 { 2251 md->use_blk_mq = false; 2252 dm_init_md_queue(md); 2253 2254 /* 2255 * Initialize aspects of queue that aren't relevant for blk-mq 2256 */ 2257 md->queue->queuedata = md; 2258 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2259 md->queue->backing_dev_info.congested_data = md; 2260 2261 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2262 } 2263 2264 /* 2265 * Allocate and initialise a blank device with a given minor. 2266 */ 2267 static struct mapped_device *alloc_dev(int minor) 2268 { 2269 int r; 2270 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 2271 void *old_md; 2272 2273 if (!md) { 2274 DMWARN("unable to allocate device, out of memory."); 2275 return NULL; 2276 } 2277 2278 if (!try_module_get(THIS_MODULE)) 2279 goto bad_module_get; 2280 2281 /* get a minor number for the dev */ 2282 if (minor == DM_ANY_MINOR) 2283 r = next_free_minor(&minor); 2284 else 2285 r = specific_minor(minor); 2286 if (r < 0) 2287 goto bad_minor; 2288 2289 r = init_srcu_struct(&md->io_barrier); 2290 if (r < 0) 2291 goto bad_io_barrier; 2292 2293 md->use_blk_mq = use_blk_mq; 2294 md->type = DM_TYPE_NONE; 2295 mutex_init(&md->suspend_lock); 2296 mutex_init(&md->type_lock); 2297 mutex_init(&md->table_devices_lock); 2298 spin_lock_init(&md->deferred_lock); 2299 atomic_set(&md->holders, 1); 2300 atomic_set(&md->open_count, 0); 2301 atomic_set(&md->event_nr, 0); 2302 atomic_set(&md->uevent_seq, 0); 2303 INIT_LIST_HEAD(&md->uevent_list); 2304 INIT_LIST_HEAD(&md->table_devices); 2305 spin_lock_init(&md->uevent_lock); 2306 2307 md->queue = blk_alloc_queue(GFP_KERNEL); 2308 if (!md->queue) 2309 goto bad_queue; 2310 2311 dm_init_md_queue(md); 2312 2313 md->disk = alloc_disk(1); 2314 if (!md->disk) 2315 goto bad_disk; 2316 2317 atomic_set(&md->pending[0], 0); 2318 atomic_set(&md->pending[1], 0); 2319 init_waitqueue_head(&md->wait); 2320 INIT_WORK(&md->work, dm_wq_work); 2321 init_waitqueue_head(&md->eventq); 2322 init_completion(&md->kobj_holder.completion); 2323 md->kworker_task = NULL; 2324 2325 md->disk->major = _major; 2326 md->disk->first_minor = minor; 2327 md->disk->fops = &dm_blk_dops; 2328 md->disk->queue = md->queue; 2329 md->disk->private_data = md; 2330 sprintf(md->disk->disk_name, "dm-%d", minor); 2331 add_disk(md->disk); 2332 format_dev_t(md->name, MKDEV(_major, minor)); 2333 2334 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2335 if (!md->wq) 2336 goto bad_thread; 2337 2338 md->bdev = bdget_disk(md->disk, 0); 2339 if (!md->bdev) 2340 goto bad_bdev; 2341 2342 bio_init(&md->flush_bio); 2343 md->flush_bio.bi_bdev = md->bdev; 2344 md->flush_bio.bi_rw = WRITE_FLUSH; 2345 2346 dm_stats_init(&md->stats); 2347 2348 /* Populate the mapping, nobody knows we exist yet */ 2349 spin_lock(&_minor_lock); 2350 old_md = idr_replace(&_minor_idr, md, minor); 2351 spin_unlock(&_minor_lock); 2352 2353 BUG_ON(old_md != MINOR_ALLOCED); 2354 2355 return md; 2356 2357 bad_bdev: 2358 destroy_workqueue(md->wq); 2359 bad_thread: 2360 del_gendisk(md->disk); 2361 put_disk(md->disk); 2362 bad_disk: 2363 blk_cleanup_queue(md->queue); 2364 bad_queue: 2365 cleanup_srcu_struct(&md->io_barrier); 2366 
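	/* each label below also runs for fall-through from the cleanup above it */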
bad_io_barrier: 2367 free_minor(minor); 2368 bad_minor: 2369 module_put(THIS_MODULE); 2370 bad_module_get: 2371 kfree(md); 2372 return NULL; 2373 } 2374 2375 static void unlock_fs(struct mapped_device *md); 2376 2377 static void free_dev(struct mapped_device *md) 2378 { 2379 int minor = MINOR(disk_devt(md->disk)); 2380 2381 unlock_fs(md); 2382 destroy_workqueue(md->wq); 2383 2384 if (md->kworker_task) 2385 kthread_stop(md->kworker_task); 2386 if (md->io_pool) 2387 mempool_destroy(md->io_pool); 2388 if (md->rq_pool) 2389 mempool_destroy(md->rq_pool); 2390 if (md->bs) 2391 bioset_free(md->bs); 2392 2393 cleanup_srcu_struct(&md->io_barrier); 2394 free_table_devices(&md->table_devices); 2395 dm_stats_cleanup(&md->stats); 2396 2397 spin_lock(&_minor_lock); 2398 md->disk->private_data = NULL; 2399 spin_unlock(&_minor_lock); 2400 if (blk_get_integrity(md->disk)) 2401 blk_integrity_unregister(md->disk); 2402 del_gendisk(md->disk); 2403 put_disk(md->disk); 2404 blk_cleanup_queue(md->queue); 2405 if (md->use_blk_mq) 2406 blk_mq_free_tag_set(&md->tag_set); 2407 bdput(md->bdev); 2408 free_minor(minor); 2409 2410 module_put(THIS_MODULE); 2411 kfree(md); 2412 } 2413 2414 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2415 { 2416 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2417 2418 if (md->bs) { 2419 /* The md already has necessary mempools. */ 2420 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2421 /* 2422 * Reload bioset because front_pad may have changed 2423 * because a different table was loaded. 2424 */ 2425 bioset_free(md->bs); 2426 md->bs = p->bs; 2427 p->bs = NULL; 2428 } 2429 /* 2430 * There's no need to reload with request-based dm 2431 * because the size of front_pad doesn't change. 2432 * Note for future: If you are to reload bioset, 2433 * prep-ed requests in the queue may refer 2434 * to bio from the old bioset, so you must walk 2435 * through the queue to unprep. 2436 */ 2437 goto out; 2438 } 2439 2440 BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); 2441 2442 md->io_pool = p->io_pool; 2443 p->io_pool = NULL; 2444 md->rq_pool = p->rq_pool; 2445 p->rq_pool = NULL; 2446 md->bs = p->bs; 2447 p->bs = NULL; 2448 2449 out: 2450 /* mempool bind completed, no longer need any mempools in the table */ 2451 dm_table_free_md_mempools(t); 2452 } 2453 2454 /* 2455 * Bind a table to the device. 2456 */ 2457 static void event_callback(void *context) 2458 { 2459 unsigned long flags; 2460 LIST_HEAD(uevents); 2461 struct mapped_device *md = (struct mapped_device *) context; 2462 2463 spin_lock_irqsave(&md->uevent_lock, flags); 2464 list_splice_init(&md->uevent_list, &uevents); 2465 spin_unlock_irqrestore(&md->uevent_lock, flags); 2466 2467 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2468 2469 atomic_inc(&md->event_nr); 2470 wake_up(&md->eventq); 2471 } 2472 2473 /* 2474 * Protected by md->suspend_lock obtained by dm_swap_table(). 2475 */ 2476 static void __set_size(struct mapped_device *md, sector_t size) 2477 { 2478 set_capacity(md->disk, size); 2479 2480 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2481 } 2482 2483 /* 2484 * Return 1 if the queue has a compulsory merge_bvec_fn function. 2485 * 2486 * If this function returns 0, then the device is either a non-dm 2487 * device without a merge_bvec_fn, or it is a dm device that is 2488 * able to split any bios it receives that are too big. 
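 * (Such dm devices have DMF_MERGE_IS_OPTIONAL set; see __bind() below.)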
2489 */ 2490 int dm_queue_merge_is_compulsory(struct request_queue *q) 2491 { 2492 struct mapped_device *dev_md; 2493 2494 if (!q->merge_bvec_fn) 2495 return 0; 2496 2497 if (q->make_request_fn == dm_make_request) { 2498 dev_md = q->queuedata; 2499 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2500 return 0; 2501 } 2502 2503 return 1; 2504 } 2505 2506 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2507 struct dm_dev *dev, sector_t start, 2508 sector_t len, void *data) 2509 { 2510 struct block_device *bdev = dev->bdev; 2511 struct request_queue *q = bdev_get_queue(bdev); 2512 2513 return dm_queue_merge_is_compulsory(q); 2514 } 2515 2516 /* 2517 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2518 * on the properties of the underlying devices. 2519 */ 2520 static int dm_table_merge_is_optional(struct dm_table *table) 2521 { 2522 unsigned i = 0; 2523 struct dm_target *ti; 2524 2525 while (i < dm_table_get_num_targets(table)) { 2526 ti = dm_table_get_target(table, i++); 2527 2528 if (ti->type->iterate_devices && 2529 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2530 return 0; 2531 } 2532 2533 return 1; 2534 } 2535 2536 /* 2537 * Returns old map, which caller must destroy. 2538 */ 2539 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2540 struct queue_limits *limits) 2541 { 2542 struct dm_table *old_map; 2543 struct request_queue *q = md->queue; 2544 sector_t size; 2545 int merge_is_optional; 2546 2547 size = dm_table_get_size(t); 2548 2549 /* 2550 * Wipe any geometry if the size of the table changed. 2551 */ 2552 if (size != dm_get_size(md)) 2553 memset(&md->geometry, 0, sizeof(md->geometry)); 2554 2555 __set_size(md, size); 2556 2557 dm_table_event_callback(t, event_callback, md); 2558 2559 /* 2560 * The queue hasn't been stopped yet, if the old table type wasn't 2561 * for request-based during suspension. So stop it to prevent 2562 * I/O mapping before resume. 2563 * This must be done before setting the queue restrictions, 2564 * because request-based dm may be run just after the setting. 2565 */ 2566 if (dm_table_request_based(t)) 2567 stop_queue(q); 2568 2569 __bind_mempools(md, t); 2570 2571 merge_is_optional = dm_table_merge_is_optional(t); 2572 2573 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2574 rcu_assign_pointer(md->map, t); 2575 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2576 2577 dm_table_set_restrictions(t, q, limits); 2578 if (merge_is_optional) 2579 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2580 else 2581 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2582 if (old_map) 2583 dm_sync_table(md); 2584 2585 return old_map; 2586 } 2587 2588 /* 2589 * Returns unbound table for the caller to free. 2590 */ 2591 static struct dm_table *__unbind(struct mapped_device *md) 2592 { 2593 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2594 2595 if (!map) 2596 return NULL; 2597 2598 dm_table_event_callback(map, NULL, NULL); 2599 RCU_INIT_POINTER(md->map, NULL); 2600 dm_sync_table(md); 2601 2602 return map; 2603 } 2604 2605 /* 2606 * Constructor for a new device. 2607 */ 2608 int dm_create(int minor, struct mapped_device **result) 2609 { 2610 struct mapped_device *md; 2611 2612 md = alloc_dev(minor); 2613 if (!md) 2614 return -ENXIO; 2615 2616 dm_sysfs_init(md); 2617 2618 *result = md; 2619 return 0; 2620 } 2621 2622 /* 2623 * Functions to manage md->type. 2624 * All are required to hold md->type_lock. 
2625 */ 2626 void dm_lock_md_type(struct mapped_device *md) 2627 { 2628 mutex_lock(&md->type_lock); 2629 } 2630 2631 void dm_unlock_md_type(struct mapped_device *md) 2632 { 2633 mutex_unlock(&md->type_lock); 2634 } 2635 2636 void dm_set_md_type(struct mapped_device *md, unsigned type) 2637 { 2638 BUG_ON(!mutex_is_locked(&md->type_lock)); 2639 md->type = type; 2640 } 2641 2642 unsigned dm_get_md_type(struct mapped_device *md) 2643 { 2644 BUG_ON(!mutex_is_locked(&md->type_lock)); 2645 return md->type; 2646 } 2647 2648 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2649 { 2650 return md->immutable_target_type; 2651 } 2652 2653 /* 2654 * The queue_limits are only valid as long as you have a reference 2655 * count on 'md'. 2656 */ 2657 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2658 { 2659 BUG_ON(!atomic_read(&md->holders)); 2660 return &md->queue->limits; 2661 } 2662 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2663 2664 static void init_rq_based_worker_thread(struct mapped_device *md) 2665 { 2666 /* Initialize the request-based DM worker thread */ 2667 init_kthread_worker(&md->kworker); 2668 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, 2669 "kdmwork-%s", dm_device_name(md)); 2670 } 2671 2672 /* 2673 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2674 */ 2675 static int dm_init_request_based_queue(struct mapped_device *md) 2676 { 2677 struct request_queue *q = NULL; 2678 2679 /* Fully initialize the queue */ 2680 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2681 if (!q) 2682 return -EINVAL; 2683 2684 /* disable dm_request_fn's merge heuristic by default */ 2685 md->seq_rq_merge_deadline_usecs = 0; 2686 2687 md->queue = q; 2688 dm_init_old_md_queue(md); 2689 blk_queue_softirq_done(md->queue, dm_softirq_done); 2690 blk_queue_prep_rq(md->queue, dm_prep_fn); 2691 2692 init_rq_based_worker_thread(md); 2693 2694 elv_register_queue(md->queue); 2695 2696 return 0; 2697 } 2698 2699 static int dm_mq_init_request(void *data, struct request *rq, 2700 unsigned int hctx_idx, unsigned int request_idx, 2701 unsigned int numa_node) 2702 { 2703 struct mapped_device *md = data; 2704 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 2705 2706 /* 2707 * Must initialize md member of tio, otherwise it won't 2708 * be available in dm_mq_queue_rq. 
2709 */ 2710 tio->md = md; 2711 2712 return 0; 2713 } 2714 2715 static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 2716 const struct blk_mq_queue_data *bd) 2717 { 2718 struct request *rq = bd->rq; 2719 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 2720 struct mapped_device *md = tio->md; 2721 int srcu_idx; 2722 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 2723 struct dm_target *ti; 2724 sector_t pos; 2725 2726 /* always use block 0 to find the target for flushes for now */ 2727 pos = 0; 2728 if (!(rq->cmd_flags & REQ_FLUSH)) 2729 pos = blk_rq_pos(rq); 2730 2731 ti = dm_table_find_target(map, pos); 2732 if (!dm_target_is_valid(ti)) { 2733 dm_put_live_table(md, srcu_idx); 2734 DMERR_LIMIT("request attempted access beyond the end of device"); 2735 /* 2736 * Must perform setup, that rq_completed() requires, 2737 * before returning BLK_MQ_RQ_QUEUE_ERROR 2738 */ 2739 dm_start_request(md, rq); 2740 return BLK_MQ_RQ_QUEUE_ERROR; 2741 } 2742 dm_put_live_table(md, srcu_idx); 2743 2744 if (ti->type->busy && ti->type->busy(ti)) 2745 return BLK_MQ_RQ_QUEUE_BUSY; 2746 2747 dm_start_request(md, rq); 2748 2749 /* Init tio using md established in .init_request */ 2750 init_tio(tio, rq, md); 2751 2752 /* 2753 * Establish tio->ti before queuing work (map_tio_request) 2754 * or making direct call to map_request(). 2755 */ 2756 tio->ti = ti; 2757 2758 /* Clone the request if underlying devices aren't blk-mq */ 2759 if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { 2760 /* clone request is allocated at the end of the pdu */ 2761 tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); 2762 (void) clone_rq(rq, md, tio, GFP_ATOMIC); 2763 queue_kthread_work(&md->kworker, &tio->work); 2764 } else { 2765 /* Direct call is fine since .queue_rq allows allocations */ 2766 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { 2767 /* Undo dm_start_request() before requeuing */ 2768 rq_completed(md, rq_data_dir(rq), false); 2769 return BLK_MQ_RQ_QUEUE_BUSY; 2770 } 2771 } 2772 2773 return BLK_MQ_RQ_QUEUE_OK; 2774 } 2775 2776 static struct blk_mq_ops dm_mq_ops = { 2777 .queue_rq = dm_mq_queue_rq, 2778 .map_queue = blk_mq_map_queue, 2779 .complete = dm_softirq_done, 2780 .init_request = dm_mq_init_request, 2781 }; 2782 2783 static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) 2784 { 2785 unsigned md_type = dm_get_md_type(md); 2786 struct request_queue *q; 2787 int err; 2788 2789 memset(&md->tag_set, 0, sizeof(md->tag_set)); 2790 md->tag_set.ops = &dm_mq_ops; 2791 md->tag_set.queue_depth = BLKDEV_MAX_RQ; 2792 md->tag_set.numa_node = NUMA_NO_NODE; 2793 md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 2794 md->tag_set.nr_hw_queues = 1; 2795 if (md_type == DM_TYPE_REQUEST_BASED) { 2796 /* make the memory for non-blk-mq clone part of the pdu */ 2797 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); 2798 } else 2799 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); 2800 md->tag_set.driver_data = md; 2801 2802 err = blk_mq_alloc_tag_set(&md->tag_set); 2803 if (err) 2804 return err; 2805 2806 q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); 2807 if (IS_ERR(q)) { 2808 err = PTR_ERR(q); 2809 goto out_tag_set; 2810 } 2811 md->queue = q; 2812 dm_init_md_queue(md); 2813 2814 /* backfill 'mq' sysfs registration normally done in blk_register_queue */ 2815 blk_mq_register_disk(md->disk); 2816 2817 if (md_type == DM_TYPE_REQUEST_BASED) 2818 init_rq_based_worker_thread(md); 2819 2820 return 0; 2821 2822 out_tag_set: 2823 
blk_mq_free_tag_set(&md->tag_set); 2824 return err; 2825 } 2826 2827 static unsigned filter_md_type(unsigned type, struct mapped_device *md) 2828 { 2829 if (type == DM_TYPE_BIO_BASED) 2830 return type; 2831 2832 return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED; 2833 } 2834 2835 /* 2836 * Setup the DM device's queue based on md's type 2837 */ 2838 int dm_setup_md_queue(struct mapped_device *md) 2839 { 2840 int r; 2841 unsigned md_type = filter_md_type(dm_get_md_type(md), md); 2842 2843 switch (md_type) { 2844 case DM_TYPE_REQUEST_BASED: 2845 r = dm_init_request_based_queue(md); 2846 if (r) { 2847 DMWARN("Cannot initialize queue for request-based mapped device"); 2848 return r; 2849 } 2850 break; 2851 case DM_TYPE_MQ_REQUEST_BASED: 2852 r = dm_init_request_based_blk_mq_queue(md); 2853 if (r) { 2854 DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); 2855 return r; 2856 } 2857 break; 2858 case DM_TYPE_BIO_BASED: 2859 dm_init_old_md_queue(md); 2860 blk_queue_make_request(md->queue, dm_make_request); 2861 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 2862 break; 2863 } 2864 2865 return 0; 2866 } 2867 2868 struct mapped_device *dm_get_md(dev_t dev) 2869 { 2870 struct mapped_device *md; 2871 unsigned minor = MINOR(dev); 2872 2873 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2874 return NULL; 2875 2876 spin_lock(&_minor_lock); 2877 2878 md = idr_find(&_minor_idr, minor); 2879 if (md) { 2880 if ((md == MINOR_ALLOCED || 2881 (MINOR(disk_devt(dm_disk(md))) != minor) || 2882 dm_deleting_md(md) || 2883 test_bit(DMF_FREEING, &md->flags))) { 2884 md = NULL; 2885 goto out; 2886 } 2887 dm_get(md); 2888 } 2889 2890 out: 2891 spin_unlock(&_minor_lock); 2892 2893 return md; 2894 } 2895 EXPORT_SYMBOL_GPL(dm_get_md); 2896 2897 void *dm_get_mdptr(struct mapped_device *md) 2898 { 2899 return md->interface_ptr; 2900 } 2901 2902 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2903 { 2904 md->interface_ptr = ptr; 2905 } 2906 2907 void dm_get(struct mapped_device *md) 2908 { 2909 atomic_inc(&md->holders); 2910 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2911 } 2912 2913 int dm_hold(struct mapped_device *md) 2914 { 2915 spin_lock(&_minor_lock); 2916 if (test_bit(DMF_FREEING, &md->flags)) { 2917 spin_unlock(&_minor_lock); 2918 return -EBUSY; 2919 } 2920 dm_get(md); 2921 spin_unlock(&_minor_lock); 2922 return 0; 2923 } 2924 EXPORT_SYMBOL_GPL(dm_hold); 2925 2926 const char *dm_device_name(struct mapped_device *md) 2927 { 2928 return md->name; 2929 } 2930 EXPORT_SYMBOL_GPL(dm_device_name); 2931 2932 static void __dm_destroy(struct mapped_device *md, bool wait) 2933 { 2934 struct dm_table *map; 2935 int srcu_idx; 2936 2937 might_sleep(); 2938 2939 map = dm_get_live_table(md, &srcu_idx); 2940 2941 spin_lock(&_minor_lock); 2942 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2943 set_bit(DMF_FREEING, &md->flags); 2944 spin_unlock(&_minor_lock); 2945 2946 if (dm_request_based(md) && md->kworker_task) 2947 flush_kthread_worker(&md->kworker); 2948 2949 /* 2950 * Take suspend_lock so that presuspend and postsuspend methods 2951 * do not race with internal suspend. 
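 * (__dm_internal_suspend() runs those target hooks under the same lock.)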
2952 */ 2953 mutex_lock(&md->suspend_lock); 2954 if (!dm_suspended_md(md)) { 2955 dm_table_presuspend_targets(map); 2956 dm_table_postsuspend_targets(map); 2957 } 2958 mutex_unlock(&md->suspend_lock); 2959 2960 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2961 dm_put_live_table(md, srcu_idx); 2962 2963 /* 2964 * Rare, but there may be I/O requests still going to complete, 2965 * for example. Wait for all references to disappear. 2966 * No one should increment the reference count of the mapped_device, 2967 * after the mapped_device state becomes DMF_FREEING. 2968 */ 2969 if (wait) 2970 while (atomic_read(&md->holders)) 2971 msleep(1); 2972 else if (atomic_read(&md->holders)) 2973 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2974 dm_device_name(md), atomic_read(&md->holders)); 2975 2976 dm_sysfs_exit(md); 2977 dm_table_destroy(__unbind(md)); 2978 free_dev(md); 2979 } 2980 2981 void dm_destroy(struct mapped_device *md) 2982 { 2983 __dm_destroy(md, true); 2984 } 2985 2986 void dm_destroy_immediate(struct mapped_device *md) 2987 { 2988 __dm_destroy(md, false); 2989 } 2990 2991 void dm_put(struct mapped_device *md) 2992 { 2993 atomic_dec(&md->holders); 2994 } 2995 EXPORT_SYMBOL_GPL(dm_put); 2996 2997 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2998 { 2999 int r = 0; 3000 DECLARE_WAITQUEUE(wait, current); 3001 3002 add_wait_queue(&md->wait, &wait); 3003 3004 while (1) { 3005 set_current_state(interruptible); 3006 3007 if (!md_in_flight(md)) 3008 break; 3009 3010 if (interruptible == TASK_INTERRUPTIBLE && 3011 signal_pending(current)) { 3012 r = -EINTR; 3013 break; 3014 } 3015 3016 io_schedule(); 3017 } 3018 set_current_state(TASK_RUNNING); 3019 3020 remove_wait_queue(&md->wait, &wait); 3021 3022 return r; 3023 } 3024 3025 /* 3026 * Process the deferred bios 3027 */ 3028 static void dm_wq_work(struct work_struct *work) 3029 { 3030 struct mapped_device *md = container_of(work, struct mapped_device, 3031 work); 3032 struct bio *c; 3033 int srcu_idx; 3034 struct dm_table *map; 3035 3036 map = dm_get_live_table(md, &srcu_idx); 3037 3038 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 3039 spin_lock_irq(&md->deferred_lock); 3040 c = bio_list_pop(&md->deferred); 3041 spin_unlock_irq(&md->deferred_lock); 3042 3043 if (!c) 3044 break; 3045 3046 if (dm_request_based(md)) 3047 generic_make_request(c); 3048 else 3049 __split_and_process_bio(md, map, c); 3050 } 3051 3052 dm_put_live_table(md, srcu_idx); 3053 } 3054 3055 static void dm_queue_flush(struct mapped_device *md) 3056 { 3057 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3058 smp_mb__after_atomic(); 3059 queue_work(md->wq, &md->work); 3060 } 3061 3062 /* 3063 * Swap in a new table, returning the old one for the caller to destroy. 3064 */ 3065 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 3066 { 3067 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 3068 struct queue_limits limits; 3069 int r; 3070 3071 mutex_lock(&md->suspend_lock); 3072 3073 /* device must be suspended */ 3074 if (!dm_suspended_md(md)) 3075 goto out; 3076 3077 /* 3078 * If the new table has no data devices, retain the existing limits. 3079 * This helps multipath with queue_if_no_path if all paths disappear, 3080 * then new I/O is queued based on these limits, and then some paths 3081 * reappear. 
3082 */ 3083 if (dm_table_has_no_data_devices(table)) { 3084 live_map = dm_get_live_table_fast(md); 3085 if (live_map) 3086 limits = md->queue->limits; 3087 dm_put_live_table_fast(md); 3088 } 3089 3090 if (!live_map) { 3091 r = dm_calculate_queue_limits(table, &limits); 3092 if (r) { 3093 map = ERR_PTR(r); 3094 goto out; 3095 } 3096 } 3097 3098 map = __bind(md, table, &limits); 3099 3100 out: 3101 mutex_unlock(&md->suspend_lock); 3102 return map; 3103 } 3104 3105 /* 3106 * Functions to lock and unlock any filesystem running on the 3107 * device. 3108 */ 3109 static int lock_fs(struct mapped_device *md) 3110 { 3111 int r; 3112 3113 WARN_ON(md->frozen_sb); 3114 3115 md->frozen_sb = freeze_bdev(md->bdev); 3116 if (IS_ERR(md->frozen_sb)) { 3117 r = PTR_ERR(md->frozen_sb); 3118 md->frozen_sb = NULL; 3119 return r; 3120 } 3121 3122 set_bit(DMF_FROZEN, &md->flags); 3123 3124 return 0; 3125 } 3126 3127 static void unlock_fs(struct mapped_device *md) 3128 { 3129 if (!test_bit(DMF_FROZEN, &md->flags)) 3130 return; 3131 3132 thaw_bdev(md->bdev, md->frozen_sb); 3133 md->frozen_sb = NULL; 3134 clear_bit(DMF_FROZEN, &md->flags); 3135 } 3136 3137 /* 3138 * If __dm_suspend returns 0, the device is completely quiescent 3139 * now. There is no request-processing activity. All new requests 3140 * are being added to md->deferred list. 3141 * 3142 * Caller must hold md->suspend_lock 3143 */ 3144 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 3145 unsigned suspend_flags, int interruptible) 3146 { 3147 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 3148 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 3149 int r; 3150 3151 /* 3152 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 3153 * This flag is cleared before dm_suspend returns. 3154 */ 3155 if (noflush) 3156 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 3157 3158 /* 3159 * This gets reverted if there's an error later and the targets 3160 * provide the .presuspend_undo hook. 3161 */ 3162 dm_table_presuspend_targets(map); 3163 3164 /* 3165 * Flush I/O to the device. 3166 * Any I/O submitted after lock_fs() may not be flushed. 3167 * noflush takes precedence over do_lockfs. 3168 * (lock_fs() flushes I/Os and waits for them to complete.) 3169 */ 3170 if (!noflush && do_lockfs) { 3171 r = lock_fs(md); 3172 if (r) { 3173 dm_table_presuspend_undo_targets(map); 3174 return r; 3175 } 3176 } 3177 3178 /* 3179 * Here we must make sure that no processes are submitting requests 3180 * to target drivers i.e. no one may be executing 3181 * __split_and_process_bio. This is called from dm_request and 3182 * dm_wq_work. 3183 * 3184 * To get all processes out of __split_and_process_bio in dm_request, 3185 * we take the write lock. To prevent any process from reentering 3186 * __split_and_process_bio from dm_request and quiesce the thread 3187 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call 3188 * flush_workqueue(md->wq). 3189 */ 3190 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3191 if (map) 3192 synchronize_srcu(&md->io_barrier); 3193 3194 /* 3195 * Stop md->queue before flushing md->wq in case request-based 3196 * dm defers requests to md->wq from md->queue. 3197 */ 3198 if (dm_request_based(md)) { 3199 stop_queue(md->queue); 3200 if (md->kworker_task) 3201 flush_kthread_worker(&md->kworker); 3202 } 3203 3204 flush_workqueue(md->wq); 3205 3206 /* 3207 * At this point no more requests are entering target request routines. 3208 * We call dm_wait_for_completion to wait for all existing requests 3209 * to finish. 
3210 */ 3211 r = dm_wait_for_completion(md, interruptible); 3212 3213 if (noflush) 3214 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 3215 if (map) 3216 synchronize_srcu(&md->io_barrier); 3217 3218 /* were we interrupted ? */ 3219 if (r < 0) { 3220 dm_queue_flush(md); 3221 3222 if (dm_request_based(md)) 3223 start_queue(md->queue); 3224 3225 unlock_fs(md); 3226 dm_table_presuspend_undo_targets(map); 3227 /* pushback list is already flushed, so skip flush */ 3228 } 3229 3230 return r; 3231 } 3232 3233 /* 3234 * We need to be able to change a mapping table under a mounted 3235 * filesystem. For example we might want to move some data in 3236 * the background. Before the table can be swapped with 3237 * dm_bind_table, dm_suspend must be called to flush any in 3238 * flight bios and ensure that any further io gets deferred. 3239 */ 3240 /* 3241 * Suspend mechanism in request-based dm. 3242 * 3243 * 1. Flush all I/Os by lock_fs() if needed. 3244 * 2. Stop dispatching any I/O by stopping the request_queue. 3245 * 3. Wait for all in-flight I/Os to be completed or requeued. 3246 * 3247 * To abort suspend, start the request_queue. 3248 */ 3249 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 3250 { 3251 struct dm_table *map = NULL; 3252 int r = 0; 3253 3254 retry: 3255 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 3256 3257 if (dm_suspended_md(md)) { 3258 r = -EINVAL; 3259 goto out_unlock; 3260 } 3261 3262 if (dm_suspended_internally_md(md)) { 3263 /* already internally suspended, wait for internal resume */ 3264 mutex_unlock(&md->suspend_lock); 3265 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 3266 if (r) 3267 return r; 3268 goto retry; 3269 } 3270 3271 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3272 3273 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); 3274 if (r) 3275 goto out_unlock; 3276 3277 set_bit(DMF_SUSPENDED, &md->flags); 3278 3279 dm_table_postsuspend_targets(map); 3280 3281 out_unlock: 3282 mutex_unlock(&md->suspend_lock); 3283 return r; 3284 } 3285 3286 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 3287 { 3288 if (map) { 3289 int r = dm_table_resume_targets(map); 3290 if (r) 3291 return r; 3292 } 3293 3294 dm_queue_flush(md); 3295 3296 /* 3297 * Flushing deferred I/Os must be done after targets are resumed 3298 * so that mapping of targets can work correctly. 3299 * Request-based dm is queueing the deferred I/Os in its request_queue. 
3300 */ 3301 if (dm_request_based(md)) 3302 start_queue(md->queue); 3303 3304 unlock_fs(md); 3305 3306 return 0; 3307 } 3308 3309 int dm_resume(struct mapped_device *md) 3310 { 3311 int r = -EINVAL; 3312 struct dm_table *map = NULL; 3313 3314 retry: 3315 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 3316 3317 if (!dm_suspended_md(md)) 3318 goto out; 3319 3320 if (dm_suspended_internally_md(md)) { 3321 /* already internally suspended, wait for internal resume */ 3322 mutex_unlock(&md->suspend_lock); 3323 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 3324 if (r) 3325 return r; 3326 goto retry; 3327 } 3328 3329 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3330 if (!map || !dm_table_get_size(map)) 3331 goto out; 3332 3333 r = __dm_resume(md, map); 3334 if (r) 3335 goto out; 3336 3337 clear_bit(DMF_SUSPENDED, &md->flags); 3338 3339 r = 0; 3340 out: 3341 mutex_unlock(&md->suspend_lock); 3342 3343 return r; 3344 } 3345 3346 /* 3347 * Internal suspend/resume works like userspace-driven suspend. It waits 3348 * until all bios finish and prevents issuing new bios to the target drivers. 3349 * It may be used only from the kernel. 3350 */ 3351 3352 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 3353 { 3354 struct dm_table *map = NULL; 3355 3356 if (md->internal_suspend_count++) 3357 return; /* nested internal suspend */ 3358 3359 if (dm_suspended_md(md)) { 3360 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3361 return; /* nest suspend */ 3362 } 3363 3364 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3365 3366 /* 3367 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 3368 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 3369 * would require changing .presuspend to return an error -- avoid this 3370 * until there is a need for more elaborate variants of internal suspend. 3371 */ 3372 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); 3373 3374 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3375 3376 dm_table_postsuspend_targets(map); 3377 } 3378 3379 static void __dm_internal_resume(struct mapped_device *md) 3380 { 3381 BUG_ON(!md->internal_suspend_count); 3382 3383 if (--md->internal_suspend_count) 3384 return; /* resume from nested internal suspend */ 3385 3386 if (dm_suspended_md(md)) 3387 goto done; /* resume from nested suspend */ 3388 3389 /* 3390 * NOTE: existing callers don't need to call dm_table_resume_targets 3391 * (which may fail -- so best to avoid it for now by passing NULL map) 3392 */ 3393 (void) __dm_resume(md, NULL); 3394 3395 done: 3396 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3397 smp_mb__after_atomic(); 3398 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 3399 } 3400 3401 void dm_internal_suspend_noflush(struct mapped_device *md) 3402 { 3403 mutex_lock(&md->suspend_lock); 3404 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 3405 mutex_unlock(&md->suspend_lock); 3406 } 3407 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 3408 3409 void dm_internal_resume(struct mapped_device *md) 3410 { 3411 mutex_lock(&md->suspend_lock); 3412 __dm_internal_resume(md); 3413 mutex_unlock(&md->suspend_lock); 3414 } 3415 EXPORT_SYMBOL_GPL(dm_internal_resume); 3416 3417 /* 3418 * Fast variants of internal suspend/resume hold md->suspend_lock, 3419 * which prevents interaction with userspace-driven suspend. 
3420 */ 3421 3422 void dm_internal_suspend_fast(struct mapped_device *md) 3423 { 3424 mutex_lock(&md->suspend_lock); 3425 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3426 return; 3427 3428 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3429 synchronize_srcu(&md->io_barrier); 3430 flush_workqueue(md->wq); 3431 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3432 } 3433 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 3434 3435 void dm_internal_resume_fast(struct mapped_device *md) 3436 { 3437 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3438 goto done; 3439 3440 dm_queue_flush(md); 3441 3442 done: 3443 mutex_unlock(&md->suspend_lock); 3444 } 3445 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 3446 3447 /*----------------------------------------------------------------- 3448 * Event notification. 3449 *---------------------------------------------------------------*/ 3450 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 3451 unsigned cookie) 3452 { 3453 char udev_cookie[DM_COOKIE_LENGTH]; 3454 char *envp[] = { udev_cookie, NULL }; 3455 3456 if (!cookie) 3457 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 3458 else { 3459 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 3460 DM_COOKIE_ENV_VAR_NAME, cookie); 3461 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 3462 action, envp); 3463 } 3464 } 3465 3466 uint32_t dm_next_uevent_seq(struct mapped_device *md) 3467 { 3468 return atomic_add_return(1, &md->uevent_seq); 3469 } 3470 3471 uint32_t dm_get_event_nr(struct mapped_device *md) 3472 { 3473 return atomic_read(&md->event_nr); 3474 } 3475 3476 int dm_wait_event(struct mapped_device *md, int event_nr) 3477 { 3478 return wait_event_interruptible(md->eventq, 3479 (event_nr != atomic_read(&md->event_nr))); 3480 } 3481 3482 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 3483 { 3484 unsigned long flags; 3485 3486 spin_lock_irqsave(&md->uevent_lock, flags); 3487 list_add(elist, &md->uevent_list); 3488 spin_unlock_irqrestore(&md->uevent_lock, flags); 3489 } 3490 3491 /* 3492 * The gendisk is only valid as long as you have a reference 3493 * count on 'md'. 
3494 */ 3495 struct gendisk *dm_disk(struct mapped_device *md) 3496 { 3497 return md->disk; 3498 } 3499 EXPORT_SYMBOL_GPL(dm_disk); 3500 3501 struct kobject *dm_kobject(struct mapped_device *md) 3502 { 3503 return &md->kobj_holder.kobj; 3504 } 3505 3506 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 3507 { 3508 struct mapped_device *md; 3509 3510 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 3511 3512 if (test_bit(DMF_FREEING, &md->flags) || 3513 dm_deleting_md(md)) 3514 return NULL; 3515 3516 dm_get(md); 3517 return md; 3518 } 3519 3520 int dm_suspended_md(struct mapped_device *md) 3521 { 3522 return test_bit(DMF_SUSPENDED, &md->flags); 3523 } 3524 3525 int dm_suspended_internally_md(struct mapped_device *md) 3526 { 3527 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3528 } 3529 3530 int dm_test_deferred_remove_flag(struct mapped_device *md) 3531 { 3532 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 3533 } 3534 3535 int dm_suspended(struct dm_target *ti) 3536 { 3537 return dm_suspended_md(dm_table_get_md(ti->table)); 3538 } 3539 EXPORT_SYMBOL_GPL(dm_suspended); 3540 3541 int dm_noflush_suspending(struct dm_target *ti) 3542 { 3543 return __noflush_suspending(dm_table_get_md(ti->table)); 3544 } 3545 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 3546 3547 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, 3548 unsigned integrity, unsigned per_bio_data_size) 3549 { 3550 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); 3551 struct kmem_cache *cachep = NULL; 3552 unsigned int pool_size = 0; 3553 unsigned int front_pad; 3554 3555 if (!pools) 3556 return NULL; 3557 3558 type = filter_md_type(type, md); 3559 3560 switch (type) { 3561 case DM_TYPE_BIO_BASED: 3562 cachep = _io_cache; 3563 pool_size = dm_get_reserved_bio_based_ios(); 3564 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); 3565 break; 3566 case DM_TYPE_REQUEST_BASED: 3567 cachep = _rq_tio_cache; 3568 pool_size = dm_get_reserved_rq_based_ios(); 3569 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); 3570 if (!pools->rq_pool) 3571 goto out; 3572 /* fall through to setup remaining rq-based pools */ 3573 case DM_TYPE_MQ_REQUEST_BASED: 3574 if (!pool_size) 3575 pool_size = dm_get_reserved_rq_based_ios(); 3576 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 3577 /* per_bio_data_size is not used. See __bind_mempools(). 
*/ 3578 WARN_ON(per_bio_data_size != 0); 3579 break; 3580 default: 3581 BUG(); 3582 } 3583 3584 if (cachep) { 3585 pools->io_pool = mempool_create_slab_pool(pool_size, cachep); 3586 if (!pools->io_pool) 3587 goto out; 3588 } 3589 3590 pools->bs = bioset_create_nobvec(pool_size, front_pad); 3591 if (!pools->bs) 3592 goto out; 3593 3594 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 3595 goto out; 3596 3597 return pools; 3598 3599 out: 3600 dm_free_md_mempools(pools); 3601 3602 return NULL; 3603 } 3604 3605 void dm_free_md_mempools(struct dm_md_mempools *pools) 3606 { 3607 if (!pools) 3608 return; 3609 3610 if (pools->io_pool) 3611 mempool_destroy(pools->io_pool); 3612 3613 if (pools->rq_pool) 3614 mempool_destroy(pools->rq_pool); 3615 3616 if (pools->bs) 3617 bioset_free(pools->bs); 3618 3619 kfree(pools); 3620 } 3621 3622 static const struct block_device_operations dm_blk_dops = { 3623 .open = dm_blk_open, 3624 .release = dm_blk_close, 3625 .ioctl = dm_blk_ioctl, 3626 .getgeo = dm_blk_getgeo, 3627 .owner = THIS_MODULE 3628 }; 3629 3630 /* 3631 * module hooks 3632 */ 3633 module_init(dm_init); 3634 module_exit(dm_exit); 3635 3636 module_param(major, uint, 0); 3637 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3638 3639 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3640 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3641 3642 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); 3643 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); 3644 3645 module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); 3646 MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); 3647 3648 MODULE_DESCRIPTION(DM_NAME " driver"); 3649 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3650 MODULE_LICENSE("GPL"); 3651