/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/kthread.h>
#include <linux/ktime.h>
#include <linux/elevator.h> /* for rq_end_sector() */
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

#ifdef CONFIG_PRINTK
/*
 * ratelimit state to be used in DMXXX_LIMIT().
 */
DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
		       DEFAULT_RATELIMIT_INTERVAL,
		       DEFAULT_RATELIMIT_BURST);
EXPORT_SYMBOL(dm_ratelimit_state);
#endif

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, *clone;
	struct kthread_work work;
	int error;
	union map_info info;
};

/*
 * For request-based dm - the bio clones we allocate are embedded in these
 * structs.
 *
 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
 * the bioset is created - this means the bio has to come at the end of the
 * struct.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
	struct bio clone;
};

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
#define DMF_DEFERRED_REMOVE 7
#define DMF_SUSPENDED_INTERNALLY 8

/*
 * A dummy definition to make RCU happy.
 * struct dm_table should never be dereferenced in this file.
 */
struct dm_table {
	int undefined__;
};

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct srcu_struct io_barrier;
	struct mutex suspend_lock;
	atomic_t holders;
	atomic_t open_count;

	/*
	 * The current mapping.
	 * Use dm_get_live_table{_fast} or take suspend_lock for
	 * dereference.
	 */
	struct dm_table __rcu *map;

	struct list_head table_devices;
	struct mutex table_devices_lock;

	unsigned long flags;

	struct request_queue *queue;
	unsigned type;
	/* Protect queue and type against concurrent access. */
	struct mutex type_lock;

	struct target_type *immutable_target_type;

	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending[2];
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * Processing queue (flush)
	 */
	struct workqueue_struct *wq;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *rq_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* kobject and completion */
	struct dm_kobject_holder kobj_holder;

	/* zero-length flush that will be cloned and submitted to targets */
	struct bio flush_bio;

	/* the number of internal suspends */
	unsigned internal_suspend_count;

	struct dm_stats stats;

	struct kthread_worker kworker;
	struct task_struct *kworker_task;

	/* for request-based merge heuristic in dm_request_fn() */
	unsigned seq_rq_merge_deadline_usecs;
	int last_rq_rw;
	sector_t last_rq_pos;
	ktime_t last_rq_start_time;

	/* for blk-mq request-based DM support */
	struct blk_mq_tag_set tag_set;
	bool use_blk_mq;
};

#ifdef CONFIG_DM_MQ_DEFAULT
static bool use_blk_mq = true;
#else
static bool use_blk_mq = false;
#endif

bool dm_use_blk_mq(struct mapped_device *md)
{
	return md->use_blk_mq;
}

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	mempool_t *rq_pool;
	struct bio_set *bs;
};

struct table_device {
	struct list_head list;
	atomic_t count;
	struct dm_dev dm_dev;
};

#define RESERVED_BIO_BASED_IOS		16
#define RESERVED_REQUEST_BASED_IOS	256
#define RESERVED_MAX_IOS		1024
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_cache;

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
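
/*
 * Note (added comment): both reserved-IOs knobs (bio-based above, rq-based
 * below) are only consumed through dm_get_reserved_bio_based_ios() and
 * dm_get_reserved_rq_based_ios(), which route through __dm_get_module_param():
 * a value of 0 falls back to the built-in default and anything above
 * RESERVED_MAX_IOS is clamped to RESERVED_MAX_IOS.  They are normally
 * overridden via module parameters; the exact parameter wiring lives outside
 * this excerpt.
 */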
/*
 * Request-based DM's mempools' reserved IOs set by the user.
 */
static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;

static unsigned __dm_get_module_param(unsigned *module_param,
				      unsigned def, unsigned max)
{
	unsigned param = ACCESS_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

unsigned dm_get_reserved_rq_based_ios(void)
{
	return __dm_get_module_param(&reserved_rq_based_ios,
				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);

static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_io_cache;

	_rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
				      __alignof__(struct request), 0, NULL);
	if (!_rq_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_cache;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();
out_free_rq_cache:
	kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	kmem_cache_destroy(_rq_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
}

/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);
out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = disk->private_data;
	if (WARN_ON(!md))
		goto out;

	if (atomic_dec_and_test(&md->open_count) &&
	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
		queue_work(deferred_remove_workqueue, &deferred_remove_work);

	dm_put(md);
out:
	spin_unlock(&_minor_lock);
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (test_bit(DMF_DELETING, &md->flags))
		r = -EBUSY;
	else
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}

sector_t dm_get_size(struct mapped_device *md)
{
	return get_capacity(md->disk);
}

struct request_queue *dm_get_md_queue(struct mapped_device *md)
{
	return md->queue;
}

struct dm_stats *dm_get_stats(struct mapped_device *md)
{
	return &md->stats;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int srcu_idx;
	struct dm_table *map;
	struct dm_target *tgt;
	int r = -ENOTTY;

retry:
	map = dm_get_live_table(md, &srcu_idx);

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);
	if (!tgt->type->ioctl)
		goto out;

	if (dm_suspended_md(md)) {
		r = -EAGAIN;
		goto out;
	}

	r = tgt->type->ioctl(tgt, cmd, arg);

out:
	dm_put_live_table(md, srcu_idx);

	if (r == -ENOTCONN) {
		msleep(10);
		goto retry;
	}

	return r;
}
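
/*
 * Note (added comment): dm_blk_ioctl() above only forwards ioctls when the
 * table has exactly one target, and it retries when that target returns
 * -ENOTCONN (most plausibly a transient "no usable path yet" condition from
 * a target such as multipath; treat that interpretation as an assumption
 * rather than something this file states).
 */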

static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	bio_put(&tio->clone);
}

static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
					    gfp_t gfp_mask)
{
	return mempool_alloc(md->io_pool, gfp_mask);
}

static void free_rq_tio(struct dm_rq_target_io *tio)
{
	mempool_free(tio, tio->md->io_pool);
}

static struct request *alloc_clone_request(struct mapped_device *md,
					   gfp_t gfp_mask)
{
	return mempool_alloc(md->rq_pool, gfp_mask);
}

static void free_clone_request(struct mapped_device *md, struct request *rq)
{
	mempool_free(rq, md->rq_pool);
}

static int md_in_flight(struct mapped_device *md)
{
	return atomic_read(&md->pending[READ]) +
	       atomic_read(&md->pending[WRITE]);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	int cpu;
	int rw = bio_data_dir(bio);

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	atomic_set(&dm_disk(md)->part0.in_flight[rw],
		   atomic_inc_return(&md->pending[rw]));

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
				    bio_sectors(bio), false, 0, &io->stats_aux);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending;
	int rw = bio_data_dir(bio);

	generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
				    bio_sectors(bio), true, duration, &io->stats_aux);

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a flush.
	 */
	pending = atomic_dec_return(&md->pending[rw]);
	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
	pending += atomic_read(&md->pending[rw^0x1]);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}
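
/*
 * Note (added comment): the canonical pattern for looking at the live table,
 * used throughout this file (see dm_blk_ioctl() above for a real caller), is:
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... inspect the table, may sleep ...
 *	}
 *	dm_put_live_table(md, srcu_idx);
 *
 * dm_get_live_table_fast()/dm_put_live_table_fast() below are the RCU variant
 * for callers that must not block while holding the reference.
 */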
/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}

/*
 * Open a table device so we can use it as a map destination.
 */
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
{
	static char *_claim_ptr = "I belong to device-mapper";
	struct block_device *bdev;

	int r;

	BUG_ON(td->dm_dev.bdev);

	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, dm_disk(md));
	if (r) {
		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
		return r;
	}

	td->dm_dev.bdev = bdev;
	return 0;
}

/*
 * Close a table device that we've been using.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (!td->dm_dev.bdev)
		return;

	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	td->dm_dev.bdev = NULL;
}

static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode) {
	struct table_device *td;

	list_for_each_entry(td, l, list)
		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
			return td;

	return NULL;
}

int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result) {
	int r;
	struct table_device *td;

	mutex_lock(&md->table_devices_lock);
	td = find_table_device(&md->table_devices, dev, mode);
	if (!td) {
		td = kmalloc(sizeof(*td), GFP_KERNEL);
		if (!td) {
			mutex_unlock(&md->table_devices_lock);
			return -ENOMEM;
		}

		td->dm_dev.mode = mode;
		td->dm_dev.bdev = NULL;

		if ((r = open_table_device(td, dev, md))) {
			mutex_unlock(&md->table_devices_lock);
			kfree(td);
			return r;
		}

		format_dev_t(td->dm_dev.name, dev);

		atomic_set(&td->count, 0);
		list_add(&td->list, &md->table_devices);
	}
	atomic_inc(&td->count);
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}
EXPORT_SYMBOL_GPL(dm_get_table_device);

void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
	struct table_device *td = container_of(d, struct table_device, dm_dev);

	mutex_lock(&md->table_devices_lock);
	if (atomic_dec_and_test(&td->count)) {
		close_table_device(td, md);
		list_del(&td->list);
		kfree(td);
	}
	mutex_unlock(&md->table_devices_lock);
}
EXPORT_SYMBOL(dm_put_table_device);
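
/*
 * Note (added comment): dm_get_table_device()/dm_put_table_device() are
 * reference counted per (dev_t, mode) pair: the underlying block device is
 * only opened on the first get and closed again when the last reference is
 * dropped.  A sketch of the pairing (hypothetical caller):
 *
 *	struct dm_dev *d;
 *
 *	if (!dm_get_table_device(md, dev, FMODE_READ | FMODE_WRITE, &d)) {
 *		... use d->bdev ...
 *		dm_put_table_device(md, d);
 *	}
 */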
static void free_table_devices(struct list_head *devices)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct table_device *td = list_entry(tmp, struct table_device, list);

		DMWARN("dm_destroy: %s still exists with %d references",
		       td->dm_dev.name, atomic_read(&td->count));
		kfree(td);
	}
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant soln is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md))
				bio_list_add_head(&md->deferred, io->bio);
			else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;
		end_io_acct(io);
		free_io(md, io);

		if (io_error == DM_ENDIO_REQUEUE)
			return;

		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_FLUSH.
			 */
			bio->bi_rw &= ~REQ_FLUSH;
			queue_io(md, bio);
		} else {
			/* done with normal IO or empty flush */
			trace_block_bio_complete(md->queue, bio, io_error);
			bio_endio(bio, io_error);
		}
	}
}

static void disable_write_same(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE SAME, disable it */
	limits->max_write_same_sectors = 0;
}

static void clone_endio(struct bio *bio, int error)
{
	int r = error;
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
		     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
		disable_write_same(md);

	free_tio(md, tio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info =
		container_of(clone, struct dm_rq_clone_bio_info, clone);
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_iter.bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io() handle
		 * the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver,
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something wrong is happening.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
	 * Do not use blk_end_request() here, because it may complete
	 * the original request before the clone, and break the ordering.
	 */
	blk_update_request(tio->orig, 0, nr_bytes);
}

static struct dm_rq_target_io *tio_from_request(struct request *rq)
{
	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
}
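
/*
 * Note (added comment): tio_from_request() hides the two allocation schemes
 * used for the per-request tio.  On blk-mq queues the tio lives in the
 * request's PDU (blk_mq_rq_to_pdu(), presumably set up by the blk-mq init
 * path, not shown in this excerpt); on the old request_fn path it is
 * allocated in dm_prep_fn() below and stashed in rq->special.
 */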
/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Or do dm_get() before calling this function and dm_put() later.
 */
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
	int nr_requests_pending;

	atomic_dec(&md->pending[rw]);

	/* nudge anyone waiting on suspend queue */
	nr_requests_pending = md_in_flight(md);
	if (!nr_requests_pending)
		wake_up(&md->wait);

	/*
	 * Run this off this callpath, as drivers could invoke end_io while
	 * inside their request_fn (and holding the queue lock). Calling
	 * back into ->request_fn() could deadlock attempting to grab the
	 * queue lock again.
	 */
	if (run_queue) {
		if (md->queue->mq_ops)
			blk_mq_run_hw_queues(md->queue, true);
		else if (!nr_requests_pending ||
			 (nr_requests_pending >= md->queue->nr_congestion_on))
			blk_run_queue_async(md->queue);
	}

	/*
	 * dm_put() must be at the end of this function. See the comment above
	 */
	dm_put(md);
}

static void free_rq_clone(struct request *clone, bool must_be_mapped)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;

	WARN_ON_ONCE(must_be_mapped && !clone->q);

	blk_rq_unprep_clone(clone);

	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
		/* stacked on blk-mq queue(s) */
		tio->ti->type->release_clone_rq(clone);
	else if (!md->queue->mq_ops)
		/* request_fn queue stacked on request_fn queue(s) */
		free_clone_request(md, clone);
	/*
	 * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
	 * no need to call free_clone_request() because we leverage blk-mq by
	 * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
	 */

	if (!md->queue->mq_ops)
		free_rq_tio(tio);
}

/*
 * Complete the clone and the original request.
 * Must be called without clone's queue lock held,
 * see end_clone_request() for more details.
 */
static void dm_end_request(struct request *clone, int error)
{
	int rw = rq_data_dir(clone);
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
		rq->errors = clone->errors;
		rq->resid_len = clone->resid_len;

		if (rq->sense)
			/*
			 * We are using the sense buffer of the original
			 * request.
			 * So setting the length of the sense data is enough.
			 */
			rq->sense_len = clone->sense_len;
	}

	free_rq_clone(clone, true);
	if (!rq->q->mq_ops)
		blk_end_request_all(rq, error);
	else
		blk_mq_end_request(rq, error);
	rq_completed(md, rw, true);
}

static void dm_unprep_request(struct request *rq)
{
	struct dm_rq_target_io *tio = tio_from_request(rq);
	struct request *clone = tio->clone;

	if (!rq->q->mq_ops) {
		rq->special = NULL;
		rq->cmd_flags &= ~REQ_DONTPREP;
	}

	if (clone)
		free_rq_clone(clone, false);
}

/*
 * Requeue the original request of a clone.
 */
static void old_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_requeue_request(q, rq);
	blk_run_queue_async(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void dm_requeue_unmapped_original_request(struct mapped_device *md,
						 struct request *rq)
{
	int rw = rq_data_dir(rq);

	dm_unprep_request(rq);

	if (!rq->q->mq_ops)
		old_requeue_request(rq);
	else {
		blk_mq_requeue_request(rq);
		blk_mq_kick_requeue_list(rq->q);
	}

	rq_completed(md, rw, false);
}

static void dm_requeue_unmapped_request(struct request *clone)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	dm_requeue_unmapped_original_request(tio->md, tio->orig);
}

static void old_stop_queue(struct request_queue *q)
{
	unsigned long flags;

	if (blk_queue_stopped(q))
		return;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_stop_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void stop_queue(struct request_queue *q)
{
	if (!q->mq_ops)
		old_stop_queue(q);
	else
		blk_mq_stop_hw_queues(q);
}

static void old_start_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (blk_queue_stopped(q))
		blk_start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void start_queue(struct request_queue *q)
{
	if (!q->mq_ops)
		old_start_queue(q);
	else
		blk_mq_start_stopped_hw_queues(q, true);
}

static void dm_done(struct request *clone, int error, bool mapped)
{
	int r = error;
	struct dm_rq_target_io *tio = clone->end_io_data;
	dm_request_endio_fn rq_end_io = NULL;

	if (tio->ti) {
		rq_end_io = tio->ti->type->rq_end_io;

		if (mapped && rq_end_io)
			r = rq_end_io(tio->ti, clone, error, &tio->info);
	}

	if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
		     !clone->q->limits.max_write_same_sectors))
		disable_write_same(tio->md);

	if (r <= 0)
		/* The target wants to complete the I/O */
		dm_end_request(clone, r);
	else if (r == DM_ENDIO_INCOMPLETE)
		/* The target will handle the I/O */
		return;
	else if (r == DM_ENDIO_REQUEUE)
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
	else {
		DMWARN("unimplemented target endio return value: %d", r);
		BUG();
	}
}

/*
 * Request completion handler for request-based dm
 */
static void dm_softirq_done(struct request *rq)
{
	bool mapped = true;
	struct dm_rq_target_io *tio = tio_from_request(rq);
	struct request *clone = tio->clone;
	int rw;

	if (!clone) {
		rw = rq_data_dir(rq);
		if (!rq->q->mq_ops) {
			blk_end_request_all(rq, tio->error);
			rq_completed(tio->md, rw, false);
			free_rq_tio(tio);
		} else {
			blk_mq_end_request(rq, tio->error);
			rq_completed(tio->md, rw, false);
		}
		return;
	}

	if (rq->cmd_flags & REQ_FAILED)
		mapped = false;

	dm_done(clone, tio->error, mapped);
}
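
/*
 * Note (added comment): for a successfully mapped clone the completion flow
 * in this file is roughly:
 *
 *	end_clone_request() -> dm_complete_request() -> blk_complete_request()
 *	  -> dm_softirq_done() (softirq) -> dm_done()
 *	    -> dm_end_request() / dm_requeue_unmapped_request()
 *
 * dm_kill_unmapped_request() short-circuits this for requests whose mapping
 * failed, marking them REQ_FAILED so dm_done() skips the target's rq_end_io.
 */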
/*
 * Complete the clone and the original request with the error status
 * through softirq context.
 */
static void dm_complete_request(struct request *rq, int error)
{
	struct dm_rq_target_io *tio = tio_from_request(rq);

	tio->error = error;
	blk_complete_request(rq);
}

/*
 * Complete the not-mapped clone and the original request with the error status
 * through softirq context.
 * Target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
 */
static void dm_kill_unmapped_request(struct request *rq, int error)
{
	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(rq, error);
}

/*
 * Called with the clone's queue lock held (for non-blk-mq)
 */
static void end_clone_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	if (!clone->q->mq_ops) {
		/*
		 * For just cleaning up the information of the queue in which
		 * the clone was dispatched.
		 * The clone is *NOT* freed actually here because it is alloced
		 * from dm own mempool (REQ_ALLOCED isn't set).
		 */
		__blk_put_request(clone->q, clone);
	}

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the clone's queue lock.  Otherwise, deadlock could occur because:
	 *     - another request may be submitted by the upper level driver
	 *       of the stacking during the completion
	 *     - the submission which requires queue lock may be done
	 *       against this clone's queue
	 */
	dm_complete_request(tio->orig, error);
}

/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	sector_t target_offset = dm_target_offset(ti, sector);

	return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);
	sector_t offset, max_len;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->max_io_len) {
		offset = dm_target_offset(ti, sector);
		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
			max_len = sector_div(offset, ti->max_io_len);
		else
			max_len = offset & (ti->max_io_len - 1);
		max_len = ti->max_io_len - max_len;

		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
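
/*
 * Note (added comment): targets that need their I/O split on an internal
 * boundary typically call dm_set_target_max_io_len() from their constructor,
 * e.g. (hypothetical target, a sketch):
 *
 *	static int my_target_ctr(struct dm_target *ti, unsigned argc, char **argv)
 *	{
 *		...
 *		r = dm_set_target_max_io_len(ti, chunk_sectors);
 *		if (r)
 *			return r;
 *		...
 *	}
 *
 * max_io_len() above then caps each clone at that boundary.
 */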
1402 * 1403 * A diagram that explains the arithmetics: 1404 * +--------------------+---------------+-------+ 1405 * | 1 | 2 | 3 | 1406 * +--------------------+---------------+-------+ 1407 * 1408 * <-------------- *tio->len_ptr ---------------> 1409 * <------- bi_size -------> 1410 * <-- n_sectors --> 1411 * 1412 * Region 1 was already iterated over with bio_advance or similar function. 1413 * (it may be empty if the target doesn't use bio_advance) 1414 * Region 2 is the remaining bio size that the target wants to process. 1415 * (it may be empty if region 1 is non-empty, although there is no reason 1416 * to make it empty) 1417 * The target requires that region 3 is to be sent in the next bio. 1418 * 1419 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1420 * the partially processed part (the sum of regions 1+2) must be the same for all 1421 * copies of the bio. 1422 */ 1423 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1424 { 1425 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1426 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1427 BUG_ON(bio->bi_rw & REQ_FLUSH); 1428 BUG_ON(bi_size > *tio->len_ptr); 1429 BUG_ON(n_sectors > bi_size); 1430 *tio->len_ptr -= bi_size - n_sectors; 1431 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1432 } 1433 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1434 1435 static void __map_bio(struct dm_target_io *tio) 1436 { 1437 int r; 1438 sector_t sector; 1439 struct mapped_device *md; 1440 struct bio *clone = &tio->clone; 1441 struct dm_target *ti = tio->ti; 1442 1443 clone->bi_end_io = clone_endio; 1444 1445 /* 1446 * Map the clone. If r == 0 we don't need to do 1447 * anything, the target has assumed ownership of 1448 * this io. 1449 */ 1450 atomic_inc(&tio->io->io_count); 1451 sector = clone->bi_iter.bi_sector; 1452 r = ti->type->map(ti, clone); 1453 if (r == DM_MAPIO_REMAPPED) { 1454 /* the bio has been remapped so dispatch it */ 1455 1456 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1457 tio->io->bio->bi_bdev->bd_dev, sector); 1458 1459 generic_make_request(clone); 1460 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1461 /* error the io and bail out, or requeue it if needed */ 1462 md = tio->io->md; 1463 dec_pending(tio->io, r); 1464 free_tio(md, tio); 1465 } else if (r) { 1466 DMWARN("unimplemented target map return value: %d", r); 1467 BUG(); 1468 } 1469 } 1470 1471 struct clone_info { 1472 struct mapped_device *md; 1473 struct dm_table *map; 1474 struct bio *bio; 1475 struct dm_io *io; 1476 sector_t sector; 1477 unsigned sector_count; 1478 }; 1479 1480 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1481 { 1482 bio->bi_iter.bi_sector = sector; 1483 bio->bi_iter.bi_size = to_bytes(len); 1484 } 1485 1486 /* 1487 * Creates a bio that consists of range of complete bvecs. 
/*
 * Creates a bio that consists of range of complete bvecs.
 */
static void clone_bio(struct dm_target_io *tio, struct bio *bio,
		      sector_t sector, unsigned len)
{
	struct bio *clone = &tio->clone;

	__bio_clone_fast(clone, bio);

	if (bio_integrity(bio))
		bio_integrity_clone(clone, bio, GFP_NOIO);

	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
	clone->bi_iter.bi_size = to_bytes(len);

	if (bio_integrity(bio))
		bio_integrity_trim(clone, 0, len);
}

static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti,
				      unsigned target_bio_nr)
{
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
	tio = container_of(clone, struct dm_target_io, clone);

	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;

	return tio;
}

static void __clone_and_map_simple_bio(struct clone_info *ci,
				       struct dm_target *ti,
				       unsigned target_bio_nr, unsigned *len)
{
	struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
	struct bio *clone = &tio->clone;

	tio->len_ptr = len;

	__bio_clone_fast(clone, ci->bio);
	if (len)
		bio_setup_sector(clone, ci->sector, *len);

	__map_bio(tio);
}

static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	unsigned target_bio_nr;

	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
}

static int __send_empty_flush(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;

	BUG_ON(bio_has_data(ci->bio));
	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);

	return 0;
}
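
/*
 * Note (added comment): an empty (zero-length) REQ_FLUSH bio is not split by
 * sector at all; __send_empty_flush() above simply sends ti->num_flush_bios
 * copies of md->flush_bio to every target in the table, and
 * __split_and_process_bio() below sets ci->sector_count to 0 for that case.
 */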
static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
				     sector_t sector, unsigned *len)
{
	struct bio *bio = ci->bio;
	struct dm_target_io *tio;
	unsigned target_bio_nr;
	unsigned num_target_bios = 1;

	/*
	 * Does the target want to receive duplicate copies of the bio?
	 */
	if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
		num_target_bios = ti->num_write_bios(ti, bio);

	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
		tio = alloc_tio(ci, ti, target_bio_nr);
		tio->len_ptr = len;
		clone_bio(tio, bio, sector, *len);
		__map_bio(tio);
	}
}

typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);

static unsigned get_num_discard_bios(struct dm_target *ti)
{
	return ti->num_discard_bios;
}

static unsigned get_num_write_same_bios(struct dm_target *ti)
{
	return ti->num_write_same_bios;
}

typedef bool (*is_split_required_fn)(struct dm_target *ti);

static bool is_split_required_for_discard(struct dm_target *ti)
{
	return ti->split_discard_bios;
}

static int __send_changing_extent_only(struct clone_info *ci,
				       get_num_bios_fn get_num_bios,
				       is_split_required_fn is_split_required)
{
	struct dm_target *ti;
	unsigned len;
	unsigned num_bios;

	do {
		ti = dm_table_find_target(ci->map, ci->sector);
		if (!dm_target_is_valid(ti))
			return -EIO;

		/*
		 * Even though the device advertised support for this type of
		 * request, that does not mean every target supports it, and
		 * reconfiguration might also have changed that since the
		 * check was performed.
		 */
		num_bios = get_num_bios ? get_num_bios(ti) : 0;
		if (!num_bios)
			return -EOPNOTSUPP;

		if (is_split_required && !is_split_required(ti))
			len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
		else
			len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));

		__send_duplicate_bios(ci, ti, num_bios, &len);

		ci->sector += len;
	} while (ci->sector_count -= len);

	return 0;
}

static int __send_discard(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_discard_bios,
					   is_split_required_for_discard);
}

static int __send_write_same(struct clone_info *ci)
{
	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}

/*
 * Select the correct strategy for processing a non-flush bio.
 */
static int __split_and_process_non_flush(struct clone_info *ci)
{
	struct bio *bio = ci->bio;
	struct dm_target *ti;
	unsigned len;

	if (unlikely(bio->bi_rw & REQ_DISCARD))
		return __send_discard(ci);
	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
		return __send_write_same(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);

	__clone_and_map_data_bio(ci, ti, ci->sector, &len);

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}

/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
static void __split_and_process_bio(struct mapped_device *md,
				    struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	if (unlikely(!map)) {
		bio_io_error(bio);
		return;
	}

	ci.map = map;
	ci.md = md;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	spin_lock_init(&ci.io->endio_lock);
	ci.sector = bio->bi_iter.bi_sector;

	start_io_acct(ci.io);

	if (bio->bi_rw & REQ_FLUSH) {
		ci.bio = &ci.md->flush_bio;
		ci.sector_count = 0;
		error = __send_empty_flush(&ci);
		/* dec_pending submits any data associated with flush */
	} else {
		ci.bio = bio;
		ci.sector_count = bio_sectors(bio);
		while (ci.sector_count && !error)
			error = __split_and_process_non_flush(&ci);
	}

	/* drop the extra reference count */
	dec_pending(ci.io, error);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_live_table_fast(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(bvm->bi_sector, ti),
			  (sector_t) queue_max_sectors(q));
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support merge method and some of the devices
	 * provided their merge_bvec method (we know this by looking for the
	 * max_hw_sectors that dm_set_device_limits may set), then we can't
	 * allow bios with multiple vector entries.  So always set max_size
	 * to 0, and the code below allows just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
		max_size = 0;

out:
	dm_put_live_table_fast(md);
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static void dm_make_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);

	/* if we're suspended, we have to queue this io for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
		dm_put_live_table(md, srcu_idx);

		if (bio_rw(bio) != READA)
			queue_io(md, bio);
		else
			bio_io_error(bio);
		return;
	}

	__split_and_process_bio(md, map, bio);
	dm_put_live_table(md, srcu_idx);
	return;
}

int dm_request_based(struct mapped_device *md)
{
	return blk_queue_stackable(md->queue);
}

static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
{
	int r;

	if (blk_queue_io_stat(clone->q))
		clone->cmd_flags |= REQ_IO_STAT;

	clone->start_time = jiffies;
	r = blk_insert_cloned_request(clone->q, clone);
	if (r)
		/* must complete clone in terms of original request */
		dm_complete_request(rq, r);
}

static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
				 void *data)
{
	struct dm_rq_target_io *tio = data;
	struct dm_rq_clone_bio_info *info =
		container_of(bio, struct dm_rq_clone_bio_info, clone);

	info->orig = bio_orig;
	info->tio = tio;
	bio->bi_end_io = end_clone_bio;

	return 0;
}

static int setup_clone(struct request *clone, struct request *rq,
		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
{
	int r;

	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
			      dm_rq_bio_constructor, tio);
	if (r)
		return r;

	clone->cmd = rq->cmd;
	clone->cmd_len = rq->cmd_len;
	clone->sense = rq->sense;
	clone->end_io = end_clone_request;
	clone->end_io_data = tio;

	tio->clone = clone;

	return 0;
}

static struct request *clone_rq(struct request *rq, struct mapped_device *md,
				struct dm_rq_target_io *tio, gfp_t gfp_mask)
{
	/*
	 * Do not allocate a clone if tio->clone was already set
	 * (see: dm_mq_queue_rq).
	 */
	bool alloc_clone = !tio->clone;
	struct request *clone;

	if (alloc_clone) {
		clone = alloc_clone_request(md, gfp_mask);
		if (!clone)
			return NULL;
	} else
		clone = tio->clone;

	blk_rq_init(NULL, clone);
	if (setup_clone(clone, rq, tio, gfp_mask)) {
		/* -ENOMEM */
		if (alloc_clone)
			free_clone_request(md, clone);
		return NULL;
	}

	return clone;
}

static void map_tio_request(struct kthread_work *work);

static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
		     struct mapped_device *md)
{
	tio->md = md;
	tio->ti = NULL;
	tio->clone = NULL;
	tio->orig = rq;
	tio->error = 0;
	memset(&tio->info, 0, sizeof(tio->info));
	if (md->kworker_task)
		init_kthread_work(&tio->work, map_tio_request);
}

static struct dm_rq_target_io *prep_tio(struct request *rq,
					struct mapped_device *md, gfp_t gfp_mask)
{
	struct dm_rq_target_io *tio;
	int srcu_idx;
	struct dm_table *table;

	tio = alloc_rq_tio(md, gfp_mask);
	if (!tio)
		return NULL;

	init_tio(tio, rq, md);

	table = dm_get_live_table(md, &srcu_idx);
	if (!dm_table_mq_request_based(table)) {
		if (!clone_rq(rq, md, tio, gfp_mask)) {
			dm_put_live_table(md, srcu_idx);
			free_rq_tio(tio);
			return NULL;
		}
	}
	dm_put_live_table(md, srcu_idx);

	return tio;
}

/*
 * Called with the queue lock held.
 */
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
	struct mapped_device *md = q->queuedata;
	struct dm_rq_target_io *tio;

	if (unlikely(rq->special)) {
		DMWARN("Already has something in rq->special.");
		return BLKPREP_KILL;
	}

	tio = prep_tio(rq, md, GFP_ATOMIC);
	if (!tio)
		return BLKPREP_DEFER;

	rq->special = tio;
	rq->cmd_flags |= REQ_DONTPREP;

	return BLKPREP_OK;
}

/*
 * Returns:
 * 0                : the request has been processed
 * DM_MAPIO_REQUEUE : the original request needs to be requeued
 * < 0              : the request was completed due to failure
 */
static int map_request(struct dm_rq_target_io *tio, struct request *rq,
		       struct mapped_device *md)
{
	int r;
	struct dm_target *ti = tio->ti;
	struct request *clone = NULL;

	if (tio->clone) {
		clone = tio->clone;
		r = ti->type->map_rq(ti, clone, &tio->info);
	} else {
		r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
		if (r < 0) {
			/* The target wants to complete the I/O */
			dm_kill_unmapped_request(rq, r);
			return r;
		}
		if (IS_ERR(clone))
			return DM_MAPIO_REQUEUE;
		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
			/* -ENOMEM */
			ti->type->release_clone_rq(clone);
			return DM_MAPIO_REQUEUE;
		}
	}

	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/* The target has taken the I/O to submit by itself later */
		break;
	case DM_MAPIO_REMAPPED:
		/* The target has remapped the I/O so dispatch it */
		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
				     blk_rq_pos(rq));
		dm_dispatch_clone_request(clone, rq);
		break;
	case DM_MAPIO_REQUEUE:
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
		break;
	default:
		if (r > 0) {
			DMWARN("unimplemented target map return value: %d", r);
			BUG();
		}

		/* The target wants to complete the I/O */
		dm_kill_unmapped_request(rq, r);
		return r;
	}

	return 0;
}

static void map_tio_request(struct kthread_work *work)
{
	struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
	struct request *rq = tio->orig;
	struct mapped_device *md = tio->md;

	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
		dm_requeue_unmapped_original_request(md, rq);
}

static void dm_start_request(struct mapped_device *md, struct request *orig)
{
	if (!orig->q->mq_ops)
		blk_start_request(orig);
	else
		blk_mq_start_request(orig);
	atomic_inc(&md->pending[rq_data_dir(orig)]);

	if (md->seq_rq_merge_deadline_usecs) {
		md->last_rq_pos = rq_end_sector(orig);
		md->last_rq_rw = rq_data_dir(orig);
		md->last_rq_start_time = ktime_get();
	}

	/*
	 * Hold the md reference here for the in-flight I/O.
	 * We can't rely on the reference count by device opener,
	 * because the device may be closed during the request completion
	 * when all bios are completed.
	 * See the comment in rq_completed() too.
	 */
	dm_get(md);
}

#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000

ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
{
	return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
}

ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
						     const char *buf, size_t count)
{
	unsigned deadline;

	if (!dm_request_based(md) || md->use_blk_mq)
		return count;

	if (kstrtouint(buf, 10, &deadline))
		return -EINVAL;

	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;

	md->seq_rq_merge_deadline_usecs = deadline;

	return count;
}

static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
{
	ktime_t kt_deadline;

	if (!md->seq_rq_merge_deadline_usecs)
		return false;

	kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
	kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);

	return !ktime_after(ktime_get(), kt_deadline);
}
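
/*
 * Note (added comment): the seq_rq_merge_deadline_usecs knob above (exposed
 * through the dm_attr_* sysfs handlers; the attribute wiring itself lives
 * outside this excerpt) lets dm_request_fn() below hold back a request for up
 * to the configured number of microseconds when it immediately follows the
 * previously started request in the same direction, giving the block layer a
 * chance to merge sequential I/O before dispatch.  0 (the default) disables
 * the heuristic; stored values are clamped to MAX_SEQ_RQ_MERGE_DEADLINE_USECS.
 */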
/*
 * q->request_fn for request-based dm.
 * Called with the queue lock held.
 */
static void dm_request_fn(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	int srcu_idx;
	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
	struct dm_target *ti;
	struct request *rq;
	struct dm_rq_target_io *tio;
	sector_t pos;

	/*
	 * For suspend, check blk_queue_stopped() and increment
	 * ->pending within a single queue_lock not to increment the
	 * number of in-flight I/Os after the queue is stopped in
	 * dm_suspend().
	 */
	while (!blk_queue_stopped(q)) {
		rq = blk_peek_request(q);
		if (!rq)
			goto out;

		/* always use block 0 to find the target for flushes for now */
		pos = 0;
		if (!(rq->cmd_flags & REQ_FLUSH))
			pos = blk_rq_pos(rq);

		ti = dm_table_find_target(map, pos);
		if (!dm_target_is_valid(ti)) {
			/*
			 * Must perform setup, that rq_completed() requires,
			 * before calling dm_kill_unmapped_request
			 */
			DMERR_LIMIT("request attempted access beyond the end of device");
			dm_start_request(md, rq);
			dm_kill_unmapped_request(rq, -EIO);
			continue;
		}

		if (dm_request_peeked_before_merge_deadline(md) &&
		    md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
		    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
			goto delay_and_out;

		if (ti->type->busy && ti->type->busy(ti))
			goto delay_and_out;

		dm_start_request(md, rq);

		tio = tio_from_request(rq);
		/* Establish tio->ti before queuing work (map_tio_request) */
		tio->ti = ti;
		queue_kthread_work(&md->kworker, &tio->work);
		BUG_ON(!irqs_disabled());
	}

	goto out;

delay_and_out:
	blk_delay_queue(q, HZ / 100);
out:
	dm_put_live_table(md, srcu_idx);
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		map = dm_get_live_table_fast(md);
		if (map) {
			/*
			 * Request-based dm cares about only own queue for
			 * the query about congestion status of request_queue
			 */
			if (dm_request_based(md))
				r = md->queue->backing_dev_info.state &
				    bdi_bits;
			else
				r = dm_table_any_congested(map, bdi_bits);
		}
		dm_put_live_table_fast(md);
	}

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}

static int next_free_minor(int *minor)
{
	int r;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r;
	*minor = r;
	return 0;
}

static const struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);

static void dm_init_md_queue(struct mapped_device *md)
{
	/*
	 * Request-based dm devices cannot be stacked on top of bio-based dm
	 * devices.  The type of this dm device may not have been decided yet.
2235 * The type is decided at the first table loading time. 2236 * To prevent problematic device stacking, clear the queue flag 2237 * for request stacking support until then. 2238 * 2239 * This queue is new, so no concurrency on the queue_flags. 2240 */ 2241 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2242 } 2243 2244 static void dm_init_old_md_queue(struct mapped_device *md) 2245 { 2246 md->use_blk_mq = false; 2247 dm_init_md_queue(md); 2248 2249 /* 2250 * Initialize aspects of queue that aren't relevant for blk-mq 2251 */ 2252 md->queue->queuedata = md; 2253 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2254 md->queue->backing_dev_info.congested_data = md; 2255 2256 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2257 } 2258 2259 /* 2260 * Allocate and initialise a blank device with a given minor. 2261 */ 2262 static struct mapped_device *alloc_dev(int minor) 2263 { 2264 int r; 2265 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 2266 void *old_md; 2267 2268 if (!md) { 2269 DMWARN("unable to allocate device, out of memory."); 2270 return NULL; 2271 } 2272 2273 if (!try_module_get(THIS_MODULE)) 2274 goto bad_module_get; 2275 2276 /* get a minor number for the dev */ 2277 if (minor == DM_ANY_MINOR) 2278 r = next_free_minor(&minor); 2279 else 2280 r = specific_minor(minor); 2281 if (r < 0) 2282 goto bad_minor; 2283 2284 r = init_srcu_struct(&md->io_barrier); 2285 if (r < 0) 2286 goto bad_io_barrier; 2287 2288 md->use_blk_mq = use_blk_mq; 2289 md->type = DM_TYPE_NONE; 2290 mutex_init(&md->suspend_lock); 2291 mutex_init(&md->type_lock); 2292 mutex_init(&md->table_devices_lock); 2293 spin_lock_init(&md->deferred_lock); 2294 atomic_set(&md->holders, 1); 2295 atomic_set(&md->open_count, 0); 2296 atomic_set(&md->event_nr, 0); 2297 atomic_set(&md->uevent_seq, 0); 2298 INIT_LIST_HEAD(&md->uevent_list); 2299 INIT_LIST_HEAD(&md->table_devices); 2300 spin_lock_init(&md->uevent_lock); 2301 2302 md->queue = blk_alloc_queue(GFP_KERNEL); 2303 if (!md->queue) 2304 goto bad_queue; 2305 2306 dm_init_md_queue(md); 2307 2308 md->disk = alloc_disk(1); 2309 if (!md->disk) 2310 goto bad_disk; 2311 2312 atomic_set(&md->pending[0], 0); 2313 atomic_set(&md->pending[1], 0); 2314 init_waitqueue_head(&md->wait); 2315 INIT_WORK(&md->work, dm_wq_work); 2316 init_waitqueue_head(&md->eventq); 2317 init_completion(&md->kobj_holder.completion); 2318 md->kworker_task = NULL; 2319 2320 md->disk->major = _major; 2321 md->disk->first_minor = minor; 2322 md->disk->fops = &dm_blk_dops; 2323 md->disk->queue = md->queue; 2324 md->disk->private_data = md; 2325 sprintf(md->disk->disk_name, "dm-%d", minor); 2326 add_disk(md->disk); 2327 format_dev_t(md->name, MKDEV(_major, minor)); 2328 2329 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2330 if (!md->wq) 2331 goto bad_thread; 2332 2333 md->bdev = bdget_disk(md->disk, 0); 2334 if (!md->bdev) 2335 goto bad_bdev; 2336 2337 bio_init(&md->flush_bio); 2338 md->flush_bio.bi_bdev = md->bdev; 2339 md->flush_bio.bi_rw = WRITE_FLUSH; 2340 2341 dm_stats_init(&md->stats); 2342 2343 /* Populate the mapping, nobody knows we exist yet */ 2344 spin_lock(&_minor_lock); 2345 old_md = idr_replace(&_minor_idr, md, minor); 2346 spin_unlock(&_minor_lock); 2347 2348 BUG_ON(old_md != MINOR_ALLOCED); 2349 2350 return md; 2351 2352 bad_bdev: 2353 destroy_workqueue(md->wq); 2354 bad_thread: 2355 del_gendisk(md->disk); 2356 put_disk(md->disk); 2357 bad_disk: 2358 blk_cleanup_queue(md->queue); 2359 bad_queue: 2360 cleanup_srcu_struct(&md->io_barrier); 2361 
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	destroy_workqueue(md->wq);

	if (md->kworker_task)
		kthread_stop(md->kworker_task);
	if (md->io_pool)
		mempool_destroy(md->io_pool);
	if (md->rq_pool)
		mempool_destroy(md->rq_pool);
	if (md->bs)
		bioset_free(md->bs);

	cleanup_srcu_struct(&md->io_barrier);
	free_table_devices(&md->table_devices);
	dm_stats_cleanup(&md->stats);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);
	if (blk_get_integrity(md->disk))
		blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	if (md->use_blk_mq)
		blk_mq_free_tag_set(&md->tag_set);
	bdput(md->bdev);
	free_minor(minor);

	module_put(THIS_MODULE);
	kfree(md);
}

static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p = dm_table_get_md_mempools(t);

	if (md->bs) {
		/* The md already has necessary mempools. */
		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
			/*
			 * Reload the bioset because front_pad may have changed
			 * when a different table was loaded.
			 */
			bioset_free(md->bs);
			md->bs = p->bs;
			p->bs = NULL;
		}
		/*
		 * There's no need to reload with request-based dm because the
		 * size of front_pad doesn't change.
		 * Note for the future: if the bioset is ever reloaded here,
		 * prep-ed requests in the queue may still refer to bios from
		 * the old bioset, so the queue must be walked to unprep them.
		 */
		goto out;
	}

	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->rq_pool = p->rq_pool;
	p->rq_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed, no longer need any mempools in the table */
	dm_table_free_md_mempools(t);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}

/*
 * Return 1 if the queue has a compulsory merge_bvec_fn function.
 *
 * If this function returns 0, then the device is either a non-dm
 * device without a merge_bvec_fn, or it is a dm device that is
 * able to split any bios it receives that are too big.
 */
int dm_queue_merge_is_compulsory(struct request_queue *q)
{
	struct mapped_device *dev_md;

	if (!q->merge_bvec_fn)
		return 0;

	if (q->make_request_fn == dm_make_request) {
		dev_md = q->queuedata;
		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
			return 0;
	}

	return 1;
}

static int dm_device_merge_is_compulsory(struct dm_target *ti,
					 struct dm_dev *dev, sector_t start,
					 sector_t len, void *data)
{
	struct block_device *bdev = dev->bdev;
	struct request_queue *q = bdev_get_queue(bdev);

	return dm_queue_merge_is_compulsory(q);
}

/*
 * Return 1 if it is acceptable to ignore merge_bvec_fn based
 * on the properties of the underlying devices.
 */
static int dm_table_merge_is_optional(struct dm_table *table)
{
	unsigned i = 0;
	struct dm_target *ti;

	while (i < dm_table_get_num_targets(table)) {
		ti = dm_table_get_target(table, i++);

		if (ti->type->iterate_devices &&
		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
			return 0;
	}

	return 1;
}

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	sector_t size;
	int merge_is_optional;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	dm_table_event_callback(t, event_callback, md);

	/*
	 * If the old table type wasn't request-based, the queue hasn't been
	 * stopped during suspension, so stop it now to prevent I/O from being
	 * mapped before resume.
	 * This must be done before setting the queue restrictions, because
	 * request-based dm may start running as soon as they are set.
	 */
	if (dm_table_request_based(t))
		stop_queue(q);

	__bind_mempools(md, t);

	merge_is_optional = dm_table_merge_is_optional(t);

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	dm_table_set_restrictions(t, q, limits);
	if (merge_is_optional)
		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
	else
		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
	if (old_map)
		dm_sync_table(md);

	return old_map;
}

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = rcu_dereference_protected(md->map, 1);

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	RCU_INIT_POINTER(md->map, NULL);
	dm_sync_table(md);

	return map;
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, unsigned type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}

unsigned dm_get_md_type(struct mapped_device *md)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	return md->type;
}

struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	BUG_ON(!atomic_read(&md->holders));
	return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);

static void init_rq_based_worker_thread(struct mapped_device *md)
{
	/* Initialize the request-based DM worker thread */
	init_kthread_worker(&md->kworker);
	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
				       "kdmwork-%s", dm_device_name(md));
}

/*
 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
 */
static int dm_init_request_based_queue(struct mapped_device *md)
{
	struct request_queue *q = NULL;

	/* Fully initialize the queue */
	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
	if (!q)
		return -EINVAL;

	/* disable dm_request_fn's merge heuristic by default */
	md->seq_rq_merge_deadline_usecs = 0;

	md->queue = q;
	dm_init_old_md_queue(md);
	blk_queue_softirq_done(md->queue, dm_softirq_done);
	blk_queue_prep_rq(md->queue, dm_prep_fn);

	init_rq_based_worker_thread(md);

	elv_register_queue(md->queue);

	return 0;
}

static int dm_mq_init_request(void *data, struct request *rq,
			      unsigned int hctx_idx, unsigned int request_idx,
			      unsigned int numa_node)
{
	struct mapped_device *md = data;
	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);

	/*
	 * Must initialize md member of tio, otherwise it won't
	 * be available in dm_mq_queue_rq.
	 */
	tio->md = md;

	return 0;
}

static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
			  const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
	struct mapped_device *md = tio->md;
	int srcu_idx;
	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
	struct dm_target *ti;
	sector_t pos;

	/* always use block 0 to find the target for flushes for now */
	pos = 0;
	if (!(rq->cmd_flags & REQ_FLUSH))
		pos = blk_rq_pos(rq);

	ti = dm_table_find_target(map, pos);
	if (!dm_target_is_valid(ti)) {
		dm_put_live_table(md, srcu_idx);
		DMERR_LIMIT("request attempted access beyond the end of device");
		/*
		 * Must perform the setup that rq_completed() requires
		 * before returning BLK_MQ_RQ_QUEUE_ERROR.
		 */
		dm_start_request(md, rq);
		return BLK_MQ_RQ_QUEUE_ERROR;
	}
	dm_put_live_table(md, srcu_idx);

	if (ti->type->busy && ti->type->busy(ti))
		return BLK_MQ_RQ_QUEUE_BUSY;

	dm_start_request(md, rq);

	/* Init tio using md established in .init_request */
	init_tio(tio, rq, md);

	/*
	 * Establish tio->ti before queuing work (map_tio_request)
	 * or making direct call to map_request().
	 */
	tio->ti = ti;

	/* Clone the request if underlying devices aren't blk-mq */
	if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
		/* clone request is allocated at the end of the pdu */
		tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
		if (!clone_rq(rq, md, tio, GFP_ATOMIC))
			return BLK_MQ_RQ_QUEUE_BUSY;
		queue_kthread_work(&md->kworker, &tio->work);
	} else {
		/* Direct call is fine since .queue_rq allows allocations */
		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
			dm_requeue_unmapped_original_request(md, rq);
	}

	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops dm_mq_ops = {
	.queue_rq = dm_mq_queue_rq,
	.map_queue = blk_mq_map_queue,
	.complete = dm_softirq_done,
	.init_request = dm_mq_init_request,
};

static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
{
	unsigned md_type = dm_get_md_type(md);
	struct request_queue *q;
	int err;

	memset(&md->tag_set, 0, sizeof(md->tag_set));
	md->tag_set.ops = &dm_mq_ops;
	md->tag_set.queue_depth = BLKDEV_MAX_RQ;
	md->tag_set.numa_node = NUMA_NO_NODE;
	md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
	md->tag_set.nr_hw_queues = 1;
	if (md_type == DM_TYPE_REQUEST_BASED) {
		/* make the memory for non-blk-mq clone part of the pdu */
		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
	} else
		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
	md->tag_set.driver_data = md;

	err = blk_mq_alloc_tag_set(&md->tag_set);
	if (err)
		return err;

	q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}
	md->queue = q;
	dm_init_md_queue(md);

	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
	blk_mq_register_disk(md->disk);

	if (md_type == DM_TYPE_REQUEST_BASED)
		init_rq_based_worker_thread(md);

	return 0;

out_tag_set:
	blk_mq_free_tag_set(&md->tag_set);
	return err;
}
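
/*
 * dm_setup_md_queue() below finishes queue initialisation once the table
 * type is known:
 *
 *   DM_TYPE_BIO_BASED        - bios are handled by dm_make_request()
 *   DM_TYPE_REQUEST_BASED    - legacy ->request_fn path (dm_request_fn)
 *   DM_TYPE_MQ_REQUEST_BASED - blk-mq path (dm_mq_ops)
 *
 * filter_md_type() folds DM_TYPE_REQUEST_BASED into DM_TYPE_MQ_REQUEST_BASED
 * when md->use_blk_mq is set (see the use_blk_mq module parameter declared
 * at the bottom of this file).
 */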

static unsigned filter_md_type(unsigned type, struct mapped_device *md)
{
	if (type == DM_TYPE_BIO_BASED)
		return type;

	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
}

/*
 * Setup the DM device's queue based on md's type
 */
int dm_setup_md_queue(struct mapped_device *md)
{
	int r;
	unsigned md_type = filter_md_type(dm_get_md_type(md), md);

	switch (md_type) {
	case DM_TYPE_REQUEST_BASED:
		r = dm_init_request_based_queue(md);
		if (r) {
			DMWARN("Cannot initialize queue for request-based mapped device");
			return r;
		}
		break;
	case DM_TYPE_MQ_REQUEST_BASED:
		r = dm_init_request_based_blk_mq_queue(md);
		if (r) {
			DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
			return r;
		}
		break;
	case DM_TYPE_BIO_BASED:
		dm_init_old_md_queue(md);
		blk_queue_make_request(md->queue, dm_make_request);
		blk_queue_merge_bvec(md->queue, dm_merge_bvec);
		break;
	}

	return 0;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md) {
		if ((md == MINOR_ALLOCED ||
		     (MINOR(disk_devt(dm_disk(md))) != minor) ||
		     dm_deleting_md(md) ||
		     test_bit(DMF_FREEING, &md->flags))) {
			md = NULL;
			goto out;
		}
		dm_get(md);
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}

int dm_hold(struct mapped_device *md)
{
	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags)) {
		spin_unlock(&_minor_lock);
		return -EBUSY;
	}
	dm_get(md);
	spin_unlock(&_minor_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(dm_hold);

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	map = dm_get_live_table(md, &srcu_idx);

	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	if (dm_request_based(md) && md->kworker_task)
		flush_kthread_worker(&md->kworker);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}
	mutex_unlock(&md->suspend_lock);

	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);

	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.
	 * Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device
	 * after the mapped_device state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c)
			break;

		if (dm_request_based(md))
			generic_make_request(c);
		else
			__split_and_process_bio(md, map, c);
	}

	dm_put_live_table(md, srcu_idx);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_atomic();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to the md->deferred list.
 *
 * Caller must hold md->suspend_lock.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, int interruptible)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers, i.e. no one may be executing
	 * __split_and_process_bio.  This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock.  To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md)) {
		stop_queue(md->queue);
		if (md->kworker_task)
			flush_kthread_worker(&md->kworker);
	}

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, interruptible);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table(), dm_suspend() must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
	if (r)
		goto out_unlock;

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend.  It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */

static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return; /* nest suspend */
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
	 * would require changing .presuspend to return an error -- avoid this
	 * until there is a need for more elaborate variants of internal suspend.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);

	set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);

	dm_table_postsuspend_targets(map);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* resume from nested suspend */

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */

void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
					    unsigned integrity, unsigned per_bio_data_size)
{
	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
	struct kmem_cache *cachep = NULL;
	unsigned int pool_size = 0;
	unsigned int front_pad;

	if (!pools)
		return NULL;

	type = filter_md_type(type, md);

	switch (type) {
	case DM_TYPE_BIO_BASED:
		cachep = _io_cache;
		pool_size = dm_get_reserved_bio_based_ios();
		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
		break;
	case DM_TYPE_REQUEST_BASED:
		cachep = _rq_tio_cache;
		pool_size = dm_get_reserved_rq_based_ios();
		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
		if (!pools->rq_pool)
			goto out;
		/* fall through to setup remaining rq-based pools */
	case DM_TYPE_MQ_REQUEST_BASED:
		if (!pool_size)
			pool_size = dm_get_reserved_rq_based_ios();
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_bio_data_size is not used. See __bind_mempools(). */
		WARN_ON(per_bio_data_size != 0);
		break;
	default:
		BUG();
	}

	if (cachep) {
		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
		if (!pools->io_pool)
			goto out;
	}

	pools->bs = bioset_create_nobvec(pool_size, front_pad);
	if (!pools->bs)
		goto out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->rq_pool)
		mempool_destroy(pools->rq_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");

module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");