1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/mempool.h> 18 #include <linux/slab.h> 19 #include <linux/idr.h> 20 #include <linux/hdreg.h> 21 #include <linux/delay.h> 22 #include <linux/wait.h> 23 #include <linux/kthread.h> 24 #include <linux/ktime.h> 25 #include <linux/elevator.h> /* for rq_end_sector() */ 26 #include <linux/blk-mq.h> 27 #include <linux/pr.h> 28 29 #include <trace/events/block.h> 30 31 #define DM_MSG_PREFIX "core" 32 33 #ifdef CONFIG_PRINTK 34 /* 35 * ratelimit state to be used in DMXXX_LIMIT(). 36 */ 37 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 38 DEFAULT_RATELIMIT_INTERVAL, 39 DEFAULT_RATELIMIT_BURST); 40 EXPORT_SYMBOL(dm_ratelimit_state); 41 #endif 42 43 /* 44 * Cookies are numeric values sent with CHANGE and REMOVE 45 * uevents while resuming, removing or renaming the device. 46 */ 47 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 48 #define DM_COOKIE_LENGTH 24 49 50 static const char *_name = DM_NAME; 51 52 static unsigned int major = 0; 53 static unsigned int _major = 0; 54 55 static DEFINE_IDR(_minor_idr); 56 57 static DEFINE_SPINLOCK(_minor_lock); 58 59 static void do_deferred_remove(struct work_struct *w); 60 61 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 62 63 static struct workqueue_struct *deferred_remove_workqueue; 64 65 /* 66 * For bio-based dm. 67 * One of these is allocated per bio. 68 */ 69 struct dm_io { 70 struct mapped_device *md; 71 int error; 72 atomic_t io_count; 73 struct bio *bio; 74 unsigned long start_time; 75 spinlock_t endio_lock; 76 struct dm_stats_aux stats_aux; 77 }; 78 79 /* 80 * For request-based dm. 81 * One of these is allocated per request. 82 */ 83 struct dm_rq_target_io { 84 struct mapped_device *md; 85 struct dm_target *ti; 86 struct request *orig, *clone; 87 struct kthread_work work; 88 int error; 89 union map_info info; 90 struct dm_stats_aux stats_aux; 91 unsigned long duration_jiffies; 92 unsigned n_sectors; 93 }; 94 95 /* 96 * For request-based dm - the bio clones we allocate are embedded in these 97 * structs. 98 * 99 * We allocate these with bio_alloc_bioset, using the front_pad parameter when 100 * the bioset is created - this means the bio has to come at the end of the 101 * struct. 102 */ 103 struct dm_rq_clone_bio_info { 104 struct bio *orig; 105 struct dm_rq_target_io *tio; 106 struct bio clone; 107 }; 108 109 #define MINOR_ALLOCED ((void *)-1) 110 111 /* 112 * Bits for the md->flags field. 113 */ 114 #define DMF_BLOCK_IO_FOR_SUSPEND 0 115 #define DMF_SUSPENDED 1 116 #define DMF_FROZEN 2 117 #define DMF_FREEING 3 118 #define DMF_DELETING 4 119 #define DMF_NOFLUSH_SUSPENDING 5 120 #define DMF_DEFERRED_REMOVE 6 121 #define DMF_SUSPENDED_INTERNALLY 7 122 123 /* 124 * A dummy definition to make RCU happy. 125 * struct dm_table should never be dereferenced in this file. 126 */ 127 struct dm_table { 128 int undefined__; 129 }; 130 131 /* 132 * Work processed by per-device workqueue. 133 */ 134 struct mapped_device { 135 struct srcu_struct io_barrier; 136 struct mutex suspend_lock; 137 atomic_t holders; 138 atomic_t open_count; 139 140 /* 141 * The current mapping. 142 * Use dm_get_live_table{_fast} or take suspend_lock for 143 * dereference. 
144 */ 145 struct dm_table __rcu *map; 146 147 struct list_head table_devices; 148 struct mutex table_devices_lock; 149 150 unsigned long flags; 151 152 struct request_queue *queue; 153 int numa_node_id; 154 155 unsigned type; 156 /* Protect queue and type against concurrent access. */ 157 struct mutex type_lock; 158 159 struct dm_target *immutable_target; 160 struct target_type *immutable_target_type; 161 162 struct gendisk *disk; 163 char name[16]; 164 165 void *interface_ptr; 166 167 /* 168 * A list of ios that arrived while we were suspended. 169 */ 170 atomic_t pending[2]; 171 wait_queue_head_t wait; 172 struct work_struct work; 173 struct bio_list deferred; 174 spinlock_t deferred_lock; 175 176 /* 177 * Processing queue (flush) 178 */ 179 struct workqueue_struct *wq; 180 181 /* 182 * io objects are allocated from here. 183 */ 184 mempool_t *io_pool; 185 mempool_t *rq_pool; 186 187 struct bio_set *bs; 188 189 /* 190 * Event handling. 191 */ 192 atomic_t event_nr; 193 wait_queue_head_t eventq; 194 atomic_t uevent_seq; 195 struct list_head uevent_list; 196 spinlock_t uevent_lock; /* Protect access to uevent_list */ 197 198 /* 199 * freeze/thaw support require holding onto a super block 200 */ 201 struct super_block *frozen_sb; 202 struct block_device *bdev; 203 204 /* forced geometry settings */ 205 struct hd_geometry geometry; 206 207 /* kobject and completion */ 208 struct dm_kobject_holder kobj_holder; 209 210 /* zero-length flush that will be cloned and submitted to targets */ 211 struct bio flush_bio; 212 213 /* the number of internal suspends */ 214 unsigned internal_suspend_count; 215 216 struct dm_stats stats; 217 218 struct kthread_worker kworker; 219 struct task_struct *kworker_task; 220 221 /* for request-based merge heuristic in dm_request_fn() */ 222 unsigned seq_rq_merge_deadline_usecs; 223 int last_rq_rw; 224 sector_t last_rq_pos; 225 ktime_t last_rq_start_time; 226 227 /* for blk-mq request-based DM support */ 228 struct blk_mq_tag_set *tag_set; 229 bool use_blk_mq:1; 230 bool init_tio_pdu:1; 231 }; 232 233 #ifdef CONFIG_DM_MQ_DEFAULT 234 static bool use_blk_mq = true; 235 #else 236 static bool use_blk_mq = false; 237 #endif 238 239 #define DM_MQ_NR_HW_QUEUES 1 240 #define DM_MQ_QUEUE_DEPTH 2048 241 #define DM_NUMA_NODE NUMA_NO_NODE 242 243 static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; 244 static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; 245 static int dm_numa_node = DM_NUMA_NODE; 246 247 bool dm_use_blk_mq(struct mapped_device *md) 248 { 249 return md->use_blk_mq; 250 } 251 EXPORT_SYMBOL_GPL(dm_use_blk_mq); 252 253 /* 254 * For mempools pre-allocation at the table loading time. 255 */ 256 struct dm_md_mempools { 257 mempool_t *io_pool; 258 mempool_t *rq_pool; 259 struct bio_set *bs; 260 }; 261 262 struct table_device { 263 struct list_head list; 264 atomic_t count; 265 struct dm_dev dm_dev; 266 }; 267 268 #define RESERVED_BIO_BASED_IOS 16 269 #define RESERVED_REQUEST_BASED_IOS 256 270 #define RESERVED_MAX_IOS 1024 271 static struct kmem_cache *_io_cache; 272 static struct kmem_cache *_rq_tio_cache; 273 static struct kmem_cache *_rq_cache; 274 275 /* 276 * Bio-based DM's mempools' reserved IOs set by the user. 277 */ 278 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 279 280 /* 281 * Request-based DM's mempools' reserved IOs set by the user. 
282 */ 283 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 284 285 static int __dm_get_module_param_int(int *module_param, int min, int max) 286 { 287 int param = ACCESS_ONCE(*module_param); 288 int modified_param = 0; 289 bool modified = true; 290 291 if (param < min) 292 modified_param = min; 293 else if (param > max) 294 modified_param = max; 295 else 296 modified = false; 297 298 if (modified) { 299 (void)cmpxchg(module_param, param, modified_param); 300 param = modified_param; 301 } 302 303 return param; 304 } 305 306 static unsigned __dm_get_module_param(unsigned *module_param, 307 unsigned def, unsigned max) 308 { 309 unsigned param = ACCESS_ONCE(*module_param); 310 unsigned modified_param = 0; 311 312 if (!param) 313 modified_param = def; 314 else if (param > max) 315 modified_param = max; 316 317 if (modified_param) { 318 (void)cmpxchg(module_param, param, modified_param); 319 param = modified_param; 320 } 321 322 return param; 323 } 324 325 unsigned dm_get_reserved_bio_based_ios(void) 326 { 327 return __dm_get_module_param(&reserved_bio_based_ios, 328 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 329 } 330 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 331 332 unsigned dm_get_reserved_rq_based_ios(void) 333 { 334 return __dm_get_module_param(&reserved_rq_based_ios, 335 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 336 } 337 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 338 339 static unsigned dm_get_blk_mq_nr_hw_queues(void) 340 { 341 return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32); 342 } 343 344 static unsigned dm_get_blk_mq_queue_depth(void) 345 { 346 return __dm_get_module_param(&dm_mq_queue_depth, 347 DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH); 348 } 349 350 static unsigned dm_get_numa_node(void) 351 { 352 return __dm_get_module_param_int(&dm_numa_node, 353 DM_NUMA_NODE, num_online_nodes() - 1); 354 } 355 356 static int __init local_init(void) 357 { 358 int r = -ENOMEM; 359 360 /* allocate a slab for the dm_ios */ 361 _io_cache = KMEM_CACHE(dm_io, 0); 362 if (!_io_cache) 363 return r; 364 365 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 366 if (!_rq_tio_cache) 367 goto out_free_io_cache; 368 369 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), 370 __alignof__(struct request), 0, NULL); 371 if (!_rq_cache) 372 goto out_free_rq_tio_cache; 373 374 r = dm_uevent_init(); 375 if (r) 376 goto out_free_rq_cache; 377 378 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 379 if (!deferred_remove_workqueue) { 380 r = -ENOMEM; 381 goto out_uevent_exit; 382 } 383 384 _major = major; 385 r = register_blkdev(_major, _name); 386 if (r < 0) 387 goto out_free_workqueue; 388 389 if (!_major) 390 _major = r; 391 392 return 0; 393 394 out_free_workqueue: 395 destroy_workqueue(deferred_remove_workqueue); 396 out_uevent_exit: 397 dm_uevent_exit(); 398 out_free_rq_cache: 399 kmem_cache_destroy(_rq_cache); 400 out_free_rq_tio_cache: 401 kmem_cache_destroy(_rq_tio_cache); 402 out_free_io_cache: 403 kmem_cache_destroy(_io_cache); 404 405 return r; 406 } 407 408 static void local_exit(void) 409 { 410 flush_scheduled_work(); 411 destroy_workqueue(deferred_remove_workqueue); 412 413 kmem_cache_destroy(_rq_cache); 414 kmem_cache_destroy(_rq_tio_cache); 415 kmem_cache_destroy(_io_cache); 416 unregister_blkdev(_major, _name); 417 dm_uevent_exit(); 418 419 _major = 0; 420 421 DMINFO("cleaned up"); 422 } 423 424 static int (*_inits[])(void) __initdata = { 425 local_init, 426 dm_target_init, 427 dm_linear_init, 428 dm_stripe_init, 
429 dm_io_init, 430 dm_kcopyd_init, 431 dm_interface_init, 432 dm_statistics_init, 433 }; 434 435 static void (*_exits[])(void) = { 436 local_exit, 437 dm_target_exit, 438 dm_linear_exit, 439 dm_stripe_exit, 440 dm_io_exit, 441 dm_kcopyd_exit, 442 dm_interface_exit, 443 dm_statistics_exit, 444 }; 445 446 static int __init dm_init(void) 447 { 448 const int count = ARRAY_SIZE(_inits); 449 450 int r, i; 451 452 for (i = 0; i < count; i++) { 453 r = _inits[i](); 454 if (r) 455 goto bad; 456 } 457 458 return 0; 459 460 bad: 461 while (i--) 462 _exits[i](); 463 464 return r; 465 } 466 467 static void __exit dm_exit(void) 468 { 469 int i = ARRAY_SIZE(_exits); 470 471 while (i--) 472 _exits[i](); 473 474 /* 475 * Should be empty by this point. 476 */ 477 idr_destroy(&_minor_idr); 478 } 479 480 /* 481 * Block device functions 482 */ 483 int dm_deleting_md(struct mapped_device *md) 484 { 485 return test_bit(DMF_DELETING, &md->flags); 486 } 487 488 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 489 { 490 struct mapped_device *md; 491 492 spin_lock(&_minor_lock); 493 494 md = bdev->bd_disk->private_data; 495 if (!md) 496 goto out; 497 498 if (test_bit(DMF_FREEING, &md->flags) || 499 dm_deleting_md(md)) { 500 md = NULL; 501 goto out; 502 } 503 504 dm_get(md); 505 atomic_inc(&md->open_count); 506 out: 507 spin_unlock(&_minor_lock); 508 509 return md ? 0 : -ENXIO; 510 } 511 512 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 513 { 514 struct mapped_device *md; 515 516 spin_lock(&_minor_lock); 517 518 md = disk->private_data; 519 if (WARN_ON(!md)) 520 goto out; 521 522 if (atomic_dec_and_test(&md->open_count) && 523 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 524 queue_work(deferred_remove_workqueue, &deferred_remove_work); 525 526 dm_put(md); 527 out: 528 spin_unlock(&_minor_lock); 529 } 530 531 int dm_open_count(struct mapped_device *md) 532 { 533 return atomic_read(&md->open_count); 534 } 535 536 /* 537 * Guarantees nothing is using the device before it's deleted. 
538 */ 539 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 540 { 541 int r = 0; 542 543 spin_lock(&_minor_lock); 544 545 if (dm_open_count(md)) { 546 r = -EBUSY; 547 if (mark_deferred) 548 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 549 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 550 r = -EEXIST; 551 else 552 set_bit(DMF_DELETING, &md->flags); 553 554 spin_unlock(&_minor_lock); 555 556 return r; 557 } 558 559 int dm_cancel_deferred_remove(struct mapped_device *md) 560 { 561 int r = 0; 562 563 spin_lock(&_minor_lock); 564 565 if (test_bit(DMF_DELETING, &md->flags)) 566 r = -EBUSY; 567 else 568 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 569 570 spin_unlock(&_minor_lock); 571 572 return r; 573 } 574 575 static void do_deferred_remove(struct work_struct *w) 576 { 577 dm_deferred_remove(); 578 } 579 580 sector_t dm_get_size(struct mapped_device *md) 581 { 582 return get_capacity(md->disk); 583 } 584 585 struct request_queue *dm_get_md_queue(struct mapped_device *md) 586 { 587 return md->queue; 588 } 589 590 struct dm_stats *dm_get_stats(struct mapped_device *md) 591 { 592 return &md->stats; 593 } 594 595 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 596 { 597 struct mapped_device *md = bdev->bd_disk->private_data; 598 599 return dm_get_geometry(md, geo); 600 } 601 602 static int dm_grab_bdev_for_ioctl(struct mapped_device *md, 603 struct block_device **bdev, 604 fmode_t *mode) 605 { 606 struct dm_target *tgt; 607 struct dm_table *map; 608 int srcu_idx, r; 609 610 retry: 611 r = -ENOTTY; 612 map = dm_get_live_table(md, &srcu_idx); 613 if (!map || !dm_table_get_size(map)) 614 goto out; 615 616 /* We only support devices that have a single target */ 617 if (dm_table_get_num_targets(map) != 1) 618 goto out; 619 620 tgt = dm_table_get_target(map, 0); 621 if (!tgt->type->prepare_ioctl) 622 goto out; 623 624 if (dm_suspended_md(md)) { 625 r = -EAGAIN; 626 goto out; 627 } 628 629 r = tgt->type->prepare_ioctl(tgt, bdev, mode); 630 if (r < 0) 631 goto out; 632 633 bdgrab(*bdev); 634 dm_put_live_table(md, srcu_idx); 635 return r; 636 637 out: 638 dm_put_live_table(md, srcu_idx); 639 if (r == -ENOTCONN && !fatal_signal_pending(current)) { 640 msleep(10); 641 goto retry; 642 } 643 return r; 644 } 645 646 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 647 unsigned int cmd, unsigned long arg) 648 { 649 struct mapped_device *md = bdev->bd_disk->private_data; 650 int r; 651 652 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); 653 if (r < 0) 654 return r; 655 656 if (r > 0) { 657 /* 658 * Target determined this ioctl is being issued against 659 * a logical partition of the parent bdev; so extra 660 * validation is needed. 
661 */ 662 r = scsi_verify_blk_ioctl(NULL, cmd); 663 if (r) 664 goto out; 665 } 666 667 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); 668 out: 669 bdput(bdev); 670 return r; 671 } 672 673 static struct dm_io *alloc_io(struct mapped_device *md) 674 { 675 return mempool_alloc(md->io_pool, GFP_NOIO); 676 } 677 678 static void free_io(struct mapped_device *md, struct dm_io *io) 679 { 680 mempool_free(io, md->io_pool); 681 } 682 683 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 684 { 685 bio_put(&tio->clone); 686 } 687 688 static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, 689 gfp_t gfp_mask) 690 { 691 return mempool_alloc(md->io_pool, gfp_mask); 692 } 693 694 static void free_old_rq_tio(struct dm_rq_target_io *tio) 695 { 696 mempool_free(tio, tio->md->io_pool); 697 } 698 699 static struct request *alloc_old_clone_request(struct mapped_device *md, 700 gfp_t gfp_mask) 701 { 702 return mempool_alloc(md->rq_pool, gfp_mask); 703 } 704 705 static void free_old_clone_request(struct mapped_device *md, struct request *rq) 706 { 707 mempool_free(rq, md->rq_pool); 708 } 709 710 static int md_in_flight(struct mapped_device *md) 711 { 712 return atomic_read(&md->pending[READ]) + 713 atomic_read(&md->pending[WRITE]); 714 } 715 716 static void start_io_acct(struct dm_io *io) 717 { 718 struct mapped_device *md = io->md; 719 struct bio *bio = io->bio; 720 int cpu; 721 int rw = bio_data_dir(bio); 722 723 io->start_time = jiffies; 724 725 cpu = part_stat_lock(); 726 part_round_stats(cpu, &dm_disk(md)->part0); 727 part_stat_unlock(); 728 atomic_set(&dm_disk(md)->part0.in_flight[rw], 729 atomic_inc_return(&md->pending[rw])); 730 731 if (unlikely(dm_stats_used(&md->stats))) 732 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 733 bio_sectors(bio), false, 0, &io->stats_aux); 734 } 735 736 static void end_io_acct(struct dm_io *io) 737 { 738 struct mapped_device *md = io->md; 739 struct bio *bio = io->bio; 740 unsigned long duration = jiffies - io->start_time; 741 int pending; 742 int rw = bio_data_dir(bio); 743 744 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); 745 746 if (unlikely(dm_stats_used(&md->stats))) 747 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 748 bio_sectors(bio), true, duration, &io->stats_aux); 749 750 /* 751 * After this is decremented the bio must not be touched if it is 752 * a flush. 753 */ 754 pending = atomic_dec_return(&md->pending[rw]); 755 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 756 pending += atomic_read(&md->pending[rw^0x1]); 757 758 /* nudge anyone waiting on suspend queue */ 759 if (!pending) 760 wake_up(&md->wait); 761 } 762 763 /* 764 * Add the bio to the list of deferred io. 765 */ 766 static void queue_io(struct mapped_device *md, struct bio *bio) 767 { 768 unsigned long flags; 769 770 spin_lock_irqsave(&md->deferred_lock, flags); 771 bio_list_add(&md->deferred, bio); 772 spin_unlock_irqrestore(&md->deferred_lock, flags); 773 queue_work(md->wq, &md->work); 774 } 775 776 /* 777 * Everyone (including functions in this file), should use this 778 * function to access the md->map field, and make sure they call 779 * dm_put_live_table() when finished. 
780 */ 781 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 782 { 783 *srcu_idx = srcu_read_lock(&md->io_barrier); 784 785 return srcu_dereference(md->map, &md->io_barrier); 786 } 787 788 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 789 { 790 srcu_read_unlock(&md->io_barrier, srcu_idx); 791 } 792 793 void dm_sync_table(struct mapped_device *md) 794 { 795 synchronize_srcu(&md->io_barrier); 796 synchronize_rcu_expedited(); 797 } 798 799 /* 800 * A fast alternative to dm_get_live_table/dm_put_live_table. 801 * The caller must not block between these two functions. 802 */ 803 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 804 { 805 rcu_read_lock(); 806 return rcu_dereference(md->map); 807 } 808 809 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 810 { 811 rcu_read_unlock(); 812 } 813 814 /* 815 * Open a table device so we can use it as a map destination. 816 */ 817 static int open_table_device(struct table_device *td, dev_t dev, 818 struct mapped_device *md) 819 { 820 static char *_claim_ptr = "I belong to device-mapper"; 821 struct block_device *bdev; 822 823 int r; 824 825 BUG_ON(td->dm_dev.bdev); 826 827 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); 828 if (IS_ERR(bdev)) 829 return PTR_ERR(bdev); 830 831 r = bd_link_disk_holder(bdev, dm_disk(md)); 832 if (r) { 833 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 834 return r; 835 } 836 837 td->dm_dev.bdev = bdev; 838 return 0; 839 } 840 841 /* 842 * Close a table device that we've been using. 843 */ 844 static void close_table_device(struct table_device *td, struct mapped_device *md) 845 { 846 if (!td->dm_dev.bdev) 847 return; 848 849 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 850 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 851 td->dm_dev.bdev = NULL; 852 } 853 854 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 855 fmode_t mode) { 856 struct table_device *td; 857 858 list_for_each_entry(td, l, list) 859 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 860 return td; 861 862 return NULL; 863 } 864 865 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 866 struct dm_dev **result) { 867 int r; 868 struct table_device *td; 869 870 mutex_lock(&md->table_devices_lock); 871 td = find_table_device(&md->table_devices, dev, mode); 872 if (!td) { 873 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); 874 if (!td) { 875 mutex_unlock(&md->table_devices_lock); 876 return -ENOMEM; 877 } 878 879 td->dm_dev.mode = mode; 880 td->dm_dev.bdev = NULL; 881 882 if ((r = open_table_device(td, dev, md))) { 883 mutex_unlock(&md->table_devices_lock); 884 kfree(td); 885 return r; 886 } 887 888 format_dev_t(td->dm_dev.name, dev); 889 890 atomic_set(&td->count, 0); 891 list_add(&td->list, &md->table_devices); 892 } 893 atomic_inc(&td->count); 894 mutex_unlock(&md->table_devices_lock); 895 896 *result = &td->dm_dev; 897 return 0; 898 } 899 EXPORT_SYMBOL_GPL(dm_get_table_device); 900 901 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 902 { 903 struct table_device *td = container_of(d, struct table_device, dm_dev); 904 905 mutex_lock(&md->table_devices_lock); 906 if (atomic_dec_and_test(&td->count)) { 907 close_table_device(td, md); 908 list_del(&td->list); 909 kfree(td); 910 } 911 mutex_unlock(&md->table_devices_lock); 912 } 913 
EXPORT_SYMBOL(dm_put_table_device); 914 915 static void free_table_devices(struct list_head *devices) 916 { 917 struct list_head *tmp, *next; 918 919 list_for_each_safe(tmp, next, devices) { 920 struct table_device *td = list_entry(tmp, struct table_device, list); 921 922 DMWARN("dm_destroy: %s still exists with %d references", 923 td->dm_dev.name, atomic_read(&td->count)); 924 kfree(td); 925 } 926 } 927 928 /* 929 * Get the geometry associated with a dm device 930 */ 931 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 932 { 933 *geo = md->geometry; 934 935 return 0; 936 } 937 938 /* 939 * Set the geometry of a device. 940 */ 941 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 942 { 943 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 944 945 if (geo->start > sz) { 946 DMWARN("Start sector is beyond the geometry limits."); 947 return -EINVAL; 948 } 949 950 md->geometry = *geo; 951 952 return 0; 953 } 954 955 /*----------------------------------------------------------------- 956 * CRUD START: 957 * A more elegant soln is in the works that uses the queue 958 * merge fn, unfortunately there are a couple of changes to 959 * the block layer that I want to make for this. So in the 960 * interests of getting something for people to use I give 961 * you this clearly demarcated crap. 962 *---------------------------------------------------------------*/ 963 964 static int __noflush_suspending(struct mapped_device *md) 965 { 966 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 967 } 968 969 /* 970 * Decrements the number of outstanding ios that a bio has been 971 * cloned into, completing the original io if necc. 972 */ 973 static void dec_pending(struct dm_io *io, int error) 974 { 975 unsigned long flags; 976 int io_error; 977 struct bio *bio; 978 struct mapped_device *md = io->md; 979 980 /* Push-back supersedes any I/O errors */ 981 if (unlikely(error)) { 982 spin_lock_irqsave(&io->endio_lock, flags); 983 if (!(io->error > 0 && __noflush_suspending(md))) 984 io->error = error; 985 spin_unlock_irqrestore(&io->endio_lock, flags); 986 } 987 988 if (atomic_dec_and_test(&io->io_count)) { 989 if (io->error == DM_ENDIO_REQUEUE) { 990 /* 991 * Target requested pushing back the I/O. 992 */ 993 spin_lock_irqsave(&md->deferred_lock, flags); 994 if (__noflush_suspending(md)) 995 bio_list_add_head(&md->deferred, io->bio); 996 else 997 /* noflush suspend was interrupted. */ 998 io->error = -EIO; 999 spin_unlock_irqrestore(&md->deferred_lock, flags); 1000 } 1001 1002 io_error = io->error; 1003 bio = io->bio; 1004 end_io_acct(io); 1005 free_io(md, io); 1006 1007 if (io_error == DM_ENDIO_REQUEUE) 1008 return; 1009 1010 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { 1011 /* 1012 * Preflush done for flush with data, reissue 1013 * without REQ_FLUSH. 
1014 */ 1015 bio->bi_rw &= ~REQ_FLUSH; 1016 queue_io(md, bio); 1017 } else { 1018 /* done with normal IO or empty flush */ 1019 trace_block_bio_complete(md->queue, bio, io_error); 1020 bio->bi_error = io_error; 1021 bio_endio(bio); 1022 } 1023 } 1024 } 1025 1026 static void disable_write_same(struct mapped_device *md) 1027 { 1028 struct queue_limits *limits = dm_get_queue_limits(md); 1029 1030 /* device doesn't really support WRITE SAME, disable it */ 1031 limits->max_write_same_sectors = 0; 1032 } 1033 1034 static void clone_endio(struct bio *bio) 1035 { 1036 int error = bio->bi_error; 1037 int r = error; 1038 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1039 struct dm_io *io = tio->io; 1040 struct mapped_device *md = tio->io->md; 1041 dm_endio_fn endio = tio->ti->type->end_io; 1042 1043 if (endio) { 1044 r = endio(tio->ti, bio, error); 1045 if (r < 0 || r == DM_ENDIO_REQUEUE) 1046 /* 1047 * error and requeue request are handled 1048 * in dec_pending(). 1049 */ 1050 error = r; 1051 else if (r == DM_ENDIO_INCOMPLETE) 1052 /* The target will handle the io */ 1053 return; 1054 else if (r) { 1055 DMWARN("unimplemented target endio return value: %d", r); 1056 BUG(); 1057 } 1058 } 1059 1060 if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && 1061 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) 1062 disable_write_same(md); 1063 1064 free_tio(md, tio); 1065 dec_pending(io, error); 1066 } 1067 1068 /* 1069 * Partial completion handling for request-based dm 1070 */ 1071 static void end_clone_bio(struct bio *clone) 1072 { 1073 struct dm_rq_clone_bio_info *info = 1074 container_of(clone, struct dm_rq_clone_bio_info, clone); 1075 struct dm_rq_target_io *tio = info->tio; 1076 struct bio *bio = info->orig; 1077 unsigned int nr_bytes = info->orig->bi_iter.bi_size; 1078 int error = clone->bi_error; 1079 1080 bio_put(clone); 1081 1082 if (tio->error) 1083 /* 1084 * An error has already been detected on the request. 1085 * Once error occurred, just let clone->end_io() handle 1086 * the remainder. 1087 */ 1088 return; 1089 else if (error) { 1090 /* 1091 * Don't notice the error to the upper layer yet. 1092 * The error handling decision is made by the target driver, 1093 * when the request is completed. 1094 */ 1095 tio->error = error; 1096 return; 1097 } 1098 1099 /* 1100 * I/O for the bio successfully completed. 1101 * Notice the data completion to the upper layer. 1102 */ 1103 1104 /* 1105 * bios are processed from the head of the list. 1106 * So the completing bio should always be rq->bio. 1107 * If it's not, something wrong is happening. 1108 */ 1109 if (tio->orig->bio != bio) 1110 DMERR("bio completion is going in the middle of the request"); 1111 1112 /* 1113 * Update the original request. 1114 * Do not use blk_end_request() here, because it may complete 1115 * the original request before the clone, and break the ordering. 1116 */ 1117 blk_update_request(tio->orig, 0, nr_bytes); 1118 } 1119 1120 static struct dm_rq_target_io *tio_from_request(struct request *rq) 1121 { 1122 return (rq->q->mq_ops ? 
blk_mq_rq_to_pdu(rq) : rq->special); 1123 } 1124 1125 static void rq_end_stats(struct mapped_device *md, struct request *orig) 1126 { 1127 if (unlikely(dm_stats_used(&md->stats))) { 1128 struct dm_rq_target_io *tio = tio_from_request(orig); 1129 tio->duration_jiffies = jiffies - tio->duration_jiffies; 1130 dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig), 1131 tio->n_sectors, true, tio->duration_jiffies, 1132 &tio->stats_aux); 1133 } 1134 } 1135 1136 /* 1137 * Don't touch any member of the md after calling this function because 1138 * the md may be freed in dm_put() at the end of this function. 1139 * Or do dm_get() before calling this function and dm_put() later. 1140 */ 1141 static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 1142 { 1143 atomic_dec(&md->pending[rw]); 1144 1145 /* nudge anyone waiting on suspend queue */ 1146 if (!md_in_flight(md)) 1147 wake_up(&md->wait); 1148 1149 /* 1150 * Run this off this callpath, as drivers could invoke end_io while 1151 * inside their request_fn (and holding the queue lock). Calling 1152 * back into ->request_fn() could deadlock attempting to grab the 1153 * queue lock again. 1154 */ 1155 if (!md->queue->mq_ops && run_queue) 1156 blk_run_queue_async(md->queue); 1157 1158 /* 1159 * dm_put() must be at the end of this function. See the comment above 1160 */ 1161 dm_put(md); 1162 } 1163 1164 static void free_rq_clone(struct request *clone) 1165 { 1166 struct dm_rq_target_io *tio = clone->end_io_data; 1167 struct mapped_device *md = tio->md; 1168 1169 blk_rq_unprep_clone(clone); 1170 1171 if (md->type == DM_TYPE_MQ_REQUEST_BASED) 1172 /* stacked on blk-mq queue(s) */ 1173 tio->ti->type->release_clone_rq(clone); 1174 else if (!md->queue->mq_ops) 1175 /* request_fn queue stacked on request_fn queue(s) */ 1176 free_old_clone_request(md, clone); 1177 1178 if (!md->queue->mq_ops) 1179 free_old_rq_tio(tio); 1180 } 1181 1182 /* 1183 * Complete the clone and the original request. 1184 * Must be called without clone's queue lock held, 1185 * see end_clone_request() for more details. 1186 */ 1187 static void dm_end_request(struct request *clone, int error) 1188 { 1189 int rw = rq_data_dir(clone); 1190 struct dm_rq_target_io *tio = clone->end_io_data; 1191 struct mapped_device *md = tio->md; 1192 struct request *rq = tio->orig; 1193 1194 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 1195 rq->errors = clone->errors; 1196 rq->resid_len = clone->resid_len; 1197 1198 if (rq->sense) 1199 /* 1200 * We are using the sense buffer of the original 1201 * request. 1202 * So setting the length of the sense data is enough. 1203 */ 1204 rq->sense_len = clone->sense_len; 1205 } 1206 1207 free_rq_clone(clone); 1208 rq_end_stats(md, rq); 1209 if (!rq->q->mq_ops) 1210 blk_end_request_all(rq, error); 1211 else 1212 blk_mq_end_request(rq, error); 1213 rq_completed(md, rw, true); 1214 } 1215 1216 static void dm_unprep_request(struct request *rq) 1217 { 1218 struct dm_rq_target_io *tio = tio_from_request(rq); 1219 struct request *clone = tio->clone; 1220 1221 if (!rq->q->mq_ops) { 1222 rq->special = NULL; 1223 rq->cmd_flags &= ~REQ_DONTPREP; 1224 } 1225 1226 if (clone) 1227 free_rq_clone(clone); 1228 else if (!tio->md->queue->mq_ops) 1229 free_old_rq_tio(tio); 1230 } 1231 1232 /* 1233 * Requeue the original request of a clone. 
1234 */ 1235 static void dm_old_requeue_request(struct request *rq) 1236 { 1237 struct request_queue *q = rq->q; 1238 unsigned long flags; 1239 1240 spin_lock_irqsave(q->queue_lock, flags); 1241 blk_requeue_request(q, rq); 1242 blk_run_queue_async(q); 1243 spin_unlock_irqrestore(q->queue_lock, flags); 1244 } 1245 1246 static void dm_mq_requeue_request(struct request *rq) 1247 { 1248 struct request_queue *q = rq->q; 1249 unsigned long flags; 1250 1251 blk_mq_requeue_request(rq); 1252 spin_lock_irqsave(q->queue_lock, flags); 1253 if (!blk_queue_stopped(q)) 1254 blk_mq_kick_requeue_list(q); 1255 spin_unlock_irqrestore(q->queue_lock, flags); 1256 } 1257 1258 static void dm_requeue_original_request(struct mapped_device *md, 1259 struct request *rq) 1260 { 1261 int rw = rq_data_dir(rq); 1262 1263 dm_unprep_request(rq); 1264 1265 rq_end_stats(md, rq); 1266 if (!rq->q->mq_ops) 1267 dm_old_requeue_request(rq); 1268 else 1269 dm_mq_requeue_request(rq); 1270 1271 rq_completed(md, rw, false); 1272 } 1273 1274 static void dm_old_stop_queue(struct request_queue *q) 1275 { 1276 unsigned long flags; 1277 1278 spin_lock_irqsave(q->queue_lock, flags); 1279 if (blk_queue_stopped(q)) { 1280 spin_unlock_irqrestore(q->queue_lock, flags); 1281 return; 1282 } 1283 1284 blk_stop_queue(q); 1285 spin_unlock_irqrestore(q->queue_lock, flags); 1286 } 1287 1288 static void dm_stop_queue(struct request_queue *q) 1289 { 1290 if (!q->mq_ops) 1291 dm_old_stop_queue(q); 1292 else 1293 blk_mq_stop_hw_queues(q); 1294 } 1295 1296 static void dm_old_start_queue(struct request_queue *q) 1297 { 1298 unsigned long flags; 1299 1300 spin_lock_irqsave(q->queue_lock, flags); 1301 if (blk_queue_stopped(q)) 1302 blk_start_queue(q); 1303 spin_unlock_irqrestore(q->queue_lock, flags); 1304 } 1305 1306 static void dm_start_queue(struct request_queue *q) 1307 { 1308 if (!q->mq_ops) 1309 dm_old_start_queue(q); 1310 else { 1311 blk_mq_start_stopped_hw_queues(q, true); 1312 blk_mq_kick_requeue_list(q); 1313 } 1314 } 1315 1316 static void dm_done(struct request *clone, int error, bool mapped) 1317 { 1318 int r = error; 1319 struct dm_rq_target_io *tio = clone->end_io_data; 1320 dm_request_endio_fn rq_end_io = NULL; 1321 1322 if (tio->ti) { 1323 rq_end_io = tio->ti->type->rq_end_io; 1324 1325 if (mapped && rq_end_io) 1326 r = rq_end_io(tio->ti, clone, error, &tio->info); 1327 } 1328 1329 if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && 1330 !clone->q->limits.max_write_same_sectors)) 1331 disable_write_same(tio->md); 1332 1333 if (r <= 0) 1334 /* The target wants to complete the I/O */ 1335 dm_end_request(clone, r); 1336 else if (r == DM_ENDIO_INCOMPLETE) 1337 /* The target will handle the I/O */ 1338 return; 1339 else if (r == DM_ENDIO_REQUEUE) 1340 /* The target wants to requeue the I/O */ 1341 dm_requeue_original_request(tio->md, tio->orig); 1342 else { 1343 DMWARN("unimplemented target endio return value: %d", r); 1344 BUG(); 1345 } 1346 } 1347 1348 /* 1349 * Request completion handler for request-based dm 1350 */ 1351 static void dm_softirq_done(struct request *rq) 1352 { 1353 bool mapped = true; 1354 struct dm_rq_target_io *tio = tio_from_request(rq); 1355 struct request *clone = tio->clone; 1356 int rw; 1357 1358 if (!clone) { 1359 rq_end_stats(tio->md, rq); 1360 rw = rq_data_dir(rq); 1361 if (!rq->q->mq_ops) { 1362 blk_end_request_all(rq, tio->error); 1363 rq_completed(tio->md, rw, false); 1364 free_old_rq_tio(tio); 1365 } else { 1366 blk_mq_end_request(rq, tio->error); 1367 rq_completed(tio->md, rw, false); 1368 } 
1369 return; 1370 } 1371 1372 if (rq->cmd_flags & REQ_FAILED) 1373 mapped = false; 1374 1375 dm_done(clone, tio->error, mapped); 1376 } 1377 1378 /* 1379 * Complete the clone and the original request with the error status 1380 * through softirq context. 1381 */ 1382 static void dm_complete_request(struct request *rq, int error) 1383 { 1384 struct dm_rq_target_io *tio = tio_from_request(rq); 1385 1386 tio->error = error; 1387 if (!rq->q->mq_ops) 1388 blk_complete_request(rq); 1389 else 1390 blk_mq_complete_request(rq, error); 1391 } 1392 1393 /* 1394 * Complete the not-mapped clone and the original request with the error status 1395 * through softirq context. 1396 * Target's rq_end_io() function isn't called. 1397 * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. 1398 */ 1399 static void dm_kill_unmapped_request(struct request *rq, int error) 1400 { 1401 rq->cmd_flags |= REQ_FAILED; 1402 dm_complete_request(rq, error); 1403 } 1404 1405 /* 1406 * Called with the clone's queue lock held (in the case of .request_fn) 1407 */ 1408 static void end_clone_request(struct request *clone, int error) 1409 { 1410 struct dm_rq_target_io *tio = clone->end_io_data; 1411 1412 if (!clone->q->mq_ops) { 1413 /* 1414 * For just cleaning up the information of the queue in which 1415 * the clone was dispatched. 1416 * The clone is *NOT* freed actually here because it is alloced 1417 * from dm own mempool (REQ_ALLOCED isn't set). 1418 */ 1419 __blk_put_request(clone->q, clone); 1420 } 1421 1422 /* 1423 * Actual request completion is done in a softirq context which doesn't 1424 * hold the clone's queue lock. Otherwise, deadlock could occur because: 1425 * - another request may be submitted by the upper level driver 1426 * of the stacking during the completion 1427 * - the submission which requires queue lock may be done 1428 * against this clone's queue 1429 */ 1430 dm_complete_request(tio->orig, error); 1431 } 1432 1433 /* 1434 * Return maximum size of I/O possible at the supplied sector up to the current 1435 * target boundary. 1436 */ 1437 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 1438 { 1439 sector_t target_offset = dm_target_offset(ti, sector); 1440 1441 return ti->len - target_offset; 1442 } 1443 1444 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 1445 { 1446 sector_t len = max_io_len_target_boundary(sector, ti); 1447 sector_t offset, max_len; 1448 1449 /* 1450 * Does the target need to split even further? 1451 */ 1452 if (ti->max_io_len) { 1453 offset = dm_target_offset(ti, sector); 1454 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 1455 max_len = sector_div(offset, ti->max_io_len); 1456 else 1457 max_len = offset & (ti->max_io_len - 1); 1458 max_len = ti->max_io_len - max_len; 1459 1460 if (len > max_len) 1461 len = max_len; 1462 } 1463 1464 return len; 1465 } 1466 1467 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1468 { 1469 if (len > UINT_MAX) { 1470 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1471 (unsigned long long)len, UINT_MAX); 1472 ti->error = "Maximum size of target IO is too large"; 1473 return -EINVAL; 1474 } 1475 1476 ti->max_io_len = (uint32_t) len; 1477 1478 return 0; 1479 } 1480 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1481 1482 /* 1483 * A target may call dm_accept_partial_bio only from the map routine. It is 1484 * allowed for all bio types except REQ_FLUSH. 
1485 * 1486 * dm_accept_partial_bio informs the dm that the target only wants to process 1487 * additional n_sectors sectors of the bio and the rest of the data should be 1488 * sent in a next bio. 1489 * 1490 * A diagram that explains the arithmetics: 1491 * +--------------------+---------------+-------+ 1492 * | 1 | 2 | 3 | 1493 * +--------------------+---------------+-------+ 1494 * 1495 * <-------------- *tio->len_ptr ---------------> 1496 * <------- bi_size -------> 1497 * <-- n_sectors --> 1498 * 1499 * Region 1 was already iterated over with bio_advance or similar function. 1500 * (it may be empty if the target doesn't use bio_advance) 1501 * Region 2 is the remaining bio size that the target wants to process. 1502 * (it may be empty if region 1 is non-empty, although there is no reason 1503 * to make it empty) 1504 * The target requires that region 3 is to be sent in the next bio. 1505 * 1506 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1507 * the partially processed part (the sum of regions 1+2) must be the same for all 1508 * copies of the bio. 1509 */ 1510 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1511 { 1512 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1513 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1514 BUG_ON(bio->bi_rw & REQ_FLUSH); 1515 BUG_ON(bi_size > *tio->len_ptr); 1516 BUG_ON(n_sectors > bi_size); 1517 *tio->len_ptr -= bi_size - n_sectors; 1518 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1519 } 1520 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1521 1522 static void __map_bio(struct dm_target_io *tio) 1523 { 1524 int r; 1525 sector_t sector; 1526 struct mapped_device *md; 1527 struct bio *clone = &tio->clone; 1528 struct dm_target *ti = tio->ti; 1529 1530 clone->bi_end_io = clone_endio; 1531 1532 /* 1533 * Map the clone. If r == 0 we don't need to do 1534 * anything, the target has assumed ownership of 1535 * this io. 1536 */ 1537 atomic_inc(&tio->io->io_count); 1538 sector = clone->bi_iter.bi_sector; 1539 r = ti->type->map(ti, clone); 1540 if (r == DM_MAPIO_REMAPPED) { 1541 /* the bio has been remapped so dispatch it */ 1542 1543 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1544 tio->io->bio->bi_bdev->bd_dev, sector); 1545 1546 generic_make_request(clone); 1547 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1548 /* error the io and bail out, or requeue it if needed */ 1549 md = tio->io->md; 1550 dec_pending(tio->io, r); 1551 free_tio(md, tio); 1552 } else if (r != DM_MAPIO_SUBMITTED) { 1553 DMWARN("unimplemented target map return value: %d", r); 1554 BUG(); 1555 } 1556 } 1557 1558 struct clone_info { 1559 struct mapped_device *md; 1560 struct dm_table *map; 1561 struct bio *bio; 1562 struct dm_io *io; 1563 sector_t sector; 1564 unsigned sector_count; 1565 }; 1566 1567 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1568 { 1569 bio->bi_iter.bi_sector = sector; 1570 bio->bi_iter.bi_size = to_bytes(len); 1571 } 1572 1573 /* 1574 * Creates a bio that consists of range of complete bvecs. 
1575 */ 1576 static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1577 sector_t sector, unsigned len) 1578 { 1579 struct bio *clone = &tio->clone; 1580 1581 __bio_clone_fast(clone, bio); 1582 1583 if (bio_integrity(bio)) 1584 bio_integrity_clone(clone, bio, GFP_NOIO); 1585 1586 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1587 clone->bi_iter.bi_size = to_bytes(len); 1588 1589 if (bio_integrity(bio)) 1590 bio_integrity_trim(clone, 0, len); 1591 } 1592 1593 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1594 struct dm_target *ti, 1595 unsigned target_bio_nr) 1596 { 1597 struct dm_target_io *tio; 1598 struct bio *clone; 1599 1600 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1601 tio = container_of(clone, struct dm_target_io, clone); 1602 1603 tio->io = ci->io; 1604 tio->ti = ti; 1605 tio->target_bio_nr = target_bio_nr; 1606 1607 return tio; 1608 } 1609 1610 static void __clone_and_map_simple_bio(struct clone_info *ci, 1611 struct dm_target *ti, 1612 unsigned target_bio_nr, unsigned *len) 1613 { 1614 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); 1615 struct bio *clone = &tio->clone; 1616 1617 tio->len_ptr = len; 1618 1619 __bio_clone_fast(clone, ci->bio); 1620 if (len) 1621 bio_setup_sector(clone, ci->sector, *len); 1622 1623 __map_bio(tio); 1624 } 1625 1626 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1627 unsigned num_bios, unsigned *len) 1628 { 1629 unsigned target_bio_nr; 1630 1631 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1632 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1633 } 1634 1635 static int __send_empty_flush(struct clone_info *ci) 1636 { 1637 unsigned target_nr = 0; 1638 struct dm_target *ti; 1639 1640 BUG_ON(bio_has_data(ci->bio)); 1641 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1642 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1643 1644 return 0; 1645 } 1646 1647 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1648 sector_t sector, unsigned *len) 1649 { 1650 struct bio *bio = ci->bio; 1651 struct dm_target_io *tio; 1652 unsigned target_bio_nr; 1653 unsigned num_target_bios = 1; 1654 1655 /* 1656 * Does the target want to receive duplicate copies of the bio? 
1657 */ 1658 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1659 num_target_bios = ti->num_write_bios(ti, bio); 1660 1661 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1662 tio = alloc_tio(ci, ti, target_bio_nr); 1663 tio->len_ptr = len; 1664 clone_bio(tio, bio, sector, *len); 1665 __map_bio(tio); 1666 } 1667 } 1668 1669 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1670 1671 static unsigned get_num_discard_bios(struct dm_target *ti) 1672 { 1673 return ti->num_discard_bios; 1674 } 1675 1676 static unsigned get_num_write_same_bios(struct dm_target *ti) 1677 { 1678 return ti->num_write_same_bios; 1679 } 1680 1681 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1682 1683 static bool is_split_required_for_discard(struct dm_target *ti) 1684 { 1685 return ti->split_discard_bios; 1686 } 1687 1688 static int __send_changing_extent_only(struct clone_info *ci, 1689 get_num_bios_fn get_num_bios, 1690 is_split_required_fn is_split_required) 1691 { 1692 struct dm_target *ti; 1693 unsigned len; 1694 unsigned num_bios; 1695 1696 do { 1697 ti = dm_table_find_target(ci->map, ci->sector); 1698 if (!dm_target_is_valid(ti)) 1699 return -EIO; 1700 1701 /* 1702 * Even though the device advertised support for this type of 1703 * request, that does not mean every target supports it, and 1704 * reconfiguration might also have changed that since the 1705 * check was performed. 1706 */ 1707 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1708 if (!num_bios) 1709 return -EOPNOTSUPP; 1710 1711 if (is_split_required && !is_split_required(ti)) 1712 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1713 else 1714 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); 1715 1716 __send_duplicate_bios(ci, ti, num_bios, &len); 1717 1718 ci->sector += len; 1719 } while (ci->sector_count -= len); 1720 1721 return 0; 1722 } 1723 1724 static int __send_discard(struct clone_info *ci) 1725 { 1726 return __send_changing_extent_only(ci, get_num_discard_bios, 1727 is_split_required_for_discard); 1728 } 1729 1730 static int __send_write_same(struct clone_info *ci) 1731 { 1732 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1733 } 1734 1735 /* 1736 * Select the correct strategy for processing a non-flush bio. 1737 */ 1738 static int __split_and_process_non_flush(struct clone_info *ci) 1739 { 1740 struct bio *bio = ci->bio; 1741 struct dm_target *ti; 1742 unsigned len; 1743 1744 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1745 return __send_discard(ci); 1746 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) 1747 return __send_write_same(ci); 1748 1749 ti = dm_table_find_target(ci->map, ci->sector); 1750 if (!dm_target_is_valid(ti)) 1751 return -EIO; 1752 1753 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); 1754 1755 __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1756 1757 ci->sector += len; 1758 ci->sector_count -= len; 1759 1760 return 0; 1761 } 1762 1763 /* 1764 * Entry point to split a bio into clones and submit them to the targets. 
1765 */ 1766 static void __split_and_process_bio(struct mapped_device *md, 1767 struct dm_table *map, struct bio *bio) 1768 { 1769 struct clone_info ci; 1770 int error = 0; 1771 1772 if (unlikely(!map)) { 1773 bio_io_error(bio); 1774 return; 1775 } 1776 1777 ci.map = map; 1778 ci.md = md; 1779 ci.io = alloc_io(md); 1780 ci.io->error = 0; 1781 atomic_set(&ci.io->io_count, 1); 1782 ci.io->bio = bio; 1783 ci.io->md = md; 1784 spin_lock_init(&ci.io->endio_lock); 1785 ci.sector = bio->bi_iter.bi_sector; 1786 1787 start_io_acct(ci.io); 1788 1789 if (bio->bi_rw & REQ_FLUSH) { 1790 ci.bio = &ci.md->flush_bio; 1791 ci.sector_count = 0; 1792 error = __send_empty_flush(&ci); 1793 /* dec_pending submits any data associated with flush */ 1794 } else { 1795 ci.bio = bio; 1796 ci.sector_count = bio_sectors(bio); 1797 while (ci.sector_count && !error) 1798 error = __split_and_process_non_flush(&ci); 1799 } 1800 1801 /* drop the extra reference count */ 1802 dec_pending(ci.io, error); 1803 } 1804 /*----------------------------------------------------------------- 1805 * CRUD END 1806 *---------------------------------------------------------------*/ 1807 1808 /* 1809 * The request function that just remaps the bio built up by 1810 * dm_merge_bvec. 1811 */ 1812 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) 1813 { 1814 int rw = bio_data_dir(bio); 1815 struct mapped_device *md = q->queuedata; 1816 int srcu_idx; 1817 struct dm_table *map; 1818 1819 map = dm_get_live_table(md, &srcu_idx); 1820 1821 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); 1822 1823 /* if we're suspended, we have to queue this io for later */ 1824 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1825 dm_put_live_table(md, srcu_idx); 1826 1827 if (bio_rw(bio) != READA) 1828 queue_io(md, bio); 1829 else 1830 bio_io_error(bio); 1831 return BLK_QC_T_NONE; 1832 } 1833 1834 __split_and_process_bio(md, map, bio); 1835 dm_put_live_table(md, srcu_idx); 1836 return BLK_QC_T_NONE; 1837 } 1838 1839 int dm_request_based(struct mapped_device *md) 1840 { 1841 return blk_queue_stackable(md->queue); 1842 } 1843 1844 static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 1845 { 1846 int r; 1847 1848 if (blk_queue_io_stat(clone->q)) 1849 clone->cmd_flags |= REQ_IO_STAT; 1850 1851 clone->start_time = jiffies; 1852 r = blk_insert_cloned_request(clone->q, clone); 1853 if (r) 1854 /* must complete clone in terms of original request */ 1855 dm_complete_request(rq, r); 1856 } 1857 1858 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1859 void *data) 1860 { 1861 struct dm_rq_target_io *tio = data; 1862 struct dm_rq_clone_bio_info *info = 1863 container_of(bio, struct dm_rq_clone_bio_info, clone); 1864 1865 info->orig = bio_orig; 1866 info->tio = tio; 1867 bio->bi_end_io = end_clone_bio; 1868 1869 return 0; 1870 } 1871 1872 static int setup_clone(struct request *clone, struct request *rq, 1873 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1874 { 1875 int r; 1876 1877 r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, 1878 dm_rq_bio_constructor, tio); 1879 if (r) 1880 return r; 1881 1882 clone->cmd = rq->cmd; 1883 clone->cmd_len = rq->cmd_len; 1884 clone->sense = rq->sense; 1885 clone->end_io = end_clone_request; 1886 clone->end_io_data = tio; 1887 1888 tio->clone = clone; 1889 1890 return 0; 1891 } 1892 1893 static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, 1894 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1895 { 1896 /* 
1897 * Create clone for use with .request_fn request_queue 1898 */ 1899 struct request *clone; 1900 1901 clone = alloc_old_clone_request(md, gfp_mask); 1902 if (!clone) 1903 return NULL; 1904 1905 blk_rq_init(NULL, clone); 1906 if (setup_clone(clone, rq, tio, gfp_mask)) { 1907 /* -ENOMEM */ 1908 free_old_clone_request(md, clone); 1909 return NULL; 1910 } 1911 1912 return clone; 1913 } 1914 1915 static void map_tio_request(struct kthread_work *work); 1916 1917 static void init_tio(struct dm_rq_target_io *tio, struct request *rq, 1918 struct mapped_device *md) 1919 { 1920 tio->md = md; 1921 tio->ti = NULL; 1922 tio->clone = NULL; 1923 tio->orig = rq; 1924 tio->error = 0; 1925 /* 1926 * Avoid initializing info for blk-mq; it passes 1927 * target-specific data through info.ptr 1928 * (see: dm_mq_init_request) 1929 */ 1930 if (!md->init_tio_pdu) 1931 memset(&tio->info, 0, sizeof(tio->info)); 1932 if (md->kworker_task) 1933 init_kthread_work(&tio->work, map_tio_request); 1934 } 1935 1936 static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, 1937 struct mapped_device *md, 1938 gfp_t gfp_mask) 1939 { 1940 struct dm_rq_target_io *tio; 1941 int srcu_idx; 1942 struct dm_table *table; 1943 1944 tio = alloc_old_rq_tio(md, gfp_mask); 1945 if (!tio) 1946 return NULL; 1947 1948 init_tio(tio, rq, md); 1949 1950 table = dm_get_live_table(md, &srcu_idx); 1951 /* 1952 * Must clone a request if this .request_fn DM device 1953 * is stacked on .request_fn device(s). 1954 */ 1955 if (!dm_table_mq_request_based(table)) { 1956 if (!clone_old_rq(rq, md, tio, gfp_mask)) { 1957 dm_put_live_table(md, srcu_idx); 1958 free_old_rq_tio(tio); 1959 return NULL; 1960 } 1961 } 1962 dm_put_live_table(md, srcu_idx); 1963 1964 return tio; 1965 } 1966 1967 /* 1968 * Called with the queue lock held. 
1969 */ 1970 static int dm_old_prep_fn(struct request_queue *q, struct request *rq) 1971 { 1972 struct mapped_device *md = q->queuedata; 1973 struct dm_rq_target_io *tio; 1974 1975 if (unlikely(rq->special)) { 1976 DMWARN("Already has something in rq->special."); 1977 return BLKPREP_KILL; 1978 } 1979 1980 tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); 1981 if (!tio) 1982 return BLKPREP_DEFER; 1983 1984 rq->special = tio; 1985 rq->cmd_flags |= REQ_DONTPREP; 1986 1987 return BLKPREP_OK; 1988 } 1989 1990 /* 1991 * Returns: 1992 * 0 : the request has been processed 1993 * DM_MAPIO_REQUEUE : the original request needs to be requeued 1994 * < 0 : the request was completed due to failure 1995 */ 1996 static int map_request(struct dm_rq_target_io *tio, struct request *rq, 1997 struct mapped_device *md) 1998 { 1999 int r; 2000 struct dm_target *ti = tio->ti; 2001 struct request *clone = NULL; 2002 2003 if (tio->clone) { 2004 clone = tio->clone; 2005 r = ti->type->map_rq(ti, clone, &tio->info); 2006 } else { 2007 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 2008 if (r < 0) { 2009 /* The target wants to complete the I/O */ 2010 dm_kill_unmapped_request(rq, r); 2011 return r; 2012 } 2013 if (r != DM_MAPIO_REMAPPED) 2014 return r; 2015 if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { 2016 /* -ENOMEM */ 2017 ti->type->release_clone_rq(clone); 2018 return DM_MAPIO_REQUEUE; 2019 } 2020 } 2021 2022 switch (r) { 2023 case DM_MAPIO_SUBMITTED: 2024 /* The target has taken the I/O to submit by itself later */ 2025 break; 2026 case DM_MAPIO_REMAPPED: 2027 /* The target has remapped the I/O so dispatch it */ 2028 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 2029 blk_rq_pos(rq)); 2030 dm_dispatch_clone_request(clone, rq); 2031 break; 2032 case DM_MAPIO_REQUEUE: 2033 /* The target wants to requeue the I/O */ 2034 dm_requeue_original_request(md, tio->orig); 2035 break; 2036 default: 2037 if (r > 0) { 2038 DMWARN("unimplemented target map return value: %d", r); 2039 BUG(); 2040 } 2041 2042 /* The target wants to complete the I/O */ 2043 dm_kill_unmapped_request(rq, r); 2044 return r; 2045 } 2046 2047 return 0; 2048 } 2049 2050 static void map_tio_request(struct kthread_work *work) 2051 { 2052 struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 2053 struct request *rq = tio->orig; 2054 struct mapped_device *md = tio->md; 2055 2056 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) 2057 dm_requeue_original_request(md, rq); 2058 } 2059 2060 static void dm_start_request(struct mapped_device *md, struct request *orig) 2061 { 2062 if (!orig->q->mq_ops) 2063 blk_start_request(orig); 2064 else 2065 blk_mq_start_request(orig); 2066 atomic_inc(&md->pending[rq_data_dir(orig)]); 2067 2068 if (md->seq_rq_merge_deadline_usecs) { 2069 md->last_rq_pos = rq_end_sector(orig); 2070 md->last_rq_rw = rq_data_dir(orig); 2071 md->last_rq_start_time = ktime_get(); 2072 } 2073 2074 if (unlikely(dm_stats_used(&md->stats))) { 2075 struct dm_rq_target_io *tio = tio_from_request(orig); 2076 tio->duration_jiffies = jiffies; 2077 tio->n_sectors = blk_rq_sectors(orig); 2078 dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig), 2079 tio->n_sectors, false, 0, &tio->stats_aux); 2080 } 2081 2082 /* 2083 * Hold the md reference here for the in-flight I/O. 2084 * We can't rely on the reference count by device opener, 2085 * because the device may be closed during the request completion 2086 * when all bios are completed. 2087 * See the comment in rq_completed() too. 
2088 */ 2089 dm_get(md); 2090 } 2091 2092 #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 2093 2094 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) 2095 { 2096 return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); 2097 } 2098 2099 ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, 2100 const char *buf, size_t count) 2101 { 2102 unsigned deadline; 2103 2104 if (!dm_request_based(md) || md->use_blk_mq) 2105 return count; 2106 2107 if (kstrtouint(buf, 10, &deadline)) 2108 return -EINVAL; 2109 2110 if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) 2111 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; 2112 2113 md->seq_rq_merge_deadline_usecs = deadline; 2114 2115 return count; 2116 } 2117 2118 static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) 2119 { 2120 ktime_t kt_deadline; 2121 2122 if (!md->seq_rq_merge_deadline_usecs) 2123 return false; 2124 2125 kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); 2126 kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); 2127 2128 return !ktime_after(ktime_get(), kt_deadline); 2129 } 2130 2131 /* 2132 * q->request_fn for request-based dm. 2133 * Called with the queue lock held. 2134 */ 2135 static void dm_request_fn(struct request_queue *q) 2136 { 2137 struct mapped_device *md = q->queuedata; 2138 struct dm_target *ti = md->immutable_target; 2139 struct request *rq; 2140 struct dm_rq_target_io *tio; 2141 sector_t pos = 0; 2142 2143 if (unlikely(!ti)) { 2144 int srcu_idx; 2145 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 2146 2147 ti = dm_table_find_target(map, pos); 2148 dm_put_live_table(md, srcu_idx); 2149 } 2150 2151 /* 2152 * For suspend, check blk_queue_stopped() and increment 2153 * ->pending within a single queue_lock not to increment the 2154 * number of in-flight I/Os after the queue is stopped in 2155 * dm_suspend(). 2156 */ 2157 while (!blk_queue_stopped(q)) { 2158 rq = blk_peek_request(q); 2159 if (!rq) 2160 return; 2161 2162 /* always use block 0 to find the target for flushes for now */ 2163 pos = 0; 2164 if (!(rq->cmd_flags & REQ_FLUSH)) 2165 pos = blk_rq_pos(rq); 2166 2167 if ((dm_request_peeked_before_merge_deadline(md) && 2168 md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && 2169 md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || 2170 (ti->type->busy && ti->type->busy(ti))) { 2171 blk_delay_queue(q, HZ / 100); 2172 return; 2173 } 2174 2175 dm_start_request(md, rq); 2176 2177 tio = tio_from_request(rq); 2178 /* Establish tio->ti before queuing work (map_tio_request) */ 2179 tio->ti = ti; 2180 queue_kthread_work(&md->kworker, &tio->work); 2181 BUG_ON(!irqs_disabled()); 2182 } 2183 } 2184 2185 static int dm_any_congested(void *congested_data, int bdi_bits) 2186 { 2187 int r = bdi_bits; 2188 struct mapped_device *md = congested_data; 2189 struct dm_table *map; 2190 2191 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2192 if (dm_request_based(md)) { 2193 /* 2194 * With request-based DM we only need to check the 2195 * top-level queue for congestion. 2196 */ 2197 r = md->queue->backing_dev_info.wb.state & bdi_bits; 2198 } else { 2199 map = dm_get_live_table_fast(md); 2200 if (map) 2201 r = dm_table_any_congested(map, bdi_bits); 2202 dm_put_live_table_fast(md); 2203 } 2204 } 2205 2206 return r; 2207 } 2208 2209 /*----------------------------------------------------------------- 2210 * An IDR is used to keep track of allocated minor numbers. 
2211 *---------------------------------------------------------------*/ 2212 static void free_minor(int minor) 2213 { 2214 spin_lock(&_minor_lock); 2215 idr_remove(&_minor_idr, minor); 2216 spin_unlock(&_minor_lock); 2217 } 2218 2219 /* 2220 * See if the device with a specific minor # is free. 2221 */ 2222 static int specific_minor(int minor) 2223 { 2224 int r; 2225 2226 if (minor >= (1 << MINORBITS)) 2227 return -EINVAL; 2228 2229 idr_preload(GFP_KERNEL); 2230 spin_lock(&_minor_lock); 2231 2232 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 2233 2234 spin_unlock(&_minor_lock); 2235 idr_preload_end(); 2236 if (r < 0) 2237 return r == -ENOSPC ? -EBUSY : r; 2238 return 0; 2239 } 2240 2241 static int next_free_minor(int *minor) 2242 { 2243 int r; 2244 2245 idr_preload(GFP_KERNEL); 2246 spin_lock(&_minor_lock); 2247 2248 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 2249 2250 spin_unlock(&_minor_lock); 2251 idr_preload_end(); 2252 if (r < 0) 2253 return r; 2254 *minor = r; 2255 return 0; 2256 } 2257 2258 static const struct block_device_operations dm_blk_dops; 2259 2260 static void dm_wq_work(struct work_struct *work); 2261 2262 static void dm_init_md_queue(struct mapped_device *md) 2263 { 2264 /* 2265 * Request-based dm devices cannot be stacked on top of bio-based dm 2266 * devices. The type of this dm device may not have been decided yet. 2267 * The type is decided at the first table loading time. 2268 * To prevent problematic device stacking, clear the queue flag 2269 * for request stacking support until then. 2270 * 2271 * This queue is new, so no concurrency on the queue_flags. 2272 */ 2273 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2274 2275 /* 2276 * Initialize data that will only be used by a non-blk-mq DM queue 2277 * - must do so here (in alloc_dev callchain) before queue is used 2278 */ 2279 md->queue->queuedata = md; 2280 md->queue->backing_dev_info.congested_data = md; 2281 } 2282 2283 static void dm_init_normal_md_queue(struct mapped_device *md) 2284 { 2285 md->use_blk_mq = false; 2286 dm_init_md_queue(md); 2287 2288 /* 2289 * Initialize aspects of queue that aren't relevant for blk-mq 2290 */ 2291 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2292 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2293 } 2294 2295 static void cleanup_mapped_device(struct mapped_device *md) 2296 { 2297 if (md->wq) 2298 destroy_workqueue(md->wq); 2299 if (md->kworker_task) 2300 kthread_stop(md->kworker_task); 2301 mempool_destroy(md->io_pool); 2302 mempool_destroy(md->rq_pool); 2303 if (md->bs) 2304 bioset_free(md->bs); 2305 2306 cleanup_srcu_struct(&md->io_barrier); 2307 2308 if (md->disk) { 2309 spin_lock(&_minor_lock); 2310 md->disk->private_data = NULL; 2311 spin_unlock(&_minor_lock); 2312 del_gendisk(md->disk); 2313 put_disk(md->disk); 2314 } 2315 2316 if (md->queue) 2317 blk_cleanup_queue(md->queue); 2318 2319 if (md->bdev) { 2320 bdput(md->bdev); 2321 md->bdev = NULL; 2322 } 2323 } 2324 2325 /* 2326 * Allocate and initialise a blank device with a given minor. 
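 *
 * On failure the partially constructed device is torn down via the bad*
 * labels below, in reverse order of construction, and NULL is returned.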
2327 */ 2328 static struct mapped_device *alloc_dev(int minor) 2329 { 2330 int r, numa_node_id = dm_get_numa_node(); 2331 struct mapped_device *md; 2332 void *old_md; 2333 2334 md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); 2335 if (!md) { 2336 DMWARN("unable to allocate device, out of memory."); 2337 return NULL; 2338 } 2339 2340 if (!try_module_get(THIS_MODULE)) 2341 goto bad_module_get; 2342 2343 /* get a minor number for the dev */ 2344 if (minor == DM_ANY_MINOR) 2345 r = next_free_minor(&minor); 2346 else 2347 r = specific_minor(minor); 2348 if (r < 0) 2349 goto bad_minor; 2350 2351 r = init_srcu_struct(&md->io_barrier); 2352 if (r < 0) 2353 goto bad_io_barrier; 2354 2355 md->numa_node_id = numa_node_id; 2356 md->use_blk_mq = use_blk_mq; 2357 md->init_tio_pdu = false; 2358 md->type = DM_TYPE_NONE; 2359 mutex_init(&md->suspend_lock); 2360 mutex_init(&md->type_lock); 2361 mutex_init(&md->table_devices_lock); 2362 spin_lock_init(&md->deferred_lock); 2363 atomic_set(&md->holders, 1); 2364 atomic_set(&md->open_count, 0); 2365 atomic_set(&md->event_nr, 0); 2366 atomic_set(&md->uevent_seq, 0); 2367 INIT_LIST_HEAD(&md->uevent_list); 2368 INIT_LIST_HEAD(&md->table_devices); 2369 spin_lock_init(&md->uevent_lock); 2370 2371 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); 2372 if (!md->queue) 2373 goto bad; 2374 2375 dm_init_md_queue(md); 2376 2377 md->disk = alloc_disk_node(1, numa_node_id); 2378 if (!md->disk) 2379 goto bad; 2380 2381 atomic_set(&md->pending[0], 0); 2382 atomic_set(&md->pending[1], 0); 2383 init_waitqueue_head(&md->wait); 2384 INIT_WORK(&md->work, dm_wq_work); 2385 init_waitqueue_head(&md->eventq); 2386 init_completion(&md->kobj_holder.completion); 2387 md->kworker_task = NULL; 2388 2389 md->disk->major = _major; 2390 md->disk->first_minor = minor; 2391 md->disk->fops = &dm_blk_dops; 2392 md->disk->queue = md->queue; 2393 md->disk->private_data = md; 2394 sprintf(md->disk->disk_name, "dm-%d", minor); 2395 add_disk(md->disk); 2396 format_dev_t(md->name, MKDEV(_major, minor)); 2397 2398 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2399 if (!md->wq) 2400 goto bad; 2401 2402 md->bdev = bdget_disk(md->disk, 0); 2403 if (!md->bdev) 2404 goto bad; 2405 2406 bio_init(&md->flush_bio); 2407 md->flush_bio.bi_bdev = md->bdev; 2408 md->flush_bio.bi_rw = WRITE_FLUSH; 2409 2410 dm_stats_init(&md->stats); 2411 2412 /* Populate the mapping, nobody knows we exist yet */ 2413 spin_lock(&_minor_lock); 2414 old_md = idr_replace(&_minor_idr, md, minor); 2415 spin_unlock(&_minor_lock); 2416 2417 BUG_ON(old_md != MINOR_ALLOCED); 2418 2419 return md; 2420 2421 bad: 2422 cleanup_mapped_device(md); 2423 bad_io_barrier: 2424 free_minor(minor); 2425 bad_minor: 2426 module_put(THIS_MODULE); 2427 bad_module_get: 2428 kfree(md); 2429 return NULL; 2430 } 2431 2432 static void unlock_fs(struct mapped_device *md); 2433 2434 static void free_dev(struct mapped_device *md) 2435 { 2436 int minor = MINOR(disk_devt(md->disk)); 2437 2438 unlock_fs(md); 2439 2440 cleanup_mapped_device(md); 2441 if (md->tag_set) { 2442 blk_mq_free_tag_set(md->tag_set); 2443 kfree(md->tag_set); 2444 } 2445 2446 free_table_devices(&md->table_devices); 2447 dm_stats_cleanup(&md->stats); 2448 free_minor(minor); 2449 2450 module_put(THIS_MODULE); 2451 kfree(md); 2452 } 2453 2454 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2455 { 2456 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2457 2458 if (md->bs) { 2459 /* The md already has necessary mempools. 
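		 * They are left over from a previous table load, so at most
		 * the bioset may need refreshing below.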
		 */
		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
			/*
			 * Reload the bioset because front_pad may have
			 * changed when a different table was loaded.
			 */
			bioset_free(md->bs);
			md->bs = p->bs;
			p->bs = NULL;
		}
		/*
		 * There's no need to reload with request-based dm because
		 * the size of front_pad doesn't change.
		 *
		 * Note for the future: if the bioset ever does need to be
		 * reloaded here, prepped requests in the queue may still
		 * refer to bios from the old bioset, so the queue would
		 * have to be walked to unprep them first.
		 */
		goto out;
	}

	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->rq_pool = p->rq_pool;
	p->rq_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed, no longer need any mempools in the table */
	dm_table_free_md_mempools(t);
}

/*
 * Table event callback: forward any queued uevents and wake up
 * anyone waiting on an event.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}

/*
 * Bind a table to the device.
 * Returns the old map, which the caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	sector_t size;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	dm_table_event_callback(t, event_callback, md);

	/*
	 * If the old table wasn't request-based, the queue hasn't been
	 * stopped during suspension, so stop it here to prevent I/O from
	 * being mapped before resume.  This must be done before setting
	 * the queue restrictions, because request-based dm may start
	 * running as soon as they are set.
	 */
	if (dm_table_request_based(t)) {
		dm_stop_queue(q);
		/*
		 * Leverage the fact that request-based DM targets are
		 * immutable singletons and establish md->immutable_target
		 * - used to optimize both dm_request_fn and dm_mq_queue_rq
		 */
		md->immutable_target = dm_table_get_immutable_target(t);
	}

	__bind_mempools(md, t);

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	dm_table_set_restrictions(t, q, limits);
	if (old_map)
		dm_sync_table(md);

	return old_map;
}

/*
 * Returns the unbound table for the caller to free.
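 * (Callers typically pass the result straight to dm_table_destroy(), as
 * __dm_destroy() does.)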
2577 */ 2578 static struct dm_table *__unbind(struct mapped_device *md) 2579 { 2580 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2581 2582 if (!map) 2583 return NULL; 2584 2585 dm_table_event_callback(map, NULL, NULL); 2586 RCU_INIT_POINTER(md->map, NULL); 2587 dm_sync_table(md); 2588 2589 return map; 2590 } 2591 2592 /* 2593 * Constructor for a new device. 2594 */ 2595 int dm_create(int minor, struct mapped_device **result) 2596 { 2597 struct mapped_device *md; 2598 2599 md = alloc_dev(minor); 2600 if (!md) 2601 return -ENXIO; 2602 2603 dm_sysfs_init(md); 2604 2605 *result = md; 2606 return 0; 2607 } 2608 2609 /* 2610 * Functions to manage md->type. 2611 * All are required to hold md->type_lock. 2612 */ 2613 void dm_lock_md_type(struct mapped_device *md) 2614 { 2615 mutex_lock(&md->type_lock); 2616 } 2617 2618 void dm_unlock_md_type(struct mapped_device *md) 2619 { 2620 mutex_unlock(&md->type_lock); 2621 } 2622 2623 void dm_set_md_type(struct mapped_device *md, unsigned type) 2624 { 2625 BUG_ON(!mutex_is_locked(&md->type_lock)); 2626 md->type = type; 2627 } 2628 2629 unsigned dm_get_md_type(struct mapped_device *md) 2630 { 2631 return md->type; 2632 } 2633 2634 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2635 { 2636 return md->immutable_target_type; 2637 } 2638 2639 /* 2640 * The queue_limits are only valid as long as you have a reference 2641 * count on 'md'. 2642 */ 2643 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2644 { 2645 BUG_ON(!atomic_read(&md->holders)); 2646 return &md->queue->limits; 2647 } 2648 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2649 2650 static void dm_old_init_rq_based_worker_thread(struct mapped_device *md) 2651 { 2652 /* Initialize the request-based DM worker thread */ 2653 init_kthread_worker(&md->kworker); 2654 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, 2655 "kdmwork-%s", dm_device_name(md)); 2656 } 2657 2658 /* 2659 * Fully initialize a .request_fn request-based queue. 2660 */ 2661 static int dm_old_init_request_queue(struct mapped_device *md) 2662 { 2663 struct request_queue *q = NULL; 2664 2665 /* Fully initialize the queue */ 2666 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2667 if (!q) 2668 return -EINVAL; 2669 2670 /* disable dm_request_fn's merge heuristic by default */ 2671 md->seq_rq_merge_deadline_usecs = 0; 2672 2673 md->queue = q; 2674 dm_init_normal_md_queue(md); 2675 blk_queue_softirq_done(md->queue, dm_softirq_done); 2676 blk_queue_prep_rq(md->queue, dm_old_prep_fn); 2677 2678 dm_old_init_rq_based_worker_thread(md); 2679 2680 elv_register_queue(md->queue); 2681 2682 return 0; 2683 } 2684 2685 static int dm_mq_init_request(void *data, struct request *rq, 2686 unsigned int hctx_idx, unsigned int request_idx, 2687 unsigned int numa_node) 2688 { 2689 struct mapped_device *md = data; 2690 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 2691 2692 /* 2693 * Must initialize md member of tio, otherwise it won't 2694 * be available in dm_mq_queue_rq. 
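	 * (dm_mq_queue_rq() reads tio->md before init_tio() reinitialises
	 * the rest of the tio for each request.)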
2695 */ 2696 tio->md = md; 2697 2698 if (md->init_tio_pdu) { 2699 /* target-specific per-io data is immediately after the tio */ 2700 tio->info.ptr = tio + 1; 2701 } 2702 2703 return 0; 2704 } 2705 2706 static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 2707 const struct blk_mq_queue_data *bd) 2708 { 2709 struct request *rq = bd->rq; 2710 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 2711 struct mapped_device *md = tio->md; 2712 struct dm_target *ti = md->immutable_target; 2713 2714 if (unlikely(!ti)) { 2715 int srcu_idx; 2716 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 2717 2718 ti = dm_table_find_target(map, 0); 2719 dm_put_live_table(md, srcu_idx); 2720 } 2721 2722 if (ti->type->busy && ti->type->busy(ti)) 2723 return BLK_MQ_RQ_QUEUE_BUSY; 2724 2725 dm_start_request(md, rq); 2726 2727 /* Init tio using md established in .init_request */ 2728 init_tio(tio, rq, md); 2729 2730 /* 2731 * Establish tio->ti before queuing work (map_tio_request) 2732 * or making direct call to map_request(). 2733 */ 2734 tio->ti = ti; 2735 2736 /* Direct call is fine since .queue_rq allows allocations */ 2737 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { 2738 /* Undo dm_start_request() before requeuing */ 2739 rq_end_stats(md, rq); 2740 rq_completed(md, rq_data_dir(rq), false); 2741 return BLK_MQ_RQ_QUEUE_BUSY; 2742 } 2743 2744 return BLK_MQ_RQ_QUEUE_OK; 2745 } 2746 2747 static struct blk_mq_ops dm_mq_ops = { 2748 .queue_rq = dm_mq_queue_rq, 2749 .map_queue = blk_mq_map_queue, 2750 .complete = dm_softirq_done, 2751 .init_request = dm_mq_init_request, 2752 }; 2753 2754 static int dm_mq_init_request_queue(struct mapped_device *md, 2755 struct dm_target *immutable_tgt) 2756 { 2757 struct request_queue *q; 2758 int err; 2759 2760 if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) { 2761 DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); 2762 return -EINVAL; 2763 } 2764 2765 md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); 2766 if (!md->tag_set) 2767 return -ENOMEM; 2768 2769 md->tag_set->ops = &dm_mq_ops; 2770 md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); 2771 md->tag_set->numa_node = md->numa_node_id; 2772 md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 2773 md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); 2774 md->tag_set->driver_data = md; 2775 2776 md->tag_set->cmd_size = sizeof(struct dm_rq_target_io); 2777 if (immutable_tgt && immutable_tgt->per_io_data_size) { 2778 /* any target-specific per-io data is immediately after the tio */ 2779 md->tag_set->cmd_size += immutable_tgt->per_io_data_size; 2780 md->init_tio_pdu = true; 2781 } 2782 2783 err = blk_mq_alloc_tag_set(md->tag_set); 2784 if (err) 2785 goto out_kfree_tag_set; 2786 2787 q = blk_mq_init_allocated_queue(md->tag_set, md->queue); 2788 if (IS_ERR(q)) { 2789 err = PTR_ERR(q); 2790 goto out_tag_set; 2791 } 2792 md->queue = q; 2793 dm_init_md_queue(md); 2794 2795 /* backfill 'mq' sysfs registration normally done in blk_register_queue */ 2796 blk_mq_register_disk(md->disk); 2797 2798 return 0; 2799 2800 out_tag_set: 2801 blk_mq_free_tag_set(md->tag_set); 2802 out_kfree_tag_set: 2803 kfree(md->tag_set); 2804 2805 return err; 2806 } 2807 2808 static unsigned filter_md_type(unsigned type, struct mapped_device *md) 2809 { 2810 if (type == DM_TYPE_BIO_BASED) 2811 return type; 2812 2813 return !md->use_blk_mq ? 
DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED; 2814 } 2815 2816 /* 2817 * Setup the DM device's queue based on md's type 2818 */ 2819 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) 2820 { 2821 int r; 2822 unsigned md_type = filter_md_type(dm_get_md_type(md), md); 2823 2824 switch (md_type) { 2825 case DM_TYPE_REQUEST_BASED: 2826 r = dm_old_init_request_queue(md); 2827 if (r) { 2828 DMERR("Cannot initialize queue for request-based mapped device"); 2829 return r; 2830 } 2831 break; 2832 case DM_TYPE_MQ_REQUEST_BASED: 2833 r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t)); 2834 if (r) { 2835 DMERR("Cannot initialize queue for request-based dm-mq mapped device"); 2836 return r; 2837 } 2838 break; 2839 case DM_TYPE_BIO_BASED: 2840 dm_init_normal_md_queue(md); 2841 blk_queue_make_request(md->queue, dm_make_request); 2842 /* 2843 * DM handles splitting bios as needed. Free the bio_split bioset 2844 * since it won't be used (saves 1 process per bio-based DM device). 2845 */ 2846 bioset_free(md->queue->bio_split); 2847 md->queue->bio_split = NULL; 2848 break; 2849 } 2850 2851 return 0; 2852 } 2853 2854 struct mapped_device *dm_get_md(dev_t dev) 2855 { 2856 struct mapped_device *md; 2857 unsigned minor = MINOR(dev); 2858 2859 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2860 return NULL; 2861 2862 spin_lock(&_minor_lock); 2863 2864 md = idr_find(&_minor_idr, minor); 2865 if (md) { 2866 if ((md == MINOR_ALLOCED || 2867 (MINOR(disk_devt(dm_disk(md))) != minor) || 2868 dm_deleting_md(md) || 2869 test_bit(DMF_FREEING, &md->flags))) { 2870 md = NULL; 2871 goto out; 2872 } 2873 dm_get(md); 2874 } 2875 2876 out: 2877 spin_unlock(&_minor_lock); 2878 2879 return md; 2880 } 2881 EXPORT_SYMBOL_GPL(dm_get_md); 2882 2883 void *dm_get_mdptr(struct mapped_device *md) 2884 { 2885 return md->interface_ptr; 2886 } 2887 2888 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2889 { 2890 md->interface_ptr = ptr; 2891 } 2892 2893 void dm_get(struct mapped_device *md) 2894 { 2895 atomic_inc(&md->holders); 2896 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2897 } 2898 2899 int dm_hold(struct mapped_device *md) 2900 { 2901 spin_lock(&_minor_lock); 2902 if (test_bit(DMF_FREEING, &md->flags)) { 2903 spin_unlock(&_minor_lock); 2904 return -EBUSY; 2905 } 2906 dm_get(md); 2907 spin_unlock(&_minor_lock); 2908 return 0; 2909 } 2910 EXPORT_SYMBOL_GPL(dm_hold); 2911 2912 const char *dm_device_name(struct mapped_device *md) 2913 { 2914 return md->name; 2915 } 2916 EXPORT_SYMBOL_GPL(dm_device_name); 2917 2918 static void __dm_destroy(struct mapped_device *md, bool wait) 2919 { 2920 struct dm_table *map; 2921 int srcu_idx; 2922 2923 might_sleep(); 2924 2925 spin_lock(&_minor_lock); 2926 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2927 set_bit(DMF_FREEING, &md->flags); 2928 spin_unlock(&_minor_lock); 2929 2930 if (dm_request_based(md) && md->kworker_task) 2931 flush_kthread_worker(&md->kworker); 2932 2933 /* 2934 * Take suspend_lock so that presuspend and postsuspend methods 2935 * do not race with internal suspend. 
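	 * (Internal suspend invokes those same hooks while holding
	 * suspend_lock, so taking it here serialises the two paths.)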
2936 */ 2937 mutex_lock(&md->suspend_lock); 2938 map = dm_get_live_table(md, &srcu_idx); 2939 if (!dm_suspended_md(md)) { 2940 dm_table_presuspend_targets(map); 2941 dm_table_postsuspend_targets(map); 2942 } 2943 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2944 dm_put_live_table(md, srcu_idx); 2945 mutex_unlock(&md->suspend_lock); 2946 2947 /* 2948 * Rare, but there may be I/O requests still going to complete, 2949 * for example. Wait for all references to disappear. 2950 * No one should increment the reference count of the mapped_device, 2951 * after the mapped_device state becomes DMF_FREEING. 2952 */ 2953 if (wait) 2954 while (atomic_read(&md->holders)) 2955 msleep(1); 2956 else if (atomic_read(&md->holders)) 2957 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2958 dm_device_name(md), atomic_read(&md->holders)); 2959 2960 dm_sysfs_exit(md); 2961 dm_table_destroy(__unbind(md)); 2962 free_dev(md); 2963 } 2964 2965 void dm_destroy(struct mapped_device *md) 2966 { 2967 __dm_destroy(md, true); 2968 } 2969 2970 void dm_destroy_immediate(struct mapped_device *md) 2971 { 2972 __dm_destroy(md, false); 2973 } 2974 2975 void dm_put(struct mapped_device *md) 2976 { 2977 atomic_dec(&md->holders); 2978 } 2979 EXPORT_SYMBOL_GPL(dm_put); 2980 2981 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2982 { 2983 int r = 0; 2984 DECLARE_WAITQUEUE(wait, current); 2985 2986 add_wait_queue(&md->wait, &wait); 2987 2988 while (1) { 2989 set_current_state(interruptible); 2990 2991 if (!md_in_flight(md)) 2992 break; 2993 2994 if (interruptible == TASK_INTERRUPTIBLE && 2995 signal_pending(current)) { 2996 r = -EINTR; 2997 break; 2998 } 2999 3000 io_schedule(); 3001 } 3002 set_current_state(TASK_RUNNING); 3003 3004 remove_wait_queue(&md->wait, &wait); 3005 3006 return r; 3007 } 3008 3009 /* 3010 * Process the deferred bios 3011 */ 3012 static void dm_wq_work(struct work_struct *work) 3013 { 3014 struct mapped_device *md = container_of(work, struct mapped_device, 3015 work); 3016 struct bio *c; 3017 int srcu_idx; 3018 struct dm_table *map; 3019 3020 map = dm_get_live_table(md, &srcu_idx); 3021 3022 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 3023 spin_lock_irq(&md->deferred_lock); 3024 c = bio_list_pop(&md->deferred); 3025 spin_unlock_irq(&md->deferred_lock); 3026 3027 if (!c) 3028 break; 3029 3030 if (dm_request_based(md)) 3031 generic_make_request(c); 3032 else 3033 __split_and_process_bio(md, map, c); 3034 } 3035 3036 dm_put_live_table(md, srcu_idx); 3037 } 3038 3039 static void dm_queue_flush(struct mapped_device *md) 3040 { 3041 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3042 smp_mb__after_atomic(); 3043 queue_work(md->wq, &md->work); 3044 } 3045 3046 /* 3047 * Swap in a new table, returning the old one for the caller to destroy. 3048 */ 3049 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 3050 { 3051 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 3052 struct queue_limits limits; 3053 int r; 3054 3055 mutex_lock(&md->suspend_lock); 3056 3057 /* device must be suspended */ 3058 if (!dm_suspended_md(md)) 3059 goto out; 3060 3061 /* 3062 * If the new table has no data devices, retain the existing limits. 3063 * This helps multipath with queue_if_no_path if all paths disappear, 3064 * then new I/O is queued based on these limits, and then some paths 3065 * reappear. 
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now.  There is no request-processing activity.  All new requests
 * are being added to md->deferred list.
 *
 * Caller must hold md->suspend_lock.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, int interruptible)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers, i.e. no one may be executing
	 * __split_and_process_bio, which is called from dm_make_request
	 * and dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in
	 * dm_make_request, we wait for an SRCU grace period on
	 * md->io_barrier.  To prevent any process from reentering
	 * __split_and_process_bio from dm_make_request, and to quiesce the
	 * deferred-work thread (dm_wq_work), we set
	 * DMF_BLOCK_IO_FOR_SUSPEND and call flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md)) {
		dm_stop_queue(md->queue);
		if (md->kworker_task)
			flush_kthread_worker(&md->kworker);
	}

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
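	 * ("Finish" covers requeued requests too: with a noflush suspend,
	 * outstanding I/O may be parked for the resume path rather than
	 * completed.)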
	 */
	r = dm_wait_for_completion(md, interruptible);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
	if (r)
		goto out_unlock;

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
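	 * Restarting that queue below is what actually re-dispatches them;
	 * bio-based dm re-submits its deferred bios from dm_wq_work once
	 * dm_queue_flush() has kicked md->wq.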
3284 */ 3285 if (dm_request_based(md)) 3286 dm_start_queue(md->queue); 3287 3288 unlock_fs(md); 3289 3290 return 0; 3291 } 3292 3293 int dm_resume(struct mapped_device *md) 3294 { 3295 int r = -EINVAL; 3296 struct dm_table *map = NULL; 3297 3298 retry: 3299 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 3300 3301 if (!dm_suspended_md(md)) 3302 goto out; 3303 3304 if (dm_suspended_internally_md(md)) { 3305 /* already internally suspended, wait for internal resume */ 3306 mutex_unlock(&md->suspend_lock); 3307 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 3308 if (r) 3309 return r; 3310 goto retry; 3311 } 3312 3313 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3314 if (!map || !dm_table_get_size(map)) 3315 goto out; 3316 3317 r = __dm_resume(md, map); 3318 if (r) 3319 goto out; 3320 3321 clear_bit(DMF_SUSPENDED, &md->flags); 3322 3323 r = 0; 3324 out: 3325 mutex_unlock(&md->suspend_lock); 3326 3327 return r; 3328 } 3329 3330 /* 3331 * Internal suspend/resume works like userspace-driven suspend. It waits 3332 * until all bios finish and prevents issuing new bios to the target drivers. 3333 * It may be used only from the kernel. 3334 */ 3335 3336 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 3337 { 3338 struct dm_table *map = NULL; 3339 3340 if (md->internal_suspend_count++) 3341 return; /* nested internal suspend */ 3342 3343 if (dm_suspended_md(md)) { 3344 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3345 return; /* nest suspend */ 3346 } 3347 3348 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3349 3350 /* 3351 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 3352 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 3353 * would require changing .presuspend to return an error -- avoid this 3354 * until there is a need for more elaborate variants of internal suspend. 3355 */ 3356 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); 3357 3358 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3359 3360 dm_table_postsuspend_targets(map); 3361 } 3362 3363 static void __dm_internal_resume(struct mapped_device *md) 3364 { 3365 BUG_ON(!md->internal_suspend_count); 3366 3367 if (--md->internal_suspend_count) 3368 return; /* resume from nested internal suspend */ 3369 3370 if (dm_suspended_md(md)) 3371 goto done; /* resume from nested suspend */ 3372 3373 /* 3374 * NOTE: existing callers don't need to call dm_table_resume_targets 3375 * (which may fail -- so best to avoid it for now by passing NULL map) 3376 */ 3377 (void) __dm_resume(md, NULL); 3378 3379 done: 3380 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3381 smp_mb__after_atomic(); 3382 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 3383 } 3384 3385 void dm_internal_suspend_noflush(struct mapped_device *md) 3386 { 3387 mutex_lock(&md->suspend_lock); 3388 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 3389 mutex_unlock(&md->suspend_lock); 3390 } 3391 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 3392 3393 void dm_internal_resume(struct mapped_device *md) 3394 { 3395 mutex_lock(&md->suspend_lock); 3396 __dm_internal_resume(md); 3397 mutex_unlock(&md->suspend_lock); 3398 } 3399 EXPORT_SYMBOL_GPL(dm_internal_resume); 3400 3401 /* 3402 * Fast variants of internal suspend/resume hold md->suspend_lock, 3403 * which prevents interaction with userspace-driven suspend. 
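 *
 * Note the asymmetry: dm_internal_suspend_fast() takes suspend_lock and
 * returns with it still held (including on its early-exit path); it is
 * dm_internal_resume_fast() that finally releases it.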
3404 */ 3405 3406 void dm_internal_suspend_fast(struct mapped_device *md) 3407 { 3408 mutex_lock(&md->suspend_lock); 3409 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3410 return; 3411 3412 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3413 synchronize_srcu(&md->io_barrier); 3414 flush_workqueue(md->wq); 3415 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3416 } 3417 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 3418 3419 void dm_internal_resume_fast(struct mapped_device *md) 3420 { 3421 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3422 goto done; 3423 3424 dm_queue_flush(md); 3425 3426 done: 3427 mutex_unlock(&md->suspend_lock); 3428 } 3429 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 3430 3431 /*----------------------------------------------------------------- 3432 * Event notification. 3433 *---------------------------------------------------------------*/ 3434 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 3435 unsigned cookie) 3436 { 3437 char udev_cookie[DM_COOKIE_LENGTH]; 3438 char *envp[] = { udev_cookie, NULL }; 3439 3440 if (!cookie) 3441 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 3442 else { 3443 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 3444 DM_COOKIE_ENV_VAR_NAME, cookie); 3445 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 3446 action, envp); 3447 } 3448 } 3449 3450 uint32_t dm_next_uevent_seq(struct mapped_device *md) 3451 { 3452 return atomic_add_return(1, &md->uevent_seq); 3453 } 3454 3455 uint32_t dm_get_event_nr(struct mapped_device *md) 3456 { 3457 return atomic_read(&md->event_nr); 3458 } 3459 3460 int dm_wait_event(struct mapped_device *md, int event_nr) 3461 { 3462 return wait_event_interruptible(md->eventq, 3463 (event_nr != atomic_read(&md->event_nr))); 3464 } 3465 3466 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 3467 { 3468 unsigned long flags; 3469 3470 spin_lock_irqsave(&md->uevent_lock, flags); 3471 list_add(elist, &md->uevent_list); 3472 spin_unlock_irqrestore(&md->uevent_lock, flags); 3473 } 3474 3475 /* 3476 * The gendisk is only valid as long as you have a reference 3477 * count on 'md'. 
3478 */ 3479 struct gendisk *dm_disk(struct mapped_device *md) 3480 { 3481 return md->disk; 3482 } 3483 EXPORT_SYMBOL_GPL(dm_disk); 3484 3485 struct kobject *dm_kobject(struct mapped_device *md) 3486 { 3487 return &md->kobj_holder.kobj; 3488 } 3489 3490 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 3491 { 3492 struct mapped_device *md; 3493 3494 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 3495 3496 if (test_bit(DMF_FREEING, &md->flags) || 3497 dm_deleting_md(md)) 3498 return NULL; 3499 3500 dm_get(md); 3501 return md; 3502 } 3503 3504 int dm_suspended_md(struct mapped_device *md) 3505 { 3506 return test_bit(DMF_SUSPENDED, &md->flags); 3507 } 3508 3509 int dm_suspended_internally_md(struct mapped_device *md) 3510 { 3511 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3512 } 3513 3514 int dm_test_deferred_remove_flag(struct mapped_device *md) 3515 { 3516 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 3517 } 3518 3519 int dm_suspended(struct dm_target *ti) 3520 { 3521 return dm_suspended_md(dm_table_get_md(ti->table)); 3522 } 3523 EXPORT_SYMBOL_GPL(dm_suspended); 3524 3525 int dm_noflush_suspending(struct dm_target *ti) 3526 { 3527 return __noflush_suspending(dm_table_get_md(ti->table)); 3528 } 3529 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 3530 3531 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, 3532 unsigned integrity, unsigned per_io_data_size) 3533 { 3534 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); 3535 struct kmem_cache *cachep = NULL; 3536 unsigned int pool_size = 0; 3537 unsigned int front_pad; 3538 3539 if (!pools) 3540 return NULL; 3541 3542 type = filter_md_type(type, md); 3543 3544 switch (type) { 3545 case DM_TYPE_BIO_BASED: 3546 cachep = _io_cache; 3547 pool_size = dm_get_reserved_bio_based_ios(); 3548 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); 3549 break; 3550 case DM_TYPE_REQUEST_BASED: 3551 cachep = _rq_tio_cache; 3552 pool_size = dm_get_reserved_rq_based_ios(); 3553 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); 3554 if (!pools->rq_pool) 3555 goto out; 3556 /* fall through to setup remaining rq-based pools */ 3557 case DM_TYPE_MQ_REQUEST_BASED: 3558 if (!pool_size) 3559 pool_size = dm_get_reserved_rq_based_ios(); 3560 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 3561 /* per_io_data_size is used for blk-mq pdu at queue allocation */ 3562 break; 3563 default: 3564 BUG(); 3565 } 3566 3567 if (cachep) { 3568 pools->io_pool = mempool_create_slab_pool(pool_size, cachep); 3569 if (!pools->io_pool) 3570 goto out; 3571 } 3572 3573 pools->bs = bioset_create_nobvec(pool_size, front_pad); 3574 if (!pools->bs) 3575 goto out; 3576 3577 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 3578 goto out; 3579 3580 return pools; 3581 3582 out: 3583 dm_free_md_mempools(pools); 3584 3585 return NULL; 3586 } 3587 3588 void dm_free_md_mempools(struct dm_md_mempools *pools) 3589 { 3590 if (!pools) 3591 return; 3592 3593 mempool_destroy(pools->io_pool); 3594 mempool_destroy(pools->rq_pool); 3595 3596 if (pools->bs) 3597 bioset_free(pools->bs); 3598 3599 kfree(pools); 3600 } 3601 3602 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, 3603 u32 flags) 3604 { 3605 struct mapped_device *md = bdev->bd_disk->private_data; 3606 const struct pr_ops *ops; 3607 fmode_t mode; 3608 int r; 3609 3610 r = dm_grab_bdev_for_ioctl(md, &bdev, 
&mode); 3611 if (r < 0) 3612 return r; 3613 3614 ops = bdev->bd_disk->fops->pr_ops; 3615 if (ops && ops->pr_register) 3616 r = ops->pr_register(bdev, old_key, new_key, flags); 3617 else 3618 r = -EOPNOTSUPP; 3619 3620 bdput(bdev); 3621 return r; 3622 } 3623 3624 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, 3625 u32 flags) 3626 { 3627 struct mapped_device *md = bdev->bd_disk->private_data; 3628 const struct pr_ops *ops; 3629 fmode_t mode; 3630 int r; 3631 3632 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); 3633 if (r < 0) 3634 return r; 3635 3636 ops = bdev->bd_disk->fops->pr_ops; 3637 if (ops && ops->pr_reserve) 3638 r = ops->pr_reserve(bdev, key, type, flags); 3639 else 3640 r = -EOPNOTSUPP; 3641 3642 bdput(bdev); 3643 return r; 3644 } 3645 3646 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) 3647 { 3648 struct mapped_device *md = bdev->bd_disk->private_data; 3649 const struct pr_ops *ops; 3650 fmode_t mode; 3651 int r; 3652 3653 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); 3654 if (r < 0) 3655 return r; 3656 3657 ops = bdev->bd_disk->fops->pr_ops; 3658 if (ops && ops->pr_release) 3659 r = ops->pr_release(bdev, key, type); 3660 else 3661 r = -EOPNOTSUPP; 3662 3663 bdput(bdev); 3664 return r; 3665 } 3666 3667 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, 3668 enum pr_type type, bool abort) 3669 { 3670 struct mapped_device *md = bdev->bd_disk->private_data; 3671 const struct pr_ops *ops; 3672 fmode_t mode; 3673 int r; 3674 3675 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); 3676 if (r < 0) 3677 return r; 3678 3679 ops = bdev->bd_disk->fops->pr_ops; 3680 if (ops && ops->pr_preempt) 3681 r = ops->pr_preempt(bdev, old_key, new_key, type, abort); 3682 else 3683 r = -EOPNOTSUPP; 3684 3685 bdput(bdev); 3686 return r; 3687 } 3688 3689 static int dm_pr_clear(struct block_device *bdev, u64 key) 3690 { 3691 struct mapped_device *md = bdev->bd_disk->private_data; 3692 const struct pr_ops *ops; 3693 fmode_t mode; 3694 int r; 3695 3696 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); 3697 if (r < 0) 3698 return r; 3699 3700 ops = bdev->bd_disk->fops->pr_ops; 3701 if (ops && ops->pr_clear) 3702 r = ops->pr_clear(bdev, key); 3703 else 3704 r = -EOPNOTSUPP; 3705 3706 bdput(bdev); 3707 return r; 3708 } 3709 3710 static const struct pr_ops dm_pr_ops = { 3711 .pr_register = dm_pr_register, 3712 .pr_reserve = dm_pr_reserve, 3713 .pr_release = dm_pr_release, 3714 .pr_preempt = dm_pr_preempt, 3715 .pr_clear = dm_pr_clear, 3716 }; 3717 3718 static const struct block_device_operations dm_blk_dops = { 3719 .open = dm_blk_open, 3720 .release = dm_blk_close, 3721 .ioctl = dm_blk_ioctl, 3722 .getgeo = dm_blk_getgeo, 3723 .pr_ops = &dm_pr_ops, 3724 .owner = THIS_MODULE 3725 }; 3726 3727 /* 3728 * module hooks 3729 */ 3730 module_init(dm_init); 3731 module_exit(dm_exit); 3732 3733 module_param(major, uint, 0); 3734 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3735 3736 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3737 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3738 3739 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); 3740 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); 3741 3742 module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); 3743 MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); 3744 3745 module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR); 3746 
MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices"); 3747 3748 module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR); 3749 MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices"); 3750 3751 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); 3752 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); 3753 3754 MODULE_DESCRIPTION(DM_NAME " driver"); 3755 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3756 MODULE_LICENSE("GPL"); 3757