1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/mempool.h> 18 #include <linux/slab.h> 19 #include <linux/idr.h> 20 #include <linux/hdreg.h> 21 #include <linux/delay.h> 22 #include <linux/wait.h> 23 #include <linux/kthread.h> 24 #include <linux/ktime.h> 25 #include <linux/elevator.h> /* for rq_end_sector() */ 26 #include <linux/blk-mq.h> 27 28 #include <trace/events/block.h> 29 30 #define DM_MSG_PREFIX "core" 31 32 #ifdef CONFIG_PRINTK 33 /* 34 * ratelimit state to be used in DMXXX_LIMIT(). 35 */ 36 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 37 DEFAULT_RATELIMIT_INTERVAL, 38 DEFAULT_RATELIMIT_BURST); 39 EXPORT_SYMBOL(dm_ratelimit_state); 40 #endif 41 42 /* 43 * Cookies are numeric values sent with CHANGE and REMOVE 44 * uevents while resuming, removing or renaming the device. 45 */ 46 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 47 #define DM_COOKIE_LENGTH 24 48 49 static const char *_name = DM_NAME; 50 51 static unsigned int major = 0; 52 static unsigned int _major = 0; 53 54 static DEFINE_IDR(_minor_idr); 55 56 static DEFINE_SPINLOCK(_minor_lock); 57 58 static void do_deferred_remove(struct work_struct *w); 59 60 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 61 62 static struct workqueue_struct *deferred_remove_workqueue; 63 64 /* 65 * For bio-based dm. 66 * One of these is allocated per bio. 67 */ 68 struct dm_io { 69 struct mapped_device *md; 70 int error; 71 atomic_t io_count; 72 struct bio *bio; 73 unsigned long start_time; 74 spinlock_t endio_lock; 75 struct dm_stats_aux stats_aux; 76 }; 77 78 /* 79 * For request-based dm. 80 * One of these is allocated per request. 81 */ 82 struct dm_rq_target_io { 83 struct mapped_device *md; 84 struct dm_target *ti; 85 struct request *orig, *clone; 86 struct kthread_work work; 87 int error; 88 union map_info info; 89 }; 90 91 /* 92 * For request-based dm - the bio clones we allocate are embedded in these 93 * structs. 94 * 95 * We allocate these with bio_alloc_bioset, using the front_pad parameter when 96 * the bioset is created - this means the bio has to come at the end of the 97 * struct. 98 */ 99 struct dm_rq_clone_bio_info { 100 struct bio *orig; 101 struct dm_rq_target_io *tio; 102 struct bio clone; 103 }; 104 105 union map_info *dm_get_rq_mapinfo(struct request *rq) 106 { 107 if (rq && rq->end_io_data) 108 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 109 return NULL; 110 } 111 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 112 113 #define MINOR_ALLOCED ((void *)-1) 114 115 /* 116 * Bits for the md->flags field. 117 */ 118 #define DMF_BLOCK_IO_FOR_SUSPEND 0 119 #define DMF_SUSPENDED 1 120 #define DMF_FROZEN 2 121 #define DMF_FREEING 3 122 #define DMF_DELETING 4 123 #define DMF_NOFLUSH_SUSPENDING 5 124 #define DMF_MERGE_IS_OPTIONAL 6 125 #define DMF_DEFERRED_REMOVE 7 126 #define DMF_SUSPENDED_INTERNALLY 8 127 128 /* 129 * A dummy definition to make RCU happy. 130 * struct dm_table should never be dereferenced in this file. 131 */ 132 struct dm_table { 133 int undefined__; 134 }; 135 136 /* 137 * Work processed by per-device workqueue. 
138 */ 139 struct mapped_device { 140 struct srcu_struct io_barrier; 141 struct mutex suspend_lock; 142 atomic_t holders; 143 atomic_t open_count; 144 145 /* 146 * The current mapping. 147 * Use dm_get_live_table{_fast} or take suspend_lock for 148 * dereference. 149 */ 150 struct dm_table __rcu *map; 151 152 struct list_head table_devices; 153 struct mutex table_devices_lock; 154 155 unsigned long flags; 156 157 struct request_queue *queue; 158 unsigned type; 159 /* Protect queue and type against concurrent access. */ 160 struct mutex type_lock; 161 162 struct target_type *immutable_target_type; 163 164 struct gendisk *disk; 165 char name[16]; 166 167 void *interface_ptr; 168 169 /* 170 * A list of ios that arrived while we were suspended. 171 */ 172 atomic_t pending[2]; 173 wait_queue_head_t wait; 174 struct work_struct work; 175 struct bio_list deferred; 176 spinlock_t deferred_lock; 177 178 /* 179 * Processing queue (flush) 180 */ 181 struct workqueue_struct *wq; 182 183 /* 184 * io objects are allocated from here. 185 */ 186 mempool_t *io_pool; 187 mempool_t *rq_pool; 188 189 struct bio_set *bs; 190 191 /* 192 * Event handling. 193 */ 194 atomic_t event_nr; 195 wait_queue_head_t eventq; 196 atomic_t uevent_seq; 197 struct list_head uevent_list; 198 spinlock_t uevent_lock; /* Protect access to uevent_list */ 199 200 /* 201 * freeze/thaw support require holding onto a super block 202 */ 203 struct super_block *frozen_sb; 204 struct block_device *bdev; 205 206 /* forced geometry settings */ 207 struct hd_geometry geometry; 208 209 /* kobject and completion */ 210 struct dm_kobject_holder kobj_holder; 211 212 /* zero-length flush that will be cloned and submitted to targets */ 213 struct bio flush_bio; 214 215 /* the number of internal suspends */ 216 unsigned internal_suspend_count; 217 218 struct dm_stats stats; 219 220 struct kthread_worker kworker; 221 struct task_struct *kworker_task; 222 223 /* for request-based merge heuristic in dm_request_fn() */ 224 unsigned seq_rq_merge_deadline_usecs; 225 int last_rq_rw; 226 sector_t last_rq_pos; 227 ktime_t last_rq_start_time; 228 229 /* for blk-mq request-based DM support */ 230 struct blk_mq_tag_set tag_set; 231 }; 232 233 /* 234 * For mempools pre-allocation at the table loading time. 235 */ 236 struct dm_md_mempools { 237 mempool_t *io_pool; 238 mempool_t *rq_pool; 239 struct bio_set *bs; 240 }; 241 242 struct table_device { 243 struct list_head list; 244 atomic_t count; 245 struct dm_dev dm_dev; 246 }; 247 248 #define RESERVED_BIO_BASED_IOS 16 249 #define RESERVED_REQUEST_BASED_IOS 256 250 #define RESERVED_MAX_IOS 1024 251 static struct kmem_cache *_io_cache; 252 static struct kmem_cache *_rq_tio_cache; 253 static struct kmem_cache *_rq_cache; 254 255 /* 256 * Bio-based DM's mempools' reserved IOs set by the user. 257 */ 258 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 259 260 /* 261 * Request-based DM's mempools' reserved IOs set by the user. 
262 */ 263 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 264 265 static unsigned __dm_get_module_param(unsigned *module_param, 266 unsigned def, unsigned max) 267 { 268 unsigned param = ACCESS_ONCE(*module_param); 269 unsigned modified_param = 0; 270 271 if (!param) 272 modified_param = def; 273 else if (param > max) 274 modified_param = max; 275 276 if (modified_param) { 277 (void)cmpxchg(module_param, param, modified_param); 278 param = modified_param; 279 } 280 281 return param; 282 } 283 284 unsigned dm_get_reserved_bio_based_ios(void) 285 { 286 return __dm_get_module_param(&reserved_bio_based_ios, 287 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 288 } 289 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 290 291 unsigned dm_get_reserved_rq_based_ios(void) 292 { 293 return __dm_get_module_param(&reserved_rq_based_ios, 294 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 295 } 296 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 297 298 static int __init local_init(void) 299 { 300 int r = -ENOMEM; 301 302 /* allocate a slab for the dm_ios */ 303 _io_cache = KMEM_CACHE(dm_io, 0); 304 if (!_io_cache) 305 return r; 306 307 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 308 if (!_rq_tio_cache) 309 goto out_free_io_cache; 310 311 _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), 312 __alignof__(struct request), 0, NULL); 313 if (!_rq_cache) 314 goto out_free_rq_tio_cache; 315 316 r = dm_uevent_init(); 317 if (r) 318 goto out_free_rq_cache; 319 320 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 321 if (!deferred_remove_workqueue) { 322 r = -ENOMEM; 323 goto out_uevent_exit; 324 } 325 326 _major = major; 327 r = register_blkdev(_major, _name); 328 if (r < 0) 329 goto out_free_workqueue; 330 331 if (!_major) 332 _major = r; 333 334 return 0; 335 336 out_free_workqueue: 337 destroy_workqueue(deferred_remove_workqueue); 338 out_uevent_exit: 339 dm_uevent_exit(); 340 out_free_rq_cache: 341 kmem_cache_destroy(_rq_cache); 342 out_free_rq_tio_cache: 343 kmem_cache_destroy(_rq_tio_cache); 344 out_free_io_cache: 345 kmem_cache_destroy(_io_cache); 346 347 return r; 348 } 349 350 static void local_exit(void) 351 { 352 flush_scheduled_work(); 353 destroy_workqueue(deferred_remove_workqueue); 354 355 kmem_cache_destroy(_rq_cache); 356 kmem_cache_destroy(_rq_tio_cache); 357 kmem_cache_destroy(_io_cache); 358 unregister_blkdev(_major, _name); 359 dm_uevent_exit(); 360 361 _major = 0; 362 363 DMINFO("cleaned up"); 364 } 365 366 static int (*_inits[])(void) __initdata = { 367 local_init, 368 dm_target_init, 369 dm_linear_init, 370 dm_stripe_init, 371 dm_io_init, 372 dm_kcopyd_init, 373 dm_interface_init, 374 dm_statistics_init, 375 }; 376 377 static void (*_exits[])(void) = { 378 local_exit, 379 dm_target_exit, 380 dm_linear_exit, 381 dm_stripe_exit, 382 dm_io_exit, 383 dm_kcopyd_exit, 384 dm_interface_exit, 385 dm_statistics_exit, 386 }; 387 388 static int __init dm_init(void) 389 { 390 const int count = ARRAY_SIZE(_inits); 391 392 int r, i; 393 394 for (i = 0; i < count; i++) { 395 r = _inits[i](); 396 if (r) 397 goto bad; 398 } 399 400 return 0; 401 402 bad: 403 while (i--) 404 _exits[i](); 405 406 return r; 407 } 408 409 static void __exit dm_exit(void) 410 { 411 int i = ARRAY_SIZE(_exits); 412 413 while (i--) 414 _exits[i](); 415 416 /* 417 * Should be empty by this point. 
418 */ 419 idr_destroy(&_minor_idr); 420 } 421 422 /* 423 * Block device functions 424 */ 425 int dm_deleting_md(struct mapped_device *md) 426 { 427 return test_bit(DMF_DELETING, &md->flags); 428 } 429 430 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 431 { 432 struct mapped_device *md; 433 434 spin_lock(&_minor_lock); 435 436 md = bdev->bd_disk->private_data; 437 if (!md) 438 goto out; 439 440 if (test_bit(DMF_FREEING, &md->flags) || 441 dm_deleting_md(md)) { 442 md = NULL; 443 goto out; 444 } 445 446 dm_get(md); 447 atomic_inc(&md->open_count); 448 out: 449 spin_unlock(&_minor_lock); 450 451 return md ? 0 : -ENXIO; 452 } 453 454 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 455 { 456 struct mapped_device *md; 457 458 spin_lock(&_minor_lock); 459 460 md = disk->private_data; 461 if (WARN_ON(!md)) 462 goto out; 463 464 if (atomic_dec_and_test(&md->open_count) && 465 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 466 queue_work(deferred_remove_workqueue, &deferred_remove_work); 467 468 dm_put(md); 469 out: 470 spin_unlock(&_minor_lock); 471 } 472 473 int dm_open_count(struct mapped_device *md) 474 { 475 return atomic_read(&md->open_count); 476 } 477 478 /* 479 * Guarantees nothing is using the device before it's deleted. 480 */ 481 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 482 { 483 int r = 0; 484 485 spin_lock(&_minor_lock); 486 487 if (dm_open_count(md)) { 488 r = -EBUSY; 489 if (mark_deferred) 490 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 491 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 492 r = -EEXIST; 493 else 494 set_bit(DMF_DELETING, &md->flags); 495 496 spin_unlock(&_minor_lock); 497 498 return r; 499 } 500 501 int dm_cancel_deferred_remove(struct mapped_device *md) 502 { 503 int r = 0; 504 505 spin_lock(&_minor_lock); 506 507 if (test_bit(DMF_DELETING, &md->flags)) 508 r = -EBUSY; 509 else 510 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 511 512 spin_unlock(&_minor_lock); 513 514 return r; 515 } 516 517 static void do_deferred_remove(struct work_struct *w) 518 { 519 dm_deferred_remove(); 520 } 521 522 sector_t dm_get_size(struct mapped_device *md) 523 { 524 return get_capacity(md->disk); 525 } 526 527 struct request_queue *dm_get_md_queue(struct mapped_device *md) 528 { 529 return md->queue; 530 } 531 532 struct dm_stats *dm_get_stats(struct mapped_device *md) 533 { 534 return &md->stats; 535 } 536 537 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 538 { 539 struct mapped_device *md = bdev->bd_disk->private_data; 540 541 return dm_get_geometry(md, geo); 542 } 543 544 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 545 unsigned int cmd, unsigned long arg) 546 { 547 struct mapped_device *md = bdev->bd_disk->private_data; 548 int srcu_idx; 549 struct dm_table *map; 550 struct dm_target *tgt; 551 int r = -ENOTTY; 552 553 retry: 554 map = dm_get_live_table(md, &srcu_idx); 555 556 if (!map || !dm_table_get_size(map)) 557 goto out; 558 559 /* We only support devices that have a single target */ 560 if (dm_table_get_num_targets(map) != 1) 561 goto out; 562 563 tgt = dm_table_get_target(map, 0); 564 if (!tgt->type->ioctl) 565 goto out; 566 567 if (dm_suspended_md(md)) { 568 r = -EAGAIN; 569 goto out; 570 } 571 572 r = tgt->type->ioctl(tgt, cmd, arg); 573 574 out: 575 dm_put_live_table(md, srcu_idx); 576 577 if (r == -ENOTCONN) { 578 msleep(10); 579 goto retry; 580 } 581 582 return r; 583 } 584 585 static struct dm_io *alloc_io(struct 
mapped_device *md) 586 { 587 return mempool_alloc(md->io_pool, GFP_NOIO); 588 } 589 590 static void free_io(struct mapped_device *md, struct dm_io *io) 591 { 592 mempool_free(io, md->io_pool); 593 } 594 595 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 596 { 597 bio_put(&tio->clone); 598 } 599 600 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 601 gfp_t gfp_mask) 602 { 603 return mempool_alloc(md->io_pool, gfp_mask); 604 } 605 606 static void free_rq_tio(struct dm_rq_target_io *tio) 607 { 608 mempool_free(tio, tio->md->io_pool); 609 } 610 611 static struct request *alloc_clone_request(struct mapped_device *md, 612 gfp_t gfp_mask) 613 { 614 return mempool_alloc(md->rq_pool, gfp_mask); 615 } 616 617 static void free_clone_request(struct mapped_device *md, struct request *rq) 618 { 619 mempool_free(rq, md->rq_pool); 620 } 621 622 static int md_in_flight(struct mapped_device *md) 623 { 624 return atomic_read(&md->pending[READ]) + 625 atomic_read(&md->pending[WRITE]); 626 } 627 628 static void start_io_acct(struct dm_io *io) 629 { 630 struct mapped_device *md = io->md; 631 struct bio *bio = io->bio; 632 int cpu; 633 int rw = bio_data_dir(bio); 634 635 io->start_time = jiffies; 636 637 cpu = part_stat_lock(); 638 part_round_stats(cpu, &dm_disk(md)->part0); 639 part_stat_unlock(); 640 atomic_set(&dm_disk(md)->part0.in_flight[rw], 641 atomic_inc_return(&md->pending[rw])); 642 643 if (unlikely(dm_stats_used(&md->stats))) 644 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 645 bio_sectors(bio), false, 0, &io->stats_aux); 646 } 647 648 static void end_io_acct(struct dm_io *io) 649 { 650 struct mapped_device *md = io->md; 651 struct bio *bio = io->bio; 652 unsigned long duration = jiffies - io->start_time; 653 int pending; 654 int rw = bio_data_dir(bio); 655 656 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); 657 658 if (unlikely(dm_stats_used(&md->stats))) 659 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 660 bio_sectors(bio), true, duration, &io->stats_aux); 661 662 /* 663 * After this is decremented the bio must not be touched if it is 664 * a flush. 665 */ 666 pending = atomic_dec_return(&md->pending[rw]); 667 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 668 pending += atomic_read(&md->pending[rw^0x1]); 669 670 /* nudge anyone waiting on suspend queue */ 671 if (!pending) 672 wake_up(&md->wait); 673 } 674 675 /* 676 * Add the bio to the list of deferred io. 677 */ 678 static void queue_io(struct mapped_device *md, struct bio *bio) 679 { 680 unsigned long flags; 681 682 spin_lock_irqsave(&md->deferred_lock, flags); 683 bio_list_add(&md->deferred, bio); 684 spin_unlock_irqrestore(&md->deferred_lock, flags); 685 queue_work(md->wq, &md->work); 686 } 687 688 /* 689 * Everyone (including functions in this file), should use this 690 * function to access the md->map field, and make sure they call 691 * dm_put_live_table() when finished. 
692 */ 693 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 694 { 695 *srcu_idx = srcu_read_lock(&md->io_barrier); 696 697 return srcu_dereference(md->map, &md->io_barrier); 698 } 699 700 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 701 { 702 srcu_read_unlock(&md->io_barrier, srcu_idx); 703 } 704 705 void dm_sync_table(struct mapped_device *md) 706 { 707 synchronize_srcu(&md->io_barrier); 708 synchronize_rcu_expedited(); 709 } 710 711 /* 712 * A fast alternative to dm_get_live_table/dm_put_live_table. 713 * The caller must not block between these two functions. 714 */ 715 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 716 { 717 rcu_read_lock(); 718 return rcu_dereference(md->map); 719 } 720 721 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 722 { 723 rcu_read_unlock(); 724 } 725 726 /* 727 * Open a table device so we can use it as a map destination. 728 */ 729 static int open_table_device(struct table_device *td, dev_t dev, 730 struct mapped_device *md) 731 { 732 static char *_claim_ptr = "I belong to device-mapper"; 733 struct block_device *bdev; 734 735 int r; 736 737 BUG_ON(td->dm_dev.bdev); 738 739 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); 740 if (IS_ERR(bdev)) 741 return PTR_ERR(bdev); 742 743 r = bd_link_disk_holder(bdev, dm_disk(md)); 744 if (r) { 745 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 746 return r; 747 } 748 749 td->dm_dev.bdev = bdev; 750 return 0; 751 } 752 753 /* 754 * Close a table device that we've been using. 755 */ 756 static void close_table_device(struct table_device *td, struct mapped_device *md) 757 { 758 if (!td->dm_dev.bdev) 759 return; 760 761 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 762 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 763 td->dm_dev.bdev = NULL; 764 } 765 766 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 767 fmode_t mode) { 768 struct table_device *td; 769 770 list_for_each_entry(td, l, list) 771 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 772 return td; 773 774 return NULL; 775 } 776 777 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 778 struct dm_dev **result) { 779 int r; 780 struct table_device *td; 781 782 mutex_lock(&md->table_devices_lock); 783 td = find_table_device(&md->table_devices, dev, mode); 784 if (!td) { 785 td = kmalloc(sizeof(*td), GFP_KERNEL); 786 if (!td) { 787 mutex_unlock(&md->table_devices_lock); 788 return -ENOMEM; 789 } 790 791 td->dm_dev.mode = mode; 792 td->dm_dev.bdev = NULL; 793 794 if ((r = open_table_device(td, dev, md))) { 795 mutex_unlock(&md->table_devices_lock); 796 kfree(td); 797 return r; 798 } 799 800 format_dev_t(td->dm_dev.name, dev); 801 802 atomic_set(&td->count, 0); 803 list_add(&td->list, &md->table_devices); 804 } 805 atomic_inc(&td->count); 806 mutex_unlock(&md->table_devices_lock); 807 808 *result = &td->dm_dev; 809 return 0; 810 } 811 EXPORT_SYMBOL_GPL(dm_get_table_device); 812 813 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 814 { 815 struct table_device *td = container_of(d, struct table_device, dm_dev); 816 817 mutex_lock(&md->table_devices_lock); 818 if (atomic_dec_and_test(&td->count)) { 819 close_table_device(td, md); 820 list_del(&td->list); 821 kfree(td); 822 } 823 mutex_unlock(&md->table_devices_lock); 824 } 825 EXPORT_SYMBOL(dm_put_table_device); 826 827 
static void free_table_devices(struct list_head *devices) 828 { 829 struct list_head *tmp, *next; 830 831 list_for_each_safe(tmp, next, devices) { 832 struct table_device *td = list_entry(tmp, struct table_device, list); 833 834 DMWARN("dm_destroy: %s still exists with %d references", 835 td->dm_dev.name, atomic_read(&td->count)); 836 kfree(td); 837 } 838 } 839 840 /* 841 * Get the geometry associated with a dm device 842 */ 843 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 844 { 845 *geo = md->geometry; 846 847 return 0; 848 } 849 850 /* 851 * Set the geometry of a device. 852 */ 853 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 854 { 855 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 856 857 if (geo->start > sz) { 858 DMWARN("Start sector is beyond the geometry limits."); 859 return -EINVAL; 860 } 861 862 md->geometry = *geo; 863 864 return 0; 865 } 866 867 /*----------------------------------------------------------------- 868 * CRUD START: 869 * A more elegant soln is in the works that uses the queue 870 * merge fn, unfortunately there are a couple of changes to 871 * the block layer that I want to make for this. So in the 872 * interests of getting something for people to use I give 873 * you this clearly demarcated crap. 874 *---------------------------------------------------------------*/ 875 876 static int __noflush_suspending(struct mapped_device *md) 877 { 878 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 879 } 880 881 /* 882 * Decrements the number of outstanding ios that a bio has been 883 * cloned into, completing the original io if necc. 884 */ 885 static void dec_pending(struct dm_io *io, int error) 886 { 887 unsigned long flags; 888 int io_error; 889 struct bio *bio; 890 struct mapped_device *md = io->md; 891 892 /* Push-back supersedes any I/O errors */ 893 if (unlikely(error)) { 894 spin_lock_irqsave(&io->endio_lock, flags); 895 if (!(io->error > 0 && __noflush_suspending(md))) 896 io->error = error; 897 spin_unlock_irqrestore(&io->endio_lock, flags); 898 } 899 900 if (atomic_dec_and_test(&io->io_count)) { 901 if (io->error == DM_ENDIO_REQUEUE) { 902 /* 903 * Target requested pushing back the I/O. 904 */ 905 spin_lock_irqsave(&md->deferred_lock, flags); 906 if (__noflush_suspending(md)) 907 bio_list_add_head(&md->deferred, io->bio); 908 else 909 /* noflush suspend was interrupted. */ 910 io->error = -EIO; 911 spin_unlock_irqrestore(&md->deferred_lock, flags); 912 } 913 914 io_error = io->error; 915 bio = io->bio; 916 end_io_acct(io); 917 free_io(md, io); 918 919 if (io_error == DM_ENDIO_REQUEUE) 920 return; 921 922 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { 923 /* 924 * Preflush done for flush with data, reissue 925 * without REQ_FLUSH. 
926 */ 927 bio->bi_rw &= ~REQ_FLUSH; 928 queue_io(md, bio); 929 } else { 930 /* done with normal IO or empty flush */ 931 trace_block_bio_complete(md->queue, bio, io_error); 932 bio_endio(bio, io_error); 933 } 934 } 935 } 936 937 static void disable_write_same(struct mapped_device *md) 938 { 939 struct queue_limits *limits = dm_get_queue_limits(md); 940 941 /* device doesn't really support WRITE SAME, disable it */ 942 limits->max_write_same_sectors = 0; 943 } 944 945 static void clone_endio(struct bio *bio, int error) 946 { 947 int r = error; 948 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 949 struct dm_io *io = tio->io; 950 struct mapped_device *md = tio->io->md; 951 dm_endio_fn endio = tio->ti->type->end_io; 952 953 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 954 error = -EIO; 955 956 if (endio) { 957 r = endio(tio->ti, bio, error); 958 if (r < 0 || r == DM_ENDIO_REQUEUE) 959 /* 960 * error and requeue request are handled 961 * in dec_pending(). 962 */ 963 error = r; 964 else if (r == DM_ENDIO_INCOMPLETE) 965 /* The target will handle the io */ 966 return; 967 else if (r) { 968 DMWARN("unimplemented target endio return value: %d", r); 969 BUG(); 970 } 971 } 972 973 if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && 974 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) 975 disable_write_same(md); 976 977 free_tio(md, tio); 978 dec_pending(io, error); 979 } 980 981 /* 982 * Partial completion handling for request-based dm 983 */ 984 static void end_clone_bio(struct bio *clone, int error) 985 { 986 struct dm_rq_clone_bio_info *info = 987 container_of(clone, struct dm_rq_clone_bio_info, clone); 988 struct dm_rq_target_io *tio = info->tio; 989 struct bio *bio = info->orig; 990 unsigned int nr_bytes = info->orig->bi_iter.bi_size; 991 992 bio_put(clone); 993 994 if (tio->error) 995 /* 996 * An error has already been detected on the request. 997 * Once error occurred, just let clone->end_io() handle 998 * the remainder. 999 */ 1000 return; 1001 else if (error) { 1002 /* 1003 * Don't notice the error to the upper layer yet. 1004 * The error handling decision is made by the target driver, 1005 * when the request is completed. 1006 */ 1007 tio->error = error; 1008 return; 1009 } 1010 1011 /* 1012 * I/O for the bio successfully completed. 1013 * Notice the data completion to the upper layer. 1014 */ 1015 1016 /* 1017 * bios are processed from the head of the list. 1018 * So the completing bio should always be rq->bio. 1019 * If it's not, something wrong is happening. 1020 */ 1021 if (tio->orig->bio != bio) 1022 DMERR("bio completion is going in the middle of the request"); 1023 1024 /* 1025 * Update the original request. 1026 * Do not use blk_end_request() here, because it may complete 1027 * the original request before the clone, and break the ordering. 1028 */ 1029 blk_update_request(tio->orig, 0, nr_bytes); 1030 } 1031 1032 static struct dm_rq_target_io *tio_from_request(struct request *rq) 1033 { 1034 return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); 1035 } 1036 1037 /* 1038 * Don't touch any member of the md after calling this function because 1039 * the md may be freed in dm_put() at the end of this function. 1040 * Or do dm_get() before calling this function and dm_put() later. 
1041 */ 1042 static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 1043 { 1044 int nr_requests_pending; 1045 1046 atomic_dec(&md->pending[rw]); 1047 1048 /* nudge anyone waiting on suspend queue */ 1049 nr_requests_pending = md_in_flight(md); 1050 if (!nr_requests_pending) 1051 wake_up(&md->wait); 1052 1053 /* 1054 * Run this off this callpath, as drivers could invoke end_io while 1055 * inside their request_fn (and holding the queue lock). Calling 1056 * back into ->request_fn() could deadlock attempting to grab the 1057 * queue lock again. 1058 */ 1059 if (run_queue) { 1060 if (md->queue->mq_ops) 1061 blk_mq_run_hw_queues(md->queue, true); 1062 else if (!nr_requests_pending || 1063 (nr_requests_pending >= md->queue->nr_congestion_on)) 1064 blk_run_queue_async(md->queue); 1065 } 1066 1067 /* 1068 * dm_put() must be at the end of this function. See the comment above 1069 */ 1070 dm_put(md); 1071 } 1072 1073 static void free_rq_clone(struct request *clone) 1074 { 1075 struct dm_rq_target_io *tio = clone->end_io_data; 1076 struct mapped_device *md = tio->md; 1077 1078 blk_rq_unprep_clone(clone); 1079 1080 if (clone->q && clone->q->mq_ops) 1081 tio->ti->type->release_clone_rq(clone); 1082 else 1083 free_clone_request(md, clone); 1084 1085 if (!md->queue->mq_ops) 1086 free_rq_tio(tio); 1087 } 1088 1089 /* 1090 * Complete the clone and the original request. 1091 * Must be called without clone's queue lock held, 1092 * see end_clone_request() for more details. 1093 */ 1094 static void dm_end_request(struct request *clone, int error) 1095 { 1096 int rw = rq_data_dir(clone); 1097 struct dm_rq_target_io *tio = clone->end_io_data; 1098 struct mapped_device *md = tio->md; 1099 struct request *rq = tio->orig; 1100 1101 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 1102 rq->errors = clone->errors; 1103 rq->resid_len = clone->resid_len; 1104 1105 if (rq->sense) 1106 /* 1107 * We are using the sense buffer of the original 1108 * request. 1109 * So setting the length of the sense data is enough. 1110 */ 1111 rq->sense_len = clone->sense_len; 1112 } 1113 1114 free_rq_clone(clone); 1115 if (!rq->q->mq_ops) 1116 blk_end_request_all(rq, error); 1117 else 1118 blk_mq_end_request(rq, error); 1119 rq_completed(md, rw, true); 1120 } 1121 1122 static void dm_unprep_request(struct request *rq) 1123 { 1124 struct dm_rq_target_io *tio = tio_from_request(rq); 1125 struct request *clone = tio->clone; 1126 1127 if (!rq->q->mq_ops) { 1128 rq->special = NULL; 1129 rq->cmd_flags &= ~REQ_DONTPREP; 1130 } 1131 1132 if (clone) 1133 free_rq_clone(clone); 1134 } 1135 1136 /* 1137 * Requeue the original request of a clone. 
1138 */ 1139 static void old_requeue_request(struct request *rq) 1140 { 1141 struct request_queue *q = rq->q; 1142 unsigned long flags; 1143 1144 spin_lock_irqsave(q->queue_lock, flags); 1145 blk_requeue_request(q, rq); 1146 spin_unlock_irqrestore(q->queue_lock, flags); 1147 } 1148 1149 static void dm_requeue_unmapped_original_request(struct mapped_device *md, 1150 struct request *rq) 1151 { 1152 int rw = rq_data_dir(rq); 1153 1154 dm_unprep_request(rq); 1155 1156 if (!rq->q->mq_ops) 1157 old_requeue_request(rq); 1158 else { 1159 blk_mq_requeue_request(rq); 1160 blk_mq_kick_requeue_list(rq->q); 1161 } 1162 1163 rq_completed(md, rw, false); 1164 } 1165 1166 static void dm_requeue_unmapped_request(struct request *clone) 1167 { 1168 struct dm_rq_target_io *tio = clone->end_io_data; 1169 1170 dm_requeue_unmapped_original_request(tio->md, tio->orig); 1171 } 1172 1173 static void old_stop_queue(struct request_queue *q) 1174 { 1175 unsigned long flags; 1176 1177 if (blk_queue_stopped(q)) 1178 return; 1179 1180 spin_lock_irqsave(q->queue_lock, flags); 1181 blk_stop_queue(q); 1182 spin_unlock_irqrestore(q->queue_lock, flags); 1183 } 1184 1185 static void stop_queue(struct request_queue *q) 1186 { 1187 if (!q->mq_ops) 1188 old_stop_queue(q); 1189 else 1190 blk_mq_stop_hw_queues(q); 1191 } 1192 1193 static void old_start_queue(struct request_queue *q) 1194 { 1195 unsigned long flags; 1196 1197 spin_lock_irqsave(q->queue_lock, flags); 1198 if (blk_queue_stopped(q)) 1199 blk_start_queue(q); 1200 spin_unlock_irqrestore(q->queue_lock, flags); 1201 } 1202 1203 static void start_queue(struct request_queue *q) 1204 { 1205 if (!q->mq_ops) 1206 old_start_queue(q); 1207 else 1208 blk_mq_start_stopped_hw_queues(q, true); 1209 } 1210 1211 static void dm_done(struct request *clone, int error, bool mapped) 1212 { 1213 int r = error; 1214 struct dm_rq_target_io *tio = clone->end_io_data; 1215 dm_request_endio_fn rq_end_io = NULL; 1216 1217 if (tio->ti) { 1218 rq_end_io = tio->ti->type->rq_end_io; 1219 1220 if (mapped && rq_end_io) 1221 r = rq_end_io(tio->ti, clone, error, &tio->info); 1222 } 1223 1224 if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && 1225 !clone->q->limits.max_write_same_sectors)) 1226 disable_write_same(tio->md); 1227 1228 if (r <= 0) 1229 /* The target wants to complete the I/O */ 1230 dm_end_request(clone, r); 1231 else if (r == DM_ENDIO_INCOMPLETE) 1232 /* The target will handle the I/O */ 1233 return; 1234 else if (r == DM_ENDIO_REQUEUE) 1235 /* The target wants to requeue the I/O */ 1236 dm_requeue_unmapped_request(clone); 1237 else { 1238 DMWARN("unimplemented target endio return value: %d", r); 1239 BUG(); 1240 } 1241 } 1242 1243 /* 1244 * Request completion handler for request-based dm 1245 */ 1246 static void dm_softirq_done(struct request *rq) 1247 { 1248 bool mapped = true; 1249 struct dm_rq_target_io *tio = tio_from_request(rq); 1250 struct request *clone = tio->clone; 1251 int rw; 1252 1253 if (!clone) { 1254 rw = rq_data_dir(rq); 1255 if (!rq->q->mq_ops) { 1256 blk_end_request_all(rq, tio->error); 1257 rq_completed(tio->md, rw, false); 1258 free_rq_tio(tio); 1259 } else { 1260 blk_mq_end_request(rq, tio->error); 1261 rq_completed(tio->md, rw, false); 1262 } 1263 return; 1264 } 1265 1266 if (rq->cmd_flags & REQ_FAILED) 1267 mapped = false; 1268 1269 dm_done(clone, tio->error, mapped); 1270 } 1271 1272 /* 1273 * Complete the clone and the original request with the error status 1274 * through softirq context. 
1275 */ 1276 static void dm_complete_request(struct request *rq, int error) 1277 { 1278 struct dm_rq_target_io *tio = tio_from_request(rq); 1279 1280 tio->error = error; 1281 blk_complete_request(rq); 1282 } 1283 1284 /* 1285 * Complete the not-mapped clone and the original request with the error status 1286 * through softirq context. 1287 * Target's rq_end_io() function isn't called. 1288 * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. 1289 */ 1290 static void dm_kill_unmapped_request(struct request *rq, int error) 1291 { 1292 rq->cmd_flags |= REQ_FAILED; 1293 dm_complete_request(rq, error); 1294 } 1295 1296 /* 1297 * Called with the clone's queue lock held (for non-blk-mq) 1298 */ 1299 static void end_clone_request(struct request *clone, int error) 1300 { 1301 struct dm_rq_target_io *tio = clone->end_io_data; 1302 1303 if (!clone->q->mq_ops) { 1304 /* 1305 * For just cleaning up the information of the queue in which 1306 * the clone was dispatched. 1307 * The clone is *NOT* freed actually here because it is alloced 1308 * from dm own mempool (REQ_ALLOCED isn't set). 1309 */ 1310 __blk_put_request(clone->q, clone); 1311 } 1312 1313 /* 1314 * Actual request completion is done in a softirq context which doesn't 1315 * hold the clone's queue lock. Otherwise, deadlock could occur because: 1316 * - another request may be submitted by the upper level driver 1317 * of the stacking during the completion 1318 * - the submission which requires queue lock may be done 1319 * against this clone's queue 1320 */ 1321 dm_complete_request(tio->orig, error); 1322 } 1323 1324 /* 1325 * Return maximum size of I/O possible at the supplied sector up to the current 1326 * target boundary. 1327 */ 1328 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 1329 { 1330 sector_t target_offset = dm_target_offset(ti, sector); 1331 1332 return ti->len - target_offset; 1333 } 1334 1335 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 1336 { 1337 sector_t len = max_io_len_target_boundary(sector, ti); 1338 sector_t offset, max_len; 1339 1340 /* 1341 * Does the target need to split even further? 1342 */ 1343 if (ti->max_io_len) { 1344 offset = dm_target_offset(ti, sector); 1345 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 1346 max_len = sector_div(offset, ti->max_io_len); 1347 else 1348 max_len = offset & (ti->max_io_len - 1); 1349 max_len = ti->max_io_len - max_len; 1350 1351 if (len > max_len) 1352 len = max_len; 1353 } 1354 1355 return len; 1356 } 1357 1358 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1359 { 1360 if (len > UINT_MAX) { 1361 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1362 (unsigned long long)len, UINT_MAX); 1363 ti->error = "Maximum size of target IO is too large"; 1364 return -EINVAL; 1365 } 1366 1367 ti->max_io_len = (uint32_t) len; 1368 1369 return 0; 1370 } 1371 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1372 1373 /* 1374 * A target may call dm_accept_partial_bio only from the map routine. It is 1375 * allowed for all bio types except REQ_FLUSH. 1376 * 1377 * dm_accept_partial_bio informs the dm that the target only wants to process 1378 * additional n_sectors sectors of the bio and the rest of the data should be 1379 * sent in a next bio. 
1380 * 1381 * A diagram that explains the arithmetics: 1382 * +--------------------+---------------+-------+ 1383 * | 1 | 2 | 3 | 1384 * +--------------------+---------------+-------+ 1385 * 1386 * <-------------- *tio->len_ptr ---------------> 1387 * <------- bi_size -------> 1388 * <-- n_sectors --> 1389 * 1390 * Region 1 was already iterated over with bio_advance or similar function. 1391 * (it may be empty if the target doesn't use bio_advance) 1392 * Region 2 is the remaining bio size that the target wants to process. 1393 * (it may be empty if region 1 is non-empty, although there is no reason 1394 * to make it empty) 1395 * The target requires that region 3 is to be sent in the next bio. 1396 * 1397 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1398 * the partially processed part (the sum of regions 1+2) must be the same for all 1399 * copies of the bio. 1400 */ 1401 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1402 { 1403 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1404 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1405 BUG_ON(bio->bi_rw & REQ_FLUSH); 1406 BUG_ON(bi_size > *tio->len_ptr); 1407 BUG_ON(n_sectors > bi_size); 1408 *tio->len_ptr -= bi_size - n_sectors; 1409 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1410 } 1411 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1412 1413 static void __map_bio(struct dm_target_io *tio) 1414 { 1415 int r; 1416 sector_t sector; 1417 struct mapped_device *md; 1418 struct bio *clone = &tio->clone; 1419 struct dm_target *ti = tio->ti; 1420 1421 clone->bi_end_io = clone_endio; 1422 1423 /* 1424 * Map the clone. If r == 0 we don't need to do 1425 * anything, the target has assumed ownership of 1426 * this io. 1427 */ 1428 atomic_inc(&tio->io->io_count); 1429 sector = clone->bi_iter.bi_sector; 1430 r = ti->type->map(ti, clone); 1431 if (r == DM_MAPIO_REMAPPED) { 1432 /* the bio has been remapped so dispatch it */ 1433 1434 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1435 tio->io->bio->bi_bdev->bd_dev, sector); 1436 1437 generic_make_request(clone); 1438 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1439 /* error the io and bail out, or requeue it if needed */ 1440 md = tio->io->md; 1441 dec_pending(tio->io, r); 1442 free_tio(md, tio); 1443 } else if (r) { 1444 DMWARN("unimplemented target map return value: %d", r); 1445 BUG(); 1446 } 1447 } 1448 1449 struct clone_info { 1450 struct mapped_device *md; 1451 struct dm_table *map; 1452 struct bio *bio; 1453 struct dm_io *io; 1454 sector_t sector; 1455 unsigned sector_count; 1456 }; 1457 1458 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1459 { 1460 bio->bi_iter.bi_sector = sector; 1461 bio->bi_iter.bi_size = to_bytes(len); 1462 } 1463 1464 /* 1465 * Creates a bio that consists of range of complete bvecs. 
1466 */ 1467 static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1468 sector_t sector, unsigned len) 1469 { 1470 struct bio *clone = &tio->clone; 1471 1472 __bio_clone_fast(clone, bio); 1473 1474 if (bio_integrity(bio)) 1475 bio_integrity_clone(clone, bio, GFP_NOIO); 1476 1477 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1478 clone->bi_iter.bi_size = to_bytes(len); 1479 1480 if (bio_integrity(bio)) 1481 bio_integrity_trim(clone, 0, len); 1482 } 1483 1484 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1485 struct dm_target *ti, 1486 unsigned target_bio_nr) 1487 { 1488 struct dm_target_io *tio; 1489 struct bio *clone; 1490 1491 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1492 tio = container_of(clone, struct dm_target_io, clone); 1493 1494 tio->io = ci->io; 1495 tio->ti = ti; 1496 tio->target_bio_nr = target_bio_nr; 1497 1498 return tio; 1499 } 1500 1501 static void __clone_and_map_simple_bio(struct clone_info *ci, 1502 struct dm_target *ti, 1503 unsigned target_bio_nr, unsigned *len) 1504 { 1505 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); 1506 struct bio *clone = &tio->clone; 1507 1508 tio->len_ptr = len; 1509 1510 __bio_clone_fast(clone, ci->bio); 1511 if (len) 1512 bio_setup_sector(clone, ci->sector, *len); 1513 1514 __map_bio(tio); 1515 } 1516 1517 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1518 unsigned num_bios, unsigned *len) 1519 { 1520 unsigned target_bio_nr; 1521 1522 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1523 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1524 } 1525 1526 static int __send_empty_flush(struct clone_info *ci) 1527 { 1528 unsigned target_nr = 0; 1529 struct dm_target *ti; 1530 1531 BUG_ON(bio_has_data(ci->bio)); 1532 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1533 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1534 1535 return 0; 1536 } 1537 1538 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1539 sector_t sector, unsigned *len) 1540 { 1541 struct bio *bio = ci->bio; 1542 struct dm_target_io *tio; 1543 unsigned target_bio_nr; 1544 unsigned num_target_bios = 1; 1545 1546 /* 1547 * Does the target want to receive duplicate copies of the bio? 
1548 */ 1549 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1550 num_target_bios = ti->num_write_bios(ti, bio); 1551 1552 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1553 tio = alloc_tio(ci, ti, target_bio_nr); 1554 tio->len_ptr = len; 1555 clone_bio(tio, bio, sector, *len); 1556 __map_bio(tio); 1557 } 1558 } 1559 1560 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1561 1562 static unsigned get_num_discard_bios(struct dm_target *ti) 1563 { 1564 return ti->num_discard_bios; 1565 } 1566 1567 static unsigned get_num_write_same_bios(struct dm_target *ti) 1568 { 1569 return ti->num_write_same_bios; 1570 } 1571 1572 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1573 1574 static bool is_split_required_for_discard(struct dm_target *ti) 1575 { 1576 return ti->split_discard_bios; 1577 } 1578 1579 static int __send_changing_extent_only(struct clone_info *ci, 1580 get_num_bios_fn get_num_bios, 1581 is_split_required_fn is_split_required) 1582 { 1583 struct dm_target *ti; 1584 unsigned len; 1585 unsigned num_bios; 1586 1587 do { 1588 ti = dm_table_find_target(ci->map, ci->sector); 1589 if (!dm_target_is_valid(ti)) 1590 return -EIO; 1591 1592 /* 1593 * Even though the device advertised support for this type of 1594 * request, that does not mean every target supports it, and 1595 * reconfiguration might also have changed that since the 1596 * check was performed. 1597 */ 1598 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1599 if (!num_bios) 1600 return -EOPNOTSUPP; 1601 1602 if (is_split_required && !is_split_required(ti)) 1603 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1604 else 1605 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); 1606 1607 __send_duplicate_bios(ci, ti, num_bios, &len); 1608 1609 ci->sector += len; 1610 } while (ci->sector_count -= len); 1611 1612 return 0; 1613 } 1614 1615 static int __send_discard(struct clone_info *ci) 1616 { 1617 return __send_changing_extent_only(ci, get_num_discard_bios, 1618 is_split_required_for_discard); 1619 } 1620 1621 static int __send_write_same(struct clone_info *ci) 1622 { 1623 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1624 } 1625 1626 /* 1627 * Select the correct strategy for processing a non-flush bio. 1628 */ 1629 static int __split_and_process_non_flush(struct clone_info *ci) 1630 { 1631 struct bio *bio = ci->bio; 1632 struct dm_target *ti; 1633 unsigned len; 1634 1635 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1636 return __send_discard(ci); 1637 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) 1638 return __send_write_same(ci); 1639 1640 ti = dm_table_find_target(ci->map, ci->sector); 1641 if (!dm_target_is_valid(ti)) 1642 return -EIO; 1643 1644 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); 1645 1646 __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1647 1648 ci->sector += len; 1649 ci->sector_count -= len; 1650 1651 return 0; 1652 } 1653 1654 /* 1655 * Entry point to split a bio into clones and submit them to the targets. 
1656 */ 1657 static void __split_and_process_bio(struct mapped_device *md, 1658 struct dm_table *map, struct bio *bio) 1659 { 1660 struct clone_info ci; 1661 int error = 0; 1662 1663 if (unlikely(!map)) { 1664 bio_io_error(bio); 1665 return; 1666 } 1667 1668 ci.map = map; 1669 ci.md = md; 1670 ci.io = alloc_io(md); 1671 ci.io->error = 0; 1672 atomic_set(&ci.io->io_count, 1); 1673 ci.io->bio = bio; 1674 ci.io->md = md; 1675 spin_lock_init(&ci.io->endio_lock); 1676 ci.sector = bio->bi_iter.bi_sector; 1677 1678 start_io_acct(ci.io); 1679 1680 if (bio->bi_rw & REQ_FLUSH) { 1681 ci.bio = &ci.md->flush_bio; 1682 ci.sector_count = 0; 1683 error = __send_empty_flush(&ci); 1684 /* dec_pending submits any data associated with flush */ 1685 } else { 1686 ci.bio = bio; 1687 ci.sector_count = bio_sectors(bio); 1688 while (ci.sector_count && !error) 1689 error = __split_and_process_non_flush(&ci); 1690 } 1691 1692 /* drop the extra reference count */ 1693 dec_pending(ci.io, error); 1694 } 1695 /*----------------------------------------------------------------- 1696 * CRUD END 1697 *---------------------------------------------------------------*/ 1698 1699 static int dm_merge_bvec(struct request_queue *q, 1700 struct bvec_merge_data *bvm, 1701 struct bio_vec *biovec) 1702 { 1703 struct mapped_device *md = q->queuedata; 1704 struct dm_table *map = dm_get_live_table_fast(md); 1705 struct dm_target *ti; 1706 sector_t max_sectors; 1707 int max_size = 0; 1708 1709 if (unlikely(!map)) 1710 goto out; 1711 1712 ti = dm_table_find_target(map, bvm->bi_sector); 1713 if (!dm_target_is_valid(ti)) 1714 goto out; 1715 1716 /* 1717 * Find maximum amount of I/O that won't need splitting 1718 */ 1719 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1720 (sector_t) queue_max_sectors(q)); 1721 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1722 if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ 1723 max_size = 0; 1724 1725 /* 1726 * merge_bvec_fn() returns number of bytes 1727 * it can accept at this offset 1728 * max is precomputed maximal io size 1729 */ 1730 if (max_size && ti->type->merge) 1731 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1732 /* 1733 * If the target doesn't support merge method and some of the devices 1734 * provided their merge_bvec method (we know this by looking for the 1735 * max_hw_sectors that dm_set_device_limits may set), then we can't 1736 * allow bios with multiple vector entries. So always set max_size 1737 * to 0, and the code below allows just one page. 1738 */ 1739 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1740 max_size = 0; 1741 1742 out: 1743 dm_put_live_table_fast(md); 1744 /* 1745 * Always allow an entire first page 1746 */ 1747 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1748 max_size = biovec->bv_len; 1749 1750 return max_size; 1751 } 1752 1753 /* 1754 * The request function that just remaps the bio built up by 1755 * dm_merge_bvec. 
1756 */ 1757 static void dm_make_request(struct request_queue *q, struct bio *bio) 1758 { 1759 int rw = bio_data_dir(bio); 1760 struct mapped_device *md = q->queuedata; 1761 int srcu_idx; 1762 struct dm_table *map; 1763 1764 map = dm_get_live_table(md, &srcu_idx); 1765 1766 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); 1767 1768 /* if we're suspended, we have to queue this io for later */ 1769 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1770 dm_put_live_table(md, srcu_idx); 1771 1772 if (bio_rw(bio) != READA) 1773 queue_io(md, bio); 1774 else 1775 bio_io_error(bio); 1776 return; 1777 } 1778 1779 __split_and_process_bio(md, map, bio); 1780 dm_put_live_table(md, srcu_idx); 1781 return; 1782 } 1783 1784 int dm_request_based(struct mapped_device *md) 1785 { 1786 return blk_queue_stackable(md->queue); 1787 } 1788 1789 static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 1790 { 1791 int r; 1792 1793 if (blk_queue_io_stat(clone->q)) 1794 clone->cmd_flags |= REQ_IO_STAT; 1795 1796 clone->start_time = jiffies; 1797 r = blk_insert_cloned_request(clone->q, clone); 1798 if (r) 1799 /* must complete clone in terms of original request */ 1800 dm_complete_request(rq, r); 1801 } 1802 1803 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1804 void *data) 1805 { 1806 struct dm_rq_target_io *tio = data; 1807 struct dm_rq_clone_bio_info *info = 1808 container_of(bio, struct dm_rq_clone_bio_info, clone); 1809 1810 info->orig = bio_orig; 1811 info->tio = tio; 1812 bio->bi_end_io = end_clone_bio; 1813 1814 return 0; 1815 } 1816 1817 static int setup_clone(struct request *clone, struct request *rq, 1818 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1819 { 1820 int r; 1821 1822 r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, 1823 dm_rq_bio_constructor, tio); 1824 if (r) 1825 return r; 1826 1827 clone->cmd = rq->cmd; 1828 clone->cmd_len = rq->cmd_len; 1829 clone->sense = rq->sense; 1830 clone->end_io = end_clone_request; 1831 clone->end_io_data = tio; 1832 1833 tio->clone = clone; 1834 1835 return 0; 1836 } 1837 1838 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1839 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1840 { 1841 struct request *clone = alloc_clone_request(md, gfp_mask); 1842 1843 if (!clone) 1844 return NULL; 1845 1846 blk_rq_init(NULL, clone); 1847 if (setup_clone(clone, rq, tio, gfp_mask)) { 1848 /* -ENOMEM */ 1849 free_clone_request(md, clone); 1850 return NULL; 1851 } 1852 1853 return clone; 1854 } 1855 1856 static void map_tio_request(struct kthread_work *work); 1857 1858 static void init_tio(struct dm_rq_target_io *tio, struct request *rq, 1859 struct mapped_device *md) 1860 { 1861 tio->md = md; 1862 tio->ti = NULL; 1863 tio->clone = NULL; 1864 tio->orig = rq; 1865 tio->error = 0; 1866 memset(&tio->info, 0, sizeof(tio->info)); 1867 init_kthread_work(&tio->work, map_tio_request); 1868 } 1869 1870 static struct dm_rq_target_io *prep_tio(struct request *rq, 1871 struct mapped_device *md, gfp_t gfp_mask) 1872 { 1873 struct dm_rq_target_io *tio; 1874 int srcu_idx; 1875 struct dm_table *table; 1876 1877 tio = alloc_rq_tio(md, gfp_mask); 1878 if (!tio) 1879 return NULL; 1880 1881 init_tio(tio, rq, md); 1882 1883 table = dm_get_live_table(md, &srcu_idx); 1884 if (!dm_table_mq_request_based(table)) { 1885 if (!clone_rq(rq, md, tio, gfp_mask)) { 1886 dm_put_live_table(md, srcu_idx); 1887 free_rq_tio(tio); 1888 return NULL; 1889 } 1890 } 1891 dm_put_live_table(md, srcu_idx); 1892 
1893 return tio; 1894 } 1895 1896 /* 1897 * Called with the queue lock held. 1898 */ 1899 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1900 { 1901 struct mapped_device *md = q->queuedata; 1902 struct dm_rq_target_io *tio; 1903 1904 if (unlikely(rq->special)) { 1905 DMWARN("Already has something in rq->special."); 1906 return BLKPREP_KILL; 1907 } 1908 1909 tio = prep_tio(rq, md, GFP_ATOMIC); 1910 if (!tio) 1911 return BLKPREP_DEFER; 1912 1913 rq->special = tio; 1914 rq->cmd_flags |= REQ_DONTPREP; 1915 1916 return BLKPREP_OK; 1917 } 1918 1919 /* 1920 * Returns: 1921 * 0 : the request has been processed 1922 * DM_MAPIO_REQUEUE : the original request needs to be requeued 1923 * < 0 : the request was completed due to failure 1924 */ 1925 static int map_request(struct dm_rq_target_io *tio, struct request *rq, 1926 struct mapped_device *md) 1927 { 1928 int r; 1929 struct dm_target *ti = tio->ti; 1930 struct request *clone = NULL; 1931 1932 if (tio->clone) { 1933 clone = tio->clone; 1934 r = ti->type->map_rq(ti, clone, &tio->info); 1935 } else { 1936 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 1937 if (r < 0) { 1938 /* The target wants to complete the I/O */ 1939 dm_kill_unmapped_request(rq, r); 1940 return r; 1941 } 1942 if (IS_ERR(clone)) 1943 return DM_MAPIO_REQUEUE; 1944 if (setup_clone(clone, rq, tio, GFP_NOIO)) { 1945 /* -ENOMEM */ 1946 ti->type->release_clone_rq(clone); 1947 return DM_MAPIO_REQUEUE; 1948 } 1949 } 1950 1951 switch (r) { 1952 case DM_MAPIO_SUBMITTED: 1953 /* The target has taken the I/O to submit by itself later */ 1954 break; 1955 case DM_MAPIO_REMAPPED: 1956 /* The target has remapped the I/O so dispatch it */ 1957 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1958 blk_rq_pos(rq)); 1959 dm_dispatch_clone_request(clone, rq); 1960 break; 1961 case DM_MAPIO_REQUEUE: 1962 /* The target wants to requeue the I/O */ 1963 dm_requeue_unmapped_request(clone); 1964 break; 1965 default: 1966 if (r > 0) { 1967 DMWARN("unimplemented target map return value: %d", r); 1968 BUG(); 1969 } 1970 1971 /* The target wants to complete the I/O */ 1972 dm_kill_unmapped_request(rq, r); 1973 return r; 1974 } 1975 1976 return 0; 1977 } 1978 1979 static void map_tio_request(struct kthread_work *work) 1980 { 1981 struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 1982 struct request *rq = tio->orig; 1983 struct mapped_device *md = tio->md; 1984 1985 if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) 1986 dm_requeue_unmapped_original_request(md, rq); 1987 } 1988 1989 static void dm_start_request(struct mapped_device *md, struct request *orig) 1990 { 1991 if (!orig->q->mq_ops) 1992 blk_start_request(orig); 1993 else 1994 blk_mq_start_request(orig); 1995 atomic_inc(&md->pending[rq_data_dir(orig)]); 1996 1997 if (md->seq_rq_merge_deadline_usecs) { 1998 md->last_rq_pos = rq_end_sector(orig); 1999 md->last_rq_rw = rq_data_dir(orig); 2000 md->last_rq_start_time = ktime_get(); 2001 } 2002 2003 /* 2004 * Hold the md reference here for the in-flight I/O. 2005 * We can't rely on the reference count by device opener, 2006 * because the device may be closed during the request completion 2007 * when all bios are completed. 2008 * See the comment in rq_completed() too. 
2009 */ 2010 dm_get(md); 2011 } 2012 2013 #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 2014 2015 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) 2016 { 2017 return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); 2018 } 2019 2020 ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, 2021 const char *buf, size_t count) 2022 { 2023 unsigned deadline; 2024 2025 if (!dm_request_based(md)) 2026 return count; 2027 2028 if (kstrtouint(buf, 10, &deadline)) 2029 return -EINVAL; 2030 2031 if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) 2032 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; 2033 2034 md->seq_rq_merge_deadline_usecs = deadline; 2035 2036 return count; 2037 } 2038 2039 static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) 2040 { 2041 ktime_t kt_deadline; 2042 2043 if (!md->seq_rq_merge_deadline_usecs) 2044 return false; 2045 2046 kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); 2047 kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); 2048 2049 return !ktime_after(ktime_get(), kt_deadline); 2050 } 2051 2052 /* 2053 * q->request_fn for request-based dm. 2054 * Called with the queue lock held. 2055 */ 2056 static void dm_request_fn(struct request_queue *q) 2057 { 2058 struct mapped_device *md = q->queuedata; 2059 int srcu_idx; 2060 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 2061 struct dm_target *ti; 2062 struct request *rq; 2063 struct dm_rq_target_io *tio; 2064 sector_t pos; 2065 2066 /* 2067 * For suspend, check blk_queue_stopped() and increment 2068 * ->pending within a single queue_lock not to increment the 2069 * number of in-flight I/Os after the queue is stopped in 2070 * dm_suspend(). 2071 */ 2072 while (!blk_queue_stopped(q)) { 2073 rq = blk_peek_request(q); 2074 if (!rq) 2075 goto out; 2076 2077 /* always use block 0 to find the target for flushes for now */ 2078 pos = 0; 2079 if (!(rq->cmd_flags & REQ_FLUSH)) 2080 pos = blk_rq_pos(rq); 2081 2082 ti = dm_table_find_target(map, pos); 2083 if (!dm_target_is_valid(ti)) { 2084 /* 2085 * Must perform setup, that rq_completed() requires, 2086 * before calling dm_kill_unmapped_request 2087 */ 2088 DMERR_LIMIT("request attempted access beyond the end of device"); 2089 dm_start_request(md, rq); 2090 dm_kill_unmapped_request(rq, -EIO); 2091 continue; 2092 } 2093 2094 if (dm_request_peeked_before_merge_deadline(md) && 2095 md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && 2096 md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) 2097 goto delay_and_out; 2098 2099 if (ti->type->busy && ti->type->busy(ti)) 2100 goto delay_and_out; 2101 2102 dm_start_request(md, rq); 2103 2104 tio = tio_from_request(rq); 2105 /* Establish tio->ti before queuing work (map_tio_request) */ 2106 tio->ti = ti; 2107 queue_kthread_work(&md->kworker, &tio->work); 2108 BUG_ON(!irqs_disabled()); 2109 } 2110 2111 goto out; 2112 2113 delay_and_out: 2114 blk_delay_queue(q, HZ / 100); 2115 out: 2116 dm_put_live_table(md, srcu_idx); 2117 } 2118 2119 static int dm_any_congested(void *congested_data, int bdi_bits) 2120 { 2121 int r = bdi_bits; 2122 struct mapped_device *md = congested_data; 2123 struct dm_table *map; 2124 2125 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2126 map = dm_get_live_table_fast(md); 2127 if (map) { 2128 /* 2129 * Request-based dm cares about only own queue for 2130 * the query about congestion status of request_queue 2131 */ 2132 if (dm_request_based(md)) 2133 r = 
md->queue->backing_dev_info.state & 2134 bdi_bits; 2135 else 2136 r = dm_table_any_congested(map, bdi_bits); 2137 } 2138 dm_put_live_table_fast(md); 2139 } 2140 2141 return r; 2142 } 2143 2144 /*----------------------------------------------------------------- 2145 * An IDR is used to keep track of allocated minor numbers. 2146 *---------------------------------------------------------------*/ 2147 static void free_minor(int minor) 2148 { 2149 spin_lock(&_minor_lock); 2150 idr_remove(&_minor_idr, minor); 2151 spin_unlock(&_minor_lock); 2152 } 2153 2154 /* 2155 * See if the device with a specific minor # is free. 2156 */ 2157 static int specific_minor(int minor) 2158 { 2159 int r; 2160 2161 if (minor >= (1 << MINORBITS)) 2162 return -EINVAL; 2163 2164 idr_preload(GFP_KERNEL); 2165 spin_lock(&_minor_lock); 2166 2167 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 2168 2169 spin_unlock(&_minor_lock); 2170 idr_preload_end(); 2171 if (r < 0) 2172 return r == -ENOSPC ? -EBUSY : r; 2173 return 0; 2174 } 2175 2176 static int next_free_minor(int *minor) 2177 { 2178 int r; 2179 2180 idr_preload(GFP_KERNEL); 2181 spin_lock(&_minor_lock); 2182 2183 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 2184 2185 spin_unlock(&_minor_lock); 2186 idr_preload_end(); 2187 if (r < 0) 2188 return r; 2189 *minor = r; 2190 return 0; 2191 } 2192 2193 static const struct block_device_operations dm_blk_dops; 2194 2195 static void dm_wq_work(struct work_struct *work); 2196 2197 static void dm_init_md_queue(struct mapped_device *md) 2198 { 2199 /* 2200 * Request-based dm devices cannot be stacked on top of bio-based dm 2201 * devices. The type of this dm device may not have been decided yet. 2202 * The type is decided at the first table loading time. 2203 * To prevent problematic device stacking, clear the queue flag 2204 * for request stacking support until then. 2205 * 2206 * This queue is new, so no concurrency on the queue_flags. 2207 */ 2208 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2209 } 2210 2211 static void dm_init_old_md_queue(struct mapped_device *md) 2212 { 2213 dm_init_md_queue(md); 2214 2215 /* 2216 * Initialize aspects of queue that aren't relevant for blk-mq 2217 */ 2218 md->queue->queuedata = md; 2219 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2220 md->queue->backing_dev_info.congested_data = md; 2221 2222 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2223 } 2224 2225 /* 2226 * Allocate and initialise a blank device with a given minor. 
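 *
 * The chosen minor is first reserved in the IDR with the MINOR_ALLOCED
 * placeholder; only once construction has succeeded is that entry
 * replaced (idr_replace below) with the real mapped_device pointer, so
 * concurrent lookups never see a half-initialised device.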
2227 */ 2228 static struct mapped_device *alloc_dev(int minor) 2229 { 2230 int r; 2231 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 2232 void *old_md; 2233 2234 if (!md) { 2235 DMWARN("unable to allocate device, out of memory."); 2236 return NULL; 2237 } 2238 2239 if (!try_module_get(THIS_MODULE)) 2240 goto bad_module_get; 2241 2242 /* get a minor number for the dev */ 2243 if (minor == DM_ANY_MINOR) 2244 r = next_free_minor(&minor); 2245 else 2246 r = specific_minor(minor); 2247 if (r < 0) 2248 goto bad_minor; 2249 2250 r = init_srcu_struct(&md->io_barrier); 2251 if (r < 0) 2252 goto bad_io_barrier; 2253 2254 md->type = DM_TYPE_NONE; 2255 mutex_init(&md->suspend_lock); 2256 mutex_init(&md->type_lock); 2257 mutex_init(&md->table_devices_lock); 2258 spin_lock_init(&md->deferred_lock); 2259 atomic_set(&md->holders, 1); 2260 atomic_set(&md->open_count, 0); 2261 atomic_set(&md->event_nr, 0); 2262 atomic_set(&md->uevent_seq, 0); 2263 INIT_LIST_HEAD(&md->uevent_list); 2264 INIT_LIST_HEAD(&md->table_devices); 2265 spin_lock_init(&md->uevent_lock); 2266 2267 md->queue = blk_alloc_queue(GFP_KERNEL); 2268 if (!md->queue) 2269 goto bad_queue; 2270 2271 dm_init_md_queue(md); 2272 2273 md->disk = alloc_disk(1); 2274 if (!md->disk) 2275 goto bad_disk; 2276 2277 atomic_set(&md->pending[0], 0); 2278 atomic_set(&md->pending[1], 0); 2279 init_waitqueue_head(&md->wait); 2280 INIT_WORK(&md->work, dm_wq_work); 2281 init_waitqueue_head(&md->eventq); 2282 init_completion(&md->kobj_holder.completion); 2283 md->kworker_task = NULL; 2284 2285 md->disk->major = _major; 2286 md->disk->first_minor = minor; 2287 md->disk->fops = &dm_blk_dops; 2288 md->disk->queue = md->queue; 2289 md->disk->private_data = md; 2290 sprintf(md->disk->disk_name, "dm-%d", minor); 2291 add_disk(md->disk); 2292 format_dev_t(md->name, MKDEV(_major, minor)); 2293 2294 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2295 if (!md->wq) 2296 goto bad_thread; 2297 2298 md->bdev = bdget_disk(md->disk, 0); 2299 if (!md->bdev) 2300 goto bad_bdev; 2301 2302 bio_init(&md->flush_bio); 2303 md->flush_bio.bi_bdev = md->bdev; 2304 md->flush_bio.bi_rw = WRITE_FLUSH; 2305 2306 dm_stats_init(&md->stats); 2307 2308 /* Populate the mapping, nobody knows we exist yet */ 2309 spin_lock(&_minor_lock); 2310 old_md = idr_replace(&_minor_idr, md, minor); 2311 spin_unlock(&_minor_lock); 2312 2313 BUG_ON(old_md != MINOR_ALLOCED); 2314 2315 return md; 2316 2317 bad_bdev: 2318 destroy_workqueue(md->wq); 2319 bad_thread: 2320 del_gendisk(md->disk); 2321 put_disk(md->disk); 2322 bad_disk: 2323 blk_cleanup_queue(md->queue); 2324 bad_queue: 2325 cleanup_srcu_struct(&md->io_barrier); 2326 bad_io_barrier: 2327 free_minor(minor); 2328 bad_minor: 2329 module_put(THIS_MODULE); 2330 bad_module_get: 2331 kfree(md); 2332 return NULL; 2333 } 2334 2335 static void unlock_fs(struct mapped_device *md); 2336 2337 static void free_dev(struct mapped_device *md) 2338 { 2339 int minor = MINOR(disk_devt(md->disk)); 2340 bool using_blk_mq = !!md->queue->mq_ops; 2341 2342 unlock_fs(md); 2343 destroy_workqueue(md->wq); 2344 2345 if (md->kworker_task) 2346 kthread_stop(md->kworker_task); 2347 if (md->io_pool) 2348 mempool_destroy(md->io_pool); 2349 if (md->rq_pool) 2350 mempool_destroy(md->rq_pool); 2351 if (md->bs) 2352 bioset_free(md->bs); 2353 2354 cleanup_srcu_struct(&md->io_barrier); 2355 free_table_devices(&md->table_devices); 2356 dm_stats_cleanup(&md->stats); 2357 2358 spin_lock(&_minor_lock); 2359 md->disk->private_data = NULL; 2360 spin_unlock(&_minor_lock); 2361 
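	/*
	 * disk->private_data was cleared under _minor_lock above so that a
	 * racing open cannot pick up a mapped_device that is being torn
	 * down (dm_blk_open() is expected to resolve the md from
	 * disk->private_data under the same lock).
	 */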
if (blk_get_integrity(md->disk)) 2362 blk_integrity_unregister(md->disk); 2363 del_gendisk(md->disk); 2364 put_disk(md->disk); 2365 blk_cleanup_queue(md->queue); 2366 if (using_blk_mq) 2367 blk_mq_free_tag_set(&md->tag_set); 2368 bdput(md->bdev); 2369 free_minor(minor); 2370 2371 module_put(THIS_MODULE); 2372 kfree(md); 2373 } 2374 2375 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2376 { 2377 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2378 2379 if (md->io_pool && md->bs) { 2380 /* The md already has necessary mempools. */ 2381 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2382 /* 2383 * Reload bioset because front_pad may have changed 2384 * because a different table was loaded. 2385 */ 2386 bioset_free(md->bs); 2387 md->bs = p->bs; 2388 p->bs = NULL; 2389 } 2390 /* 2391 * There's no need to reload with request-based dm 2392 * because the size of front_pad doesn't change. 2393 * Note for future: If you are to reload bioset, 2394 * prep-ed requests in the queue may refer 2395 * to bio from the old bioset, so you must walk 2396 * through the queue to unprep. 2397 */ 2398 goto out; 2399 } 2400 2401 BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); 2402 2403 md->io_pool = p->io_pool; 2404 p->io_pool = NULL; 2405 md->rq_pool = p->rq_pool; 2406 p->rq_pool = NULL; 2407 md->bs = p->bs; 2408 p->bs = NULL; 2409 2410 out: 2411 /* mempool bind completed, now no need any mempools in the table */ 2412 dm_table_free_md_mempools(t); 2413 } 2414 2415 /* 2416 * Bind a table to the device. 2417 */ 2418 static void event_callback(void *context) 2419 { 2420 unsigned long flags; 2421 LIST_HEAD(uevents); 2422 struct mapped_device *md = (struct mapped_device *) context; 2423 2424 spin_lock_irqsave(&md->uevent_lock, flags); 2425 list_splice_init(&md->uevent_list, &uevents); 2426 spin_unlock_irqrestore(&md->uevent_lock, flags); 2427 2428 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2429 2430 atomic_inc(&md->event_nr); 2431 wake_up(&md->eventq); 2432 } 2433 2434 /* 2435 * Protected by md->suspend_lock obtained by dm_swap_table(). 2436 */ 2437 static void __set_size(struct mapped_device *md, sector_t size) 2438 { 2439 set_capacity(md->disk, size); 2440 2441 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2442 } 2443 2444 /* 2445 * Return 1 if the queue has a compulsory merge_bvec_fn function. 2446 * 2447 * If this function returns 0, then the device is either a non-dm 2448 * device without a merge_bvec_fn, or it is a dm device that is 2449 * able to split any bios it receives that are too big. 2450 */ 2451 int dm_queue_merge_is_compulsory(struct request_queue *q) 2452 { 2453 struct mapped_device *dev_md; 2454 2455 if (!q->merge_bvec_fn) 2456 return 0; 2457 2458 if (q->make_request_fn == dm_make_request) { 2459 dev_md = q->queuedata; 2460 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2461 return 0; 2462 } 2463 2464 return 1; 2465 } 2466 2467 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2468 struct dm_dev *dev, sector_t start, 2469 sector_t len, void *data) 2470 { 2471 struct block_device *bdev = dev->bdev; 2472 struct request_queue *q = bdev_get_queue(bdev); 2473 2474 return dm_queue_merge_is_compulsory(q); 2475 } 2476 2477 /* 2478 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2479 * on the properties of the underlying devices. 
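 *
 * __bind() caches this per-table answer in the DMF_MERGE_IS_OPTIONAL
 * flag, which is what dm_queue_merge_is_compulsory() above checks when
 * dm devices are stacked on top of each other.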
2480 */ 2481 static int dm_table_merge_is_optional(struct dm_table *table) 2482 { 2483 unsigned i = 0; 2484 struct dm_target *ti; 2485 2486 while (i < dm_table_get_num_targets(table)) { 2487 ti = dm_table_get_target(table, i++); 2488 2489 if (ti->type->iterate_devices && 2490 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2491 return 0; 2492 } 2493 2494 return 1; 2495 } 2496 2497 /* 2498 * Returns old map, which caller must destroy. 2499 */ 2500 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2501 struct queue_limits *limits) 2502 { 2503 struct dm_table *old_map; 2504 struct request_queue *q = md->queue; 2505 sector_t size; 2506 int merge_is_optional; 2507 2508 size = dm_table_get_size(t); 2509 2510 /* 2511 * Wipe any geometry if the size of the table changed. 2512 */ 2513 if (size != dm_get_size(md)) 2514 memset(&md->geometry, 0, sizeof(md->geometry)); 2515 2516 __set_size(md, size); 2517 2518 dm_table_event_callback(t, event_callback, md); 2519 2520 /* 2521 * The queue hasn't been stopped yet, if the old table type wasn't 2522 * for request-based during suspension. So stop it to prevent 2523 * I/O mapping before resume. 2524 * This must be done before setting the queue restrictions, 2525 * because request-based dm may be run just after the setting. 2526 */ 2527 if (dm_table_request_based(t)) 2528 stop_queue(q); 2529 2530 __bind_mempools(md, t); 2531 2532 merge_is_optional = dm_table_merge_is_optional(t); 2533 2534 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2535 rcu_assign_pointer(md->map, t); 2536 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2537 2538 dm_table_set_restrictions(t, q, limits); 2539 if (merge_is_optional) 2540 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2541 else 2542 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2543 if (old_map) 2544 dm_sync_table(md); 2545 2546 return old_map; 2547 } 2548 2549 /* 2550 * Returns unbound table for the caller to free. 2551 */ 2552 static struct dm_table *__unbind(struct mapped_device *md) 2553 { 2554 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2555 2556 if (!map) 2557 return NULL; 2558 2559 dm_table_event_callback(map, NULL, NULL); 2560 RCU_INIT_POINTER(md->map, NULL); 2561 dm_sync_table(md); 2562 2563 return map; 2564 } 2565 2566 /* 2567 * Constructor for a new device. 2568 */ 2569 int dm_create(int minor, struct mapped_device **result) 2570 { 2571 struct mapped_device *md; 2572 2573 md = alloc_dev(minor); 2574 if (!md) 2575 return -ENXIO; 2576 2577 dm_sysfs_init(md); 2578 2579 *result = md; 2580 return 0; 2581 } 2582 2583 /* 2584 * Functions to manage md->type. 2585 * All are required to hold md->type_lock. 2586 */ 2587 void dm_lock_md_type(struct mapped_device *md) 2588 { 2589 mutex_lock(&md->type_lock); 2590 } 2591 2592 void dm_unlock_md_type(struct mapped_device *md) 2593 { 2594 mutex_unlock(&md->type_lock); 2595 } 2596 2597 void dm_set_md_type(struct mapped_device *md, unsigned type) 2598 { 2599 BUG_ON(!mutex_is_locked(&md->type_lock)); 2600 md->type = type; 2601 } 2602 2603 unsigned dm_get_md_type(struct mapped_device *md) 2604 { 2605 BUG_ON(!mutex_is_locked(&md->type_lock)); 2606 return md->type; 2607 } 2608 2609 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2610 { 2611 return md->immutable_target_type; 2612 } 2613 2614 /* 2615 * The queue_limits are only valid as long as you have a reference 2616 * count on 'md'. 
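 *
 * A minimal caller-side sketch (assuming the device is looked up by
 * dev_t):
 *
 *	struct mapped_device *md = dm_get_md(dev);	// takes a reference
 *	if (md) {
 *		struct queue_limits *limits = dm_get_queue_limits(md);
 *		...					// limits valid only while md is held
 *		dm_put(md);
 *	}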
2617 */ 2618 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2619 { 2620 BUG_ON(!atomic_read(&md->holders)); 2621 return &md->queue->limits; 2622 } 2623 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2624 2625 static void init_rq_based_worker_thread(struct mapped_device *md) 2626 { 2627 /* Initialize the request-based DM worker thread */ 2628 init_kthread_worker(&md->kworker); 2629 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, 2630 "kdmwork-%s", dm_device_name(md)); 2631 } 2632 2633 /* 2634 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2635 */ 2636 static int dm_init_request_based_queue(struct mapped_device *md) 2637 { 2638 struct request_queue *q = NULL; 2639 2640 if (md->queue->elevator) 2641 return 0; 2642 2643 /* Fully initialize the queue */ 2644 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2645 if (!q) 2646 return -EINVAL; 2647 2648 /* disable dm_request_fn's merge heuristic by default */ 2649 md->seq_rq_merge_deadline_usecs = 0; 2650 2651 md->queue = q; 2652 dm_init_old_md_queue(md); 2653 blk_queue_softirq_done(md->queue, dm_softirq_done); 2654 blk_queue_prep_rq(md->queue, dm_prep_fn); 2655 2656 init_rq_based_worker_thread(md); 2657 2658 elv_register_queue(md->queue); 2659 2660 return 0; 2661 } 2662 2663 static int dm_mq_init_request(void *data, struct request *rq, 2664 unsigned int hctx_idx, unsigned int request_idx, 2665 unsigned int numa_node) 2666 { 2667 struct mapped_device *md = data; 2668 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 2669 2670 /* 2671 * Must initialize md member of tio, otherwise it won't 2672 * be available in dm_mq_queue_rq. 2673 */ 2674 tio->md = md; 2675 2676 return 0; 2677 } 2678 2679 static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 2680 const struct blk_mq_queue_data *bd) 2681 { 2682 struct request *rq = bd->rq; 2683 struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); 2684 struct mapped_device *md = tio->md; 2685 int srcu_idx; 2686 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 2687 struct dm_target *ti; 2688 sector_t pos; 2689 2690 /* always use block 0 to find the target for flushes for now */ 2691 pos = 0; 2692 if (!(rq->cmd_flags & REQ_FLUSH)) 2693 pos = blk_rq_pos(rq); 2694 2695 ti = dm_table_find_target(map, pos); 2696 if (!dm_target_is_valid(ti)) { 2697 dm_put_live_table(md, srcu_idx); 2698 DMERR_LIMIT("request attempted access beyond the end of device"); 2699 /* 2700 * Must perform setup, that rq_completed() requires, 2701 * before returning BLK_MQ_RQ_QUEUE_ERROR 2702 */ 2703 dm_start_request(md, rq); 2704 return BLK_MQ_RQ_QUEUE_ERROR; 2705 } 2706 dm_put_live_table(md, srcu_idx); 2707 2708 if (ti->type->busy && ti->type->busy(ti)) 2709 return BLK_MQ_RQ_QUEUE_BUSY; 2710 2711 dm_start_request(md, rq); 2712 2713 /* Init tio using md established in .init_request */ 2714 init_tio(tio, rq, md); 2715 2716 /* Establish tio->ti before queuing work (map_tio_request) */ 2717 tio->ti = ti; 2718 queue_kthread_work(&md->kworker, &tio->work); 2719 2720 return BLK_MQ_RQ_QUEUE_OK; 2721 } 2722 2723 static struct blk_mq_ops dm_mq_ops = { 2724 .queue_rq = dm_mq_queue_rq, 2725 .map_queue = blk_mq_map_queue, 2726 .complete = dm_softirq_done, 2727 .init_request = dm_mq_init_request, 2728 }; 2729 2730 static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) 2731 { 2732 struct request_queue *q; 2733 int err; 2734 2735 memset(&md->tag_set, 0, sizeof(md->tag_set)); 2736 md->tag_set.ops = &dm_mq_ops; 2737 md->tag_set.queue_depth = BLKDEV_MAX_RQ; 2738 
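	/*
	 * A single hardware queue is used; the per-request payload
	 * (cmd_size below) embeds struct dm_rq_target_io, which
	 * dm_mq_init_request() and dm_mq_queue_rq() later retrieve with
	 * blk_mq_rq_to_pdu().
	 */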
md->tag_set.numa_node = NUMA_NO_NODE; 2739 md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 2740 md->tag_set.nr_hw_queues = 1; 2741 md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); 2742 md->tag_set.driver_data = md; 2743 2744 err = blk_mq_alloc_tag_set(&md->tag_set); 2745 if (err) 2746 return err; 2747 2748 q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); 2749 if (IS_ERR(q)) { 2750 err = PTR_ERR(q); 2751 goto out_tag_set; 2752 } 2753 md->queue = q; 2754 dm_init_md_queue(md); 2755 2756 /* backfill 'mq' sysfs registration normally done in blk_register_queue */ 2757 blk_mq_register_disk(md->disk); 2758 2759 init_rq_based_worker_thread(md); 2760 2761 return 0; 2762 2763 out_tag_set: 2764 blk_mq_free_tag_set(&md->tag_set); 2765 return err; 2766 } 2767 2768 /* 2769 * Setup the DM device's queue based on md's type 2770 */ 2771 int dm_setup_md_queue(struct mapped_device *md) 2772 { 2773 int r; 2774 unsigned md_type = dm_get_md_type(md); 2775 2776 switch (md_type) { 2777 case DM_TYPE_REQUEST_BASED: 2778 r = dm_init_request_based_queue(md); 2779 if (r) { 2780 DMWARN("Cannot initialize queue for request-based mapped device"); 2781 return r; 2782 } 2783 break; 2784 case DM_TYPE_MQ_REQUEST_BASED: 2785 r = dm_init_request_based_blk_mq_queue(md); 2786 if (r) { 2787 DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); 2788 return r; 2789 } 2790 break; 2791 case DM_TYPE_BIO_BASED: 2792 dm_init_old_md_queue(md); 2793 blk_queue_make_request(md->queue, dm_make_request); 2794 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 2795 break; 2796 } 2797 2798 return 0; 2799 } 2800 2801 struct mapped_device *dm_get_md(dev_t dev) 2802 { 2803 struct mapped_device *md; 2804 unsigned minor = MINOR(dev); 2805 2806 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2807 return NULL; 2808 2809 spin_lock(&_minor_lock); 2810 2811 md = idr_find(&_minor_idr, minor); 2812 if (md) { 2813 if ((md == MINOR_ALLOCED || 2814 (MINOR(disk_devt(dm_disk(md))) != minor) || 2815 dm_deleting_md(md) || 2816 test_bit(DMF_FREEING, &md->flags))) { 2817 md = NULL; 2818 goto out; 2819 } 2820 dm_get(md); 2821 } 2822 2823 out: 2824 spin_unlock(&_minor_lock); 2825 2826 return md; 2827 } 2828 EXPORT_SYMBOL_GPL(dm_get_md); 2829 2830 void *dm_get_mdptr(struct mapped_device *md) 2831 { 2832 return md->interface_ptr; 2833 } 2834 2835 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2836 { 2837 md->interface_ptr = ptr; 2838 } 2839 2840 void dm_get(struct mapped_device *md) 2841 { 2842 atomic_inc(&md->holders); 2843 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2844 } 2845 2846 int dm_hold(struct mapped_device *md) 2847 { 2848 spin_lock(&_minor_lock); 2849 if (test_bit(DMF_FREEING, &md->flags)) { 2850 spin_unlock(&_minor_lock); 2851 return -EBUSY; 2852 } 2853 dm_get(md); 2854 spin_unlock(&_minor_lock); 2855 return 0; 2856 } 2857 EXPORT_SYMBOL_GPL(dm_hold); 2858 2859 const char *dm_device_name(struct mapped_device *md) 2860 { 2861 return md->name; 2862 } 2863 EXPORT_SYMBOL_GPL(dm_device_name); 2864 2865 static void __dm_destroy(struct mapped_device *md, bool wait) 2866 { 2867 struct dm_table *map; 2868 int srcu_idx; 2869 2870 might_sleep(); 2871 2872 map = dm_get_live_table(md, &srcu_idx); 2873 2874 spin_lock(&_minor_lock); 2875 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2876 set_bit(DMF_FREEING, &md->flags); 2877 spin_unlock(&_minor_lock); 2878 2879 if (dm_request_based(md)) 2880 flush_kthread_worker(&md->kworker); 2881 2882 /* 2883 * Take suspend_lock so that presuspend and 
postsuspend methods 2884 * do not race with internal suspend. 2885 */ 2886 mutex_lock(&md->suspend_lock); 2887 if (!dm_suspended_md(md)) { 2888 dm_table_presuspend_targets(map); 2889 dm_table_postsuspend_targets(map); 2890 } 2891 mutex_unlock(&md->suspend_lock); 2892 2893 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2894 dm_put_live_table(md, srcu_idx); 2895 2896 /* 2897 * Rare, but there may be I/O requests still going to complete, 2898 * for example. Wait for all references to disappear. 2899 * No one should increment the reference count of the mapped_device, 2900 * after the mapped_device state becomes DMF_FREEING. 2901 */ 2902 if (wait) 2903 while (atomic_read(&md->holders)) 2904 msleep(1); 2905 else if (atomic_read(&md->holders)) 2906 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2907 dm_device_name(md), atomic_read(&md->holders)); 2908 2909 dm_sysfs_exit(md); 2910 dm_table_destroy(__unbind(md)); 2911 free_dev(md); 2912 } 2913 2914 void dm_destroy(struct mapped_device *md) 2915 { 2916 __dm_destroy(md, true); 2917 } 2918 2919 void dm_destroy_immediate(struct mapped_device *md) 2920 { 2921 __dm_destroy(md, false); 2922 } 2923 2924 void dm_put(struct mapped_device *md) 2925 { 2926 atomic_dec(&md->holders); 2927 } 2928 EXPORT_SYMBOL_GPL(dm_put); 2929 2930 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2931 { 2932 int r = 0; 2933 DECLARE_WAITQUEUE(wait, current); 2934 2935 add_wait_queue(&md->wait, &wait); 2936 2937 while (1) { 2938 set_current_state(interruptible); 2939 2940 if (!md_in_flight(md)) 2941 break; 2942 2943 if (interruptible == TASK_INTERRUPTIBLE && 2944 signal_pending(current)) { 2945 r = -EINTR; 2946 break; 2947 } 2948 2949 io_schedule(); 2950 } 2951 set_current_state(TASK_RUNNING); 2952 2953 remove_wait_queue(&md->wait, &wait); 2954 2955 return r; 2956 } 2957 2958 /* 2959 * Process the deferred bios 2960 */ 2961 static void dm_wq_work(struct work_struct *work) 2962 { 2963 struct mapped_device *md = container_of(work, struct mapped_device, 2964 work); 2965 struct bio *c; 2966 int srcu_idx; 2967 struct dm_table *map; 2968 2969 map = dm_get_live_table(md, &srcu_idx); 2970 2971 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2972 spin_lock_irq(&md->deferred_lock); 2973 c = bio_list_pop(&md->deferred); 2974 spin_unlock_irq(&md->deferred_lock); 2975 2976 if (!c) 2977 break; 2978 2979 if (dm_request_based(md)) 2980 generic_make_request(c); 2981 else 2982 __split_and_process_bio(md, map, c); 2983 } 2984 2985 dm_put_live_table(md, srcu_idx); 2986 } 2987 2988 static void dm_queue_flush(struct mapped_device *md) 2989 { 2990 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2991 smp_mb__after_atomic(); 2992 queue_work(md->wq, &md->work); 2993 } 2994 2995 /* 2996 * Swap in a new table, returning the old one for the caller to destroy. 2997 */ 2998 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2999 { 3000 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 3001 struct queue_limits limits; 3002 int r; 3003 3004 mutex_lock(&md->suspend_lock); 3005 3006 /* device must be suspended */ 3007 if (!dm_suspended_md(md)) 3008 goto out; 3009 3010 /* 3011 * If the new table has no data devices, retain the existing limits. 3012 * This helps multipath with queue_if_no_path if all paths disappear, 3013 * then new I/O is queued based on these limits, and then some paths 3014 * reappear. 
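 *
 * In that case the limits already applied to md->queue are copied and
 * reused as-is rather than recalculated from a table that has no data
 * devices to derive them from.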
3015 */ 3016 if (dm_table_has_no_data_devices(table)) { 3017 live_map = dm_get_live_table_fast(md); 3018 if (live_map) 3019 limits = md->queue->limits; 3020 dm_put_live_table_fast(md); 3021 } 3022 3023 if (!live_map) { 3024 r = dm_calculate_queue_limits(table, &limits); 3025 if (r) { 3026 map = ERR_PTR(r); 3027 goto out; 3028 } 3029 } 3030 3031 map = __bind(md, table, &limits); 3032 3033 out: 3034 mutex_unlock(&md->suspend_lock); 3035 return map; 3036 } 3037 3038 /* 3039 * Functions to lock and unlock any filesystem running on the 3040 * device. 3041 */ 3042 static int lock_fs(struct mapped_device *md) 3043 { 3044 int r; 3045 3046 WARN_ON(md->frozen_sb); 3047 3048 md->frozen_sb = freeze_bdev(md->bdev); 3049 if (IS_ERR(md->frozen_sb)) { 3050 r = PTR_ERR(md->frozen_sb); 3051 md->frozen_sb = NULL; 3052 return r; 3053 } 3054 3055 set_bit(DMF_FROZEN, &md->flags); 3056 3057 return 0; 3058 } 3059 3060 static void unlock_fs(struct mapped_device *md) 3061 { 3062 if (!test_bit(DMF_FROZEN, &md->flags)) 3063 return; 3064 3065 thaw_bdev(md->bdev, md->frozen_sb); 3066 md->frozen_sb = NULL; 3067 clear_bit(DMF_FROZEN, &md->flags); 3068 } 3069 3070 /* 3071 * If __dm_suspend returns 0, the device is completely quiescent 3072 * now. There is no request-processing activity. All new requests 3073 * are being added to the md->deferred list. 3074 * 3075 * Caller must hold md->suspend_lock 3076 */ 3077 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 3078 unsigned suspend_flags, int interruptible) 3079 { 3080 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 3081 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 3082 int r; 3083 3084 /* 3085 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 3086 * This flag is cleared before dm_suspend returns. 3087 */ 3088 if (noflush) 3089 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 3090 3091 /* 3092 * This gets reverted if there's an error later and the targets 3093 * provide the .presuspend_undo hook. 3094 */ 3095 dm_table_presuspend_targets(map); 3096 3097 /* 3098 * Flush I/O to the device. 3099 * Any I/O submitted after lock_fs() may not be flushed. 3100 * noflush takes precedence over do_lockfs. 3101 * (lock_fs() flushes I/Os and waits for them to complete.) 3102 */ 3103 if (!noflush && do_lockfs) { 3104 r = lock_fs(md); 3105 if (r) { 3106 dm_table_presuspend_undo_targets(map); 3107 return r; 3108 } 3109 } 3110 3111 /* 3112 * Here we must make sure that no processes are submitting requests 3113 * to target drivers i.e. no one may be executing 3114 * __split_and_process_bio. This is called from dm_request and 3115 * dm_wq_work. 3116 * 3117 * To get all processes out of __split_and_process_bio in dm_request, 3118 * we take the write lock. To prevent any process from reentering 3119 * __split_and_process_bio from dm_request and quiesce the thread 3120 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 3121 * flush_workqueue(md->wq). 3122 */ 3123 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3124 if (map) 3125 synchronize_srcu(&md->io_barrier); 3126 3127 /* 3128 * Stop md->queue before flushing md->wq in case request-based 3129 * dm defers requests to md->wq from md->queue. 3130 */ 3131 if (dm_request_based(md)) { 3132 stop_queue(md->queue); 3133 flush_kthread_worker(&md->kworker); 3134 } 3135 3136 flush_workqueue(md->wq); 3137 3138 /* 3139 * At this point no more requests are entering target request routines. 3140 * We call dm_wait_for_completion to wait for all existing requests 3141 * to finish.
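 *
 * With TASK_INTERRUPTIBLE the wait may be broken by a signal and return
 * -EINTR, in which case the error path below re-enables deferred bio
 * processing, restarts the queue for request-based dm, thaws the
 * filesystem and undoes the presuspend hooks.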
3142 */ 3143 r = dm_wait_for_completion(md, interruptible); 3144 3145 if (noflush) 3146 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 3147 if (map) 3148 synchronize_srcu(&md->io_barrier); 3149 3150 /* were we interrupted ? */ 3151 if (r < 0) { 3152 dm_queue_flush(md); 3153 3154 if (dm_request_based(md)) 3155 start_queue(md->queue); 3156 3157 unlock_fs(md); 3158 dm_table_presuspend_undo_targets(map); 3159 /* pushback list is already flushed, so skip flush */ 3160 } 3161 3162 return r; 3163 } 3164 3165 /* 3166 * We need to be able to change a mapping table under a mounted 3167 * filesystem. For example we might want to move some data in 3168 * the background. Before the table can be swapped with 3169 * dm_bind_table, dm_suspend must be called to flush any in 3170 * flight bios and ensure that any further io gets deferred. 3171 */ 3172 /* 3173 * Suspend mechanism in request-based dm. 3174 * 3175 * 1. Flush all I/Os by lock_fs() if needed. 3176 * 2. Stop dispatching any I/O by stopping the request_queue. 3177 * 3. Wait for all in-flight I/Os to be completed or requeued. 3178 * 3179 * To abort suspend, start the request_queue. 3180 */ 3181 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 3182 { 3183 struct dm_table *map = NULL; 3184 int r = 0; 3185 3186 retry: 3187 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 3188 3189 if (dm_suspended_md(md)) { 3190 r = -EINVAL; 3191 goto out_unlock; 3192 } 3193 3194 if (dm_suspended_internally_md(md)) { 3195 /* already internally suspended, wait for internal resume */ 3196 mutex_unlock(&md->suspend_lock); 3197 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 3198 if (r) 3199 return r; 3200 goto retry; 3201 } 3202 3203 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3204 3205 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); 3206 if (r) 3207 goto out_unlock; 3208 3209 set_bit(DMF_SUSPENDED, &md->flags); 3210 3211 dm_table_postsuspend_targets(map); 3212 3213 out_unlock: 3214 mutex_unlock(&md->suspend_lock); 3215 return r; 3216 } 3217 3218 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 3219 { 3220 if (map) { 3221 int r = dm_table_resume_targets(map); 3222 if (r) 3223 return r; 3224 } 3225 3226 dm_queue_flush(md); 3227 3228 /* 3229 * Flushing deferred I/Os must be done after targets are resumed 3230 * so that mapping of targets can work correctly. 3231 * Request-based dm is queueing the deferred I/Os in its request_queue. 
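 *
 * (For request-based dm it is therefore start_queue() below, not
 * dm_queue_flush() above, that actually lets the deferred requests move
 * again.)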
3232 */ 3233 if (dm_request_based(md)) 3234 start_queue(md->queue); 3235 3236 unlock_fs(md); 3237 3238 return 0; 3239 } 3240 3241 int dm_resume(struct mapped_device *md) 3242 { 3243 int r = -EINVAL; 3244 struct dm_table *map = NULL; 3245 3246 retry: 3247 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 3248 3249 if (!dm_suspended_md(md)) 3250 goto out; 3251 3252 if (dm_suspended_internally_md(md)) { 3253 /* already internally suspended, wait for internal resume */ 3254 mutex_unlock(&md->suspend_lock); 3255 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 3256 if (r) 3257 return r; 3258 goto retry; 3259 } 3260 3261 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3262 if (!map || !dm_table_get_size(map)) 3263 goto out; 3264 3265 r = __dm_resume(md, map); 3266 if (r) 3267 goto out; 3268 3269 clear_bit(DMF_SUSPENDED, &md->flags); 3270 3271 r = 0; 3272 out: 3273 mutex_unlock(&md->suspend_lock); 3274 3275 return r; 3276 } 3277 3278 /* 3279 * Internal suspend/resume works like userspace-driven suspend. It waits 3280 * until all bios finish and prevents issuing new bios to the target drivers. 3281 * It may be used only from the kernel. 3282 */ 3283 3284 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 3285 { 3286 struct dm_table *map = NULL; 3287 3288 if (md->internal_suspend_count++) 3289 return; /* nested internal suspend */ 3290 3291 if (dm_suspended_md(md)) { 3292 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3293 return; /* nest suspend */ 3294 } 3295 3296 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3297 3298 /* 3299 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 3300 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 3301 * would require changing .presuspend to return an error -- avoid this 3302 * until there is a need for more elaborate variants of internal suspend. 3303 */ 3304 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); 3305 3306 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3307 3308 dm_table_postsuspend_targets(map); 3309 } 3310 3311 static void __dm_internal_resume(struct mapped_device *md) 3312 { 3313 BUG_ON(!md->internal_suspend_count); 3314 3315 if (--md->internal_suspend_count) 3316 return; /* resume from nested internal suspend */ 3317 3318 if (dm_suspended_md(md)) 3319 goto done; /* resume from nested suspend */ 3320 3321 /* 3322 * NOTE: existing callers don't need to call dm_table_resume_targets 3323 * (which may fail -- so best to avoid it for now by passing NULL map) 3324 */ 3325 (void) __dm_resume(md, NULL); 3326 3327 done: 3328 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3329 smp_mb__after_atomic(); 3330 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 3331 } 3332 3333 void dm_internal_suspend_noflush(struct mapped_device *md) 3334 { 3335 mutex_lock(&md->suspend_lock); 3336 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 3337 mutex_unlock(&md->suspend_lock); 3338 } 3339 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 3340 3341 void dm_internal_resume(struct mapped_device *md) 3342 { 3343 mutex_lock(&md->suspend_lock); 3344 __dm_internal_resume(md); 3345 mutex_unlock(&md->suspend_lock); 3346 } 3347 EXPORT_SYMBOL_GPL(dm_internal_resume); 3348 3349 /* 3350 * Fast variants of internal suspend/resume hold md->suspend_lock, 3351 * which prevents interaction with userspace-driven suspend. 
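 *
 * Note that dm_internal_suspend_fast() returns with suspend_lock still
 * held (even on its early-return path) and dm_internal_resume_fast() is
 * what drops it, so the two must always be called as a strict pair.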
3352 */ 3353 3354 void dm_internal_suspend_fast(struct mapped_device *md) 3355 { 3356 mutex_lock(&md->suspend_lock); 3357 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3358 return; 3359 3360 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3361 synchronize_srcu(&md->io_barrier); 3362 flush_workqueue(md->wq); 3363 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3364 } 3365 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); 3366 3367 void dm_internal_resume_fast(struct mapped_device *md) 3368 { 3369 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3370 goto done; 3371 3372 dm_queue_flush(md); 3373 3374 done: 3375 mutex_unlock(&md->suspend_lock); 3376 } 3377 EXPORT_SYMBOL_GPL(dm_internal_resume_fast); 3378 3379 /*----------------------------------------------------------------- 3380 * Event notification. 3381 *---------------------------------------------------------------*/ 3382 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 3383 unsigned cookie) 3384 { 3385 char udev_cookie[DM_COOKIE_LENGTH]; 3386 char *envp[] = { udev_cookie, NULL }; 3387 3388 if (!cookie) 3389 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 3390 else { 3391 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 3392 DM_COOKIE_ENV_VAR_NAME, cookie); 3393 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 3394 action, envp); 3395 } 3396 } 3397 3398 uint32_t dm_next_uevent_seq(struct mapped_device *md) 3399 { 3400 return atomic_add_return(1, &md->uevent_seq); 3401 } 3402 3403 uint32_t dm_get_event_nr(struct mapped_device *md) 3404 { 3405 return atomic_read(&md->event_nr); 3406 } 3407 3408 int dm_wait_event(struct mapped_device *md, int event_nr) 3409 { 3410 return wait_event_interruptible(md->eventq, 3411 (event_nr != atomic_read(&md->event_nr))); 3412 } 3413 3414 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 3415 { 3416 unsigned long flags; 3417 3418 spin_lock_irqsave(&md->uevent_lock, flags); 3419 list_add(elist, &md->uevent_list); 3420 spin_unlock_irqrestore(&md->uevent_lock, flags); 3421 } 3422 3423 /* 3424 * The gendisk is only valid as long as you have a reference 3425 * count on 'md'. 
3426 */ 3427 struct gendisk *dm_disk(struct mapped_device *md) 3428 { 3429 return md->disk; 3430 } 3431 3432 struct kobject *dm_kobject(struct mapped_device *md) 3433 { 3434 return &md->kobj_holder.kobj; 3435 } 3436 3437 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 3438 { 3439 struct mapped_device *md; 3440 3441 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 3442 3443 if (test_bit(DMF_FREEING, &md->flags) || 3444 dm_deleting_md(md)) 3445 return NULL; 3446 3447 dm_get(md); 3448 return md; 3449 } 3450 3451 int dm_suspended_md(struct mapped_device *md) 3452 { 3453 return test_bit(DMF_SUSPENDED, &md->flags); 3454 } 3455 3456 int dm_suspended_internally_md(struct mapped_device *md) 3457 { 3458 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3459 } 3460 3461 int dm_test_deferred_remove_flag(struct mapped_device *md) 3462 { 3463 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 3464 } 3465 3466 int dm_suspended(struct dm_target *ti) 3467 { 3468 return dm_suspended_md(dm_table_get_md(ti->table)); 3469 } 3470 EXPORT_SYMBOL_GPL(dm_suspended); 3471 3472 int dm_noflush_suspending(struct dm_target *ti) 3473 { 3474 return __noflush_suspending(dm_table_get_md(ti->table)); 3475 } 3476 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 3477 3478 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) 3479 { 3480 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); 3481 struct kmem_cache *cachep; 3482 unsigned int pool_size = 0; 3483 unsigned int front_pad; 3484 3485 if (!pools) 3486 return NULL; 3487 3488 switch (type) { 3489 case DM_TYPE_BIO_BASED: 3490 cachep = _io_cache; 3491 pool_size = dm_get_reserved_bio_based_ios(); 3492 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); 3493 break; 3494 case DM_TYPE_REQUEST_BASED: 3495 pool_size = dm_get_reserved_rq_based_ios(); 3496 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); 3497 if (!pools->rq_pool) 3498 goto out; 3499 /* fall through to setup remaining rq-based pools */ 3500 case DM_TYPE_MQ_REQUEST_BASED: 3501 cachep = _rq_tio_cache; 3502 if (!pool_size) 3503 pool_size = dm_get_reserved_rq_based_ios(); 3504 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 3505 /* per_bio_data_size is not used. See __bind_mempools(). 
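 * For request-based tables front_pad is the fixed
 * offsetof(struct dm_rq_clone_bio_info, clone), so no room is set aside
 * for per-bio target data here.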
*/ 3506 WARN_ON(per_bio_data_size != 0); 3507 break; 3508 default: 3509 goto out; 3510 } 3511 3512 pools->io_pool = mempool_create_slab_pool(pool_size, cachep); 3513 if (!pools->io_pool) 3514 goto out; 3515 3516 pools->bs = bioset_create_nobvec(pool_size, front_pad); 3517 if (!pools->bs) 3518 goto out; 3519 3520 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 3521 goto out; 3522 3523 return pools; 3524 3525 out: 3526 dm_free_md_mempools(pools); 3527 3528 return NULL; 3529 } 3530 3531 void dm_free_md_mempools(struct dm_md_mempools *pools) 3532 { 3533 if (!pools) 3534 return; 3535 3536 if (pools->io_pool) 3537 mempool_destroy(pools->io_pool); 3538 3539 if (pools->rq_pool) 3540 mempool_destroy(pools->rq_pool); 3541 3542 if (pools->bs) 3543 bioset_free(pools->bs); 3544 3545 kfree(pools); 3546 } 3547 3548 static const struct block_device_operations dm_blk_dops = { 3549 .open = dm_blk_open, 3550 .release = dm_blk_close, 3551 .ioctl = dm_blk_ioctl, 3552 .getgeo = dm_blk_getgeo, 3553 .owner = THIS_MODULE 3554 }; 3555 3556 /* 3557 * module hooks 3558 */ 3559 module_init(dm_init); 3560 module_exit(dm_exit); 3561 3562 module_param(major, uint, 0); 3563 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3564 3565 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3566 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3567 3568 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); 3569 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); 3570 3571 MODULE_DESCRIPTION(DM_NAME " driver"); 3572 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3573 MODULE_LICENSE("GPL"); 3574
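/*
 * Example module-parameter usage (a sketch; assumes the module is built
 * as dm-mod, as in mainline):
 *
 *	modprobe dm-mod major=0 reserved_bio_based_ios=16 reserved_rq_based_ios=256
 *
 * major=0 keeps the dynamically allocated major number; the reserved_*
 * knobs can also be adjusted at runtime via
 * /sys/module/dm_mod/parameters/.
 */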