1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/mempool.h> 18 #include <linux/slab.h> 19 #include <linux/idr.h> 20 #include <linux/hdreg.h> 21 #include <linux/delay.h> 22 #include <linux/wait.h> 23 #include <linux/kthread.h> 24 25 #include <trace/events/block.h> 26 27 #define DM_MSG_PREFIX "core" 28 29 #ifdef CONFIG_PRINTK 30 /* 31 * ratelimit state to be used in DMXXX_LIMIT(). 32 */ 33 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 34 DEFAULT_RATELIMIT_INTERVAL, 35 DEFAULT_RATELIMIT_BURST); 36 EXPORT_SYMBOL(dm_ratelimit_state); 37 #endif 38 39 /* 40 * Cookies are numeric values sent with CHANGE and REMOVE 41 * uevents while resuming, removing or renaming the device. 42 */ 43 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 44 #define DM_COOKIE_LENGTH 24 45 46 static const char *_name = DM_NAME; 47 48 static unsigned int major = 0; 49 static unsigned int _major = 0; 50 51 static DEFINE_IDR(_minor_idr); 52 53 static DEFINE_SPINLOCK(_minor_lock); 54 55 static void do_deferred_remove(struct work_struct *w); 56 57 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 58 59 static struct workqueue_struct *deferred_remove_workqueue; 60 61 /* 62 * For bio-based dm. 63 * One of these is allocated per bio. 64 */ 65 struct dm_io { 66 struct mapped_device *md; 67 int error; 68 atomic_t io_count; 69 struct bio *bio; 70 unsigned long start_time; 71 spinlock_t endio_lock; 72 struct dm_stats_aux stats_aux; 73 }; 74 75 /* 76 * For request-based dm. 77 * One of these is allocated per request. 78 */ 79 struct dm_rq_target_io { 80 struct mapped_device *md; 81 struct dm_target *ti; 82 struct request *orig, *clone; 83 struct kthread_work work; 84 int error; 85 union map_info info; 86 }; 87 88 /* 89 * For request-based dm - the bio clones we allocate are embedded in these 90 * structs. 91 * 92 * We allocate these with bio_alloc_bioset, using the front_pad parameter when 93 * the bioset is created - this means the bio has to come at the end of the 94 * struct. 95 */ 96 struct dm_rq_clone_bio_info { 97 struct bio *orig; 98 struct dm_rq_target_io *tio; 99 struct bio clone; 100 }; 101 102 union map_info *dm_get_rq_mapinfo(struct request *rq) 103 { 104 if (rq && rq->end_io_data) 105 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 106 return NULL; 107 } 108 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 109 110 #define MINOR_ALLOCED ((void *)-1) 111 112 /* 113 * Bits for the md->flags field. 114 */ 115 #define DMF_BLOCK_IO_FOR_SUSPEND 0 116 #define DMF_SUSPENDED 1 117 #define DMF_FROZEN 2 118 #define DMF_FREEING 3 119 #define DMF_DELETING 4 120 #define DMF_NOFLUSH_SUSPENDING 5 121 #define DMF_MERGE_IS_OPTIONAL 6 122 #define DMF_DEFERRED_REMOVE 7 123 #define DMF_SUSPENDED_INTERNALLY 8 124 125 /* 126 * A dummy definition to make RCU happy. 127 * struct dm_table should never be dereferenced in this file. 128 */ 129 struct dm_table { 130 int undefined__; 131 }; 132 133 /* 134 * Work processed by per-device workqueue. 135 */ 136 struct mapped_device { 137 struct srcu_struct io_barrier; 138 struct mutex suspend_lock; 139 atomic_t holders; 140 atomic_t open_count; 141 142 /* 143 * The current mapping. 
144 * Use dm_get_live_table{_fast} or take suspend_lock for 145 * dereference. 146 */ 147 struct dm_table __rcu *map; 148 149 struct list_head table_devices; 150 struct mutex table_devices_lock; 151 152 unsigned long flags; 153 154 struct request_queue *queue; 155 unsigned type; 156 /* Protect queue and type against concurrent access. */ 157 struct mutex type_lock; 158 159 struct target_type *immutable_target_type; 160 161 struct gendisk *disk; 162 char name[16]; 163 164 void *interface_ptr; 165 166 /* 167 * A list of ios that arrived while we were suspended. 168 */ 169 atomic_t pending[2]; 170 wait_queue_head_t wait; 171 struct work_struct work; 172 struct bio_list deferred; 173 spinlock_t deferred_lock; 174 175 /* 176 * Processing queue (flush) 177 */ 178 struct workqueue_struct *wq; 179 180 /* 181 * io objects are allocated from here. 182 */ 183 mempool_t *io_pool; 184 mempool_t *rq_pool; 185 186 struct bio_set *bs; 187 188 /* 189 * Event handling. 190 */ 191 atomic_t event_nr; 192 wait_queue_head_t eventq; 193 atomic_t uevent_seq; 194 struct list_head uevent_list; 195 spinlock_t uevent_lock; /* Protect access to uevent_list */ 196 197 /* 198 * freeze/thaw support require holding onto a super block 199 */ 200 struct super_block *frozen_sb; 201 struct block_device *bdev; 202 203 /* forced geometry settings */ 204 struct hd_geometry geometry; 205 206 /* kobject and completion */ 207 struct dm_kobject_holder kobj_holder; 208 209 /* zero-length flush that will be cloned and submitted to targets */ 210 struct bio flush_bio; 211 212 /* the number of internal suspends */ 213 unsigned internal_suspend_count; 214 215 struct dm_stats stats; 216 217 struct kthread_worker kworker; 218 struct task_struct *kworker_task; 219 }; 220 221 /* 222 * For mempools pre-allocation at the table loading time. 223 */ 224 struct dm_md_mempools { 225 mempool_t *io_pool; 226 mempool_t *rq_pool; 227 struct bio_set *bs; 228 }; 229 230 struct table_device { 231 struct list_head list; 232 atomic_t count; 233 struct dm_dev dm_dev; 234 }; 235 236 #define RESERVED_BIO_BASED_IOS 16 237 #define RESERVED_REQUEST_BASED_IOS 256 238 #define RESERVED_MAX_IOS 1024 239 static struct kmem_cache *_io_cache; 240 static struct kmem_cache *_rq_tio_cache; 241 static struct kmem_cache *_rq_cache; 242 243 /* 244 * Bio-based DM's mempools' reserved IOs set by the user. 245 */ 246 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 247 248 /* 249 * Request-based DM's mempools' reserved IOs set by the user. 
250 */ 251 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 252 253 static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, 254 unsigned def, unsigned max) 255 { 256 unsigned ios = ACCESS_ONCE(*reserved_ios); 257 unsigned modified_ios = 0; 258 259 if (!ios) 260 modified_ios = def; 261 else if (ios > max) 262 modified_ios = max; 263 264 if (modified_ios) { 265 (void)cmpxchg(reserved_ios, ios, modified_ios); 266 ios = modified_ios; 267 } 268 269 return ios; 270 } 271 272 unsigned dm_get_reserved_bio_based_ios(void) 273 { 274 return __dm_get_reserved_ios(&reserved_bio_based_ios, 275 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 276 } 277 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 278 279 unsigned dm_get_reserved_rq_based_ios(void) 280 { 281 return __dm_get_reserved_ios(&reserved_rq_based_ios, 282 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 283 } 284 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 285 286 static int __init local_init(void) 287 { 288 int r = -ENOMEM; 289 290 /* allocate a slab for the dm_ios */ 291 _io_cache = KMEM_CACHE(dm_io, 0); 292 if (!_io_cache) 293 return r; 294 295 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 296 if (!_rq_tio_cache) 297 goto out_free_io_cache; 298 299 _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), 300 __alignof__(struct request), 0, NULL); 301 if (!_rq_cache) 302 goto out_free_rq_tio_cache; 303 304 r = dm_uevent_init(); 305 if (r) 306 goto out_free_rq_cache; 307 308 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 309 if (!deferred_remove_workqueue) { 310 r = -ENOMEM; 311 goto out_uevent_exit; 312 } 313 314 _major = major; 315 r = register_blkdev(_major, _name); 316 if (r < 0) 317 goto out_free_workqueue; 318 319 if (!_major) 320 _major = r; 321 322 return 0; 323 324 out_free_workqueue: 325 destroy_workqueue(deferred_remove_workqueue); 326 out_uevent_exit: 327 dm_uevent_exit(); 328 out_free_rq_cache: 329 kmem_cache_destroy(_rq_cache); 330 out_free_rq_tio_cache: 331 kmem_cache_destroy(_rq_tio_cache); 332 out_free_io_cache: 333 kmem_cache_destroy(_io_cache); 334 335 return r; 336 } 337 338 static void local_exit(void) 339 { 340 flush_scheduled_work(); 341 destroy_workqueue(deferred_remove_workqueue); 342 343 kmem_cache_destroy(_rq_cache); 344 kmem_cache_destroy(_rq_tio_cache); 345 kmem_cache_destroy(_io_cache); 346 unregister_blkdev(_major, _name); 347 dm_uevent_exit(); 348 349 _major = 0; 350 351 DMINFO("cleaned up"); 352 } 353 354 static int (*_inits[])(void) __initdata = { 355 local_init, 356 dm_target_init, 357 dm_linear_init, 358 dm_stripe_init, 359 dm_io_init, 360 dm_kcopyd_init, 361 dm_interface_init, 362 dm_statistics_init, 363 }; 364 365 static void (*_exits[])(void) = { 366 local_exit, 367 dm_target_exit, 368 dm_linear_exit, 369 dm_stripe_exit, 370 dm_io_exit, 371 dm_kcopyd_exit, 372 dm_interface_exit, 373 dm_statistics_exit, 374 }; 375 376 static int __init dm_init(void) 377 { 378 const int count = ARRAY_SIZE(_inits); 379 380 int r, i; 381 382 for (i = 0; i < count; i++) { 383 r = _inits[i](); 384 if (r) 385 goto bad; 386 } 387 388 return 0; 389 390 bad: 391 while (i--) 392 _exits[i](); 393 394 return r; 395 } 396 397 static void __exit dm_exit(void) 398 { 399 int i = ARRAY_SIZE(_exits); 400 401 while (i--) 402 _exits[i](); 403 404 /* 405 * Should be empty by this point. 
406 */ 407 idr_destroy(&_minor_idr); 408 } 409 410 /* 411 * Block device functions 412 */ 413 int dm_deleting_md(struct mapped_device *md) 414 { 415 return test_bit(DMF_DELETING, &md->flags); 416 } 417 418 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 419 { 420 struct mapped_device *md; 421 422 spin_lock(&_minor_lock); 423 424 md = bdev->bd_disk->private_data; 425 if (!md) 426 goto out; 427 428 if (test_bit(DMF_FREEING, &md->flags) || 429 dm_deleting_md(md)) { 430 md = NULL; 431 goto out; 432 } 433 434 dm_get(md); 435 atomic_inc(&md->open_count); 436 437 out: 438 spin_unlock(&_minor_lock); 439 440 return md ? 0 : -ENXIO; 441 } 442 443 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 444 { 445 struct mapped_device *md = disk->private_data; 446 447 spin_lock(&_minor_lock); 448 449 if (atomic_dec_and_test(&md->open_count) && 450 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 451 queue_work(deferred_remove_workqueue, &deferred_remove_work); 452 453 dm_put(md); 454 455 spin_unlock(&_minor_lock); 456 } 457 458 int dm_open_count(struct mapped_device *md) 459 { 460 return atomic_read(&md->open_count); 461 } 462 463 /* 464 * Guarantees nothing is using the device before it's deleted. 465 */ 466 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 467 { 468 int r = 0; 469 470 spin_lock(&_minor_lock); 471 472 if (dm_open_count(md)) { 473 r = -EBUSY; 474 if (mark_deferred) 475 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 476 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 477 r = -EEXIST; 478 else 479 set_bit(DMF_DELETING, &md->flags); 480 481 spin_unlock(&_minor_lock); 482 483 return r; 484 } 485 486 int dm_cancel_deferred_remove(struct mapped_device *md) 487 { 488 int r = 0; 489 490 spin_lock(&_minor_lock); 491 492 if (test_bit(DMF_DELETING, &md->flags)) 493 r = -EBUSY; 494 else 495 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 496 497 spin_unlock(&_minor_lock); 498 499 return r; 500 } 501 502 static void do_deferred_remove(struct work_struct *w) 503 { 504 dm_deferred_remove(); 505 } 506 507 sector_t dm_get_size(struct mapped_device *md) 508 { 509 return get_capacity(md->disk); 510 } 511 512 struct request_queue *dm_get_md_queue(struct mapped_device *md) 513 { 514 return md->queue; 515 } 516 517 struct dm_stats *dm_get_stats(struct mapped_device *md) 518 { 519 return &md->stats; 520 } 521 522 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 523 { 524 struct mapped_device *md = bdev->bd_disk->private_data; 525 526 return dm_get_geometry(md, geo); 527 } 528 529 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 530 unsigned int cmd, unsigned long arg) 531 { 532 struct mapped_device *md = bdev->bd_disk->private_data; 533 int srcu_idx; 534 struct dm_table *map; 535 struct dm_target *tgt; 536 int r = -ENOTTY; 537 538 retry: 539 map = dm_get_live_table(md, &srcu_idx); 540 541 if (!map || !dm_table_get_size(map)) 542 goto out; 543 544 /* We only support devices that have a single target */ 545 if (dm_table_get_num_targets(map) != 1) 546 goto out; 547 548 tgt = dm_table_get_target(map, 0); 549 if (!tgt->type->ioctl) 550 goto out; 551 552 if (dm_suspended_md(md)) { 553 r = -EAGAIN; 554 goto out; 555 } 556 557 r = tgt->type->ioctl(tgt, cmd, arg); 558 559 out: 560 dm_put_live_table(md, srcu_idx); 561 562 if (r == -ENOTCONN) { 563 msleep(10); 564 goto retry; 565 } 566 567 return r; 568 } 569 570 static struct dm_io *alloc_io(struct mapped_device *md) 571 { 572 return 
mempool_alloc(md->io_pool, GFP_NOIO); 573 } 574 575 static void free_io(struct mapped_device *md, struct dm_io *io) 576 { 577 mempool_free(io, md->io_pool); 578 } 579 580 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 581 { 582 bio_put(&tio->clone); 583 } 584 585 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 586 gfp_t gfp_mask) 587 { 588 return mempool_alloc(md->io_pool, gfp_mask); 589 } 590 591 static void free_rq_tio(struct dm_rq_target_io *tio) 592 { 593 mempool_free(tio, tio->md->io_pool); 594 } 595 596 static struct request *alloc_clone_request(struct mapped_device *md, 597 gfp_t gfp_mask) 598 { 599 return mempool_alloc(md->rq_pool, gfp_mask); 600 } 601 602 static void free_clone_request(struct mapped_device *md, struct request *rq) 603 { 604 mempool_free(rq, md->rq_pool); 605 } 606 607 static int md_in_flight(struct mapped_device *md) 608 { 609 return atomic_read(&md->pending[READ]) + 610 atomic_read(&md->pending[WRITE]); 611 } 612 613 static void start_io_acct(struct dm_io *io) 614 { 615 struct mapped_device *md = io->md; 616 struct bio *bio = io->bio; 617 int cpu; 618 int rw = bio_data_dir(bio); 619 620 io->start_time = jiffies; 621 622 cpu = part_stat_lock(); 623 part_round_stats(cpu, &dm_disk(md)->part0); 624 part_stat_unlock(); 625 atomic_set(&dm_disk(md)->part0.in_flight[rw], 626 atomic_inc_return(&md->pending[rw])); 627 628 if (unlikely(dm_stats_used(&md->stats))) 629 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 630 bio_sectors(bio), false, 0, &io->stats_aux); 631 } 632 633 static void end_io_acct(struct dm_io *io) 634 { 635 struct mapped_device *md = io->md; 636 struct bio *bio = io->bio; 637 unsigned long duration = jiffies - io->start_time; 638 int pending; 639 int rw = bio_data_dir(bio); 640 641 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); 642 643 if (unlikely(dm_stats_used(&md->stats))) 644 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 645 bio_sectors(bio), true, duration, &io->stats_aux); 646 647 /* 648 * After this is decremented the bio must not be touched if it is 649 * a flush. 650 */ 651 pending = atomic_dec_return(&md->pending[rw]); 652 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 653 pending += atomic_read(&md->pending[rw^0x1]); 654 655 /* nudge anyone waiting on suspend queue */ 656 if (!pending) 657 wake_up(&md->wait); 658 } 659 660 /* 661 * Add the bio to the list of deferred io. 662 */ 663 static void queue_io(struct mapped_device *md, struct bio *bio) 664 { 665 unsigned long flags; 666 667 spin_lock_irqsave(&md->deferred_lock, flags); 668 bio_list_add(&md->deferred, bio); 669 spin_unlock_irqrestore(&md->deferred_lock, flags); 670 queue_work(md->wq, &md->work); 671 } 672 673 /* 674 * Everyone (including functions in this file), should use this 675 * function to access the md->map field, and make sure they call 676 * dm_put_live_table() when finished. 
677 */ 678 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 679 { 680 *srcu_idx = srcu_read_lock(&md->io_barrier); 681 682 return srcu_dereference(md->map, &md->io_barrier); 683 } 684 685 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 686 { 687 srcu_read_unlock(&md->io_barrier, srcu_idx); 688 } 689 690 void dm_sync_table(struct mapped_device *md) 691 { 692 synchronize_srcu(&md->io_barrier); 693 synchronize_rcu_expedited(); 694 } 695 696 /* 697 * A fast alternative to dm_get_live_table/dm_put_live_table. 698 * The caller must not block between these two functions. 699 */ 700 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 701 { 702 rcu_read_lock(); 703 return rcu_dereference(md->map); 704 } 705 706 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 707 { 708 rcu_read_unlock(); 709 } 710 711 /* 712 * Open a table device so we can use it as a map destination. 713 */ 714 static int open_table_device(struct table_device *td, dev_t dev, 715 struct mapped_device *md) 716 { 717 static char *_claim_ptr = "I belong to device-mapper"; 718 struct block_device *bdev; 719 720 int r; 721 722 BUG_ON(td->dm_dev.bdev); 723 724 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); 725 if (IS_ERR(bdev)) 726 return PTR_ERR(bdev); 727 728 r = bd_link_disk_holder(bdev, dm_disk(md)); 729 if (r) { 730 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 731 return r; 732 } 733 734 td->dm_dev.bdev = bdev; 735 return 0; 736 } 737 738 /* 739 * Close a table device that we've been using. 740 */ 741 static void close_table_device(struct table_device *td, struct mapped_device *md) 742 { 743 if (!td->dm_dev.bdev) 744 return; 745 746 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 747 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 748 td->dm_dev.bdev = NULL; 749 } 750 751 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 752 fmode_t mode) { 753 struct table_device *td; 754 755 list_for_each_entry(td, l, list) 756 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 757 return td; 758 759 return NULL; 760 } 761 762 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 763 struct dm_dev **result) { 764 int r; 765 struct table_device *td; 766 767 mutex_lock(&md->table_devices_lock); 768 td = find_table_device(&md->table_devices, dev, mode); 769 if (!td) { 770 td = kmalloc(sizeof(*td), GFP_KERNEL); 771 if (!td) { 772 mutex_unlock(&md->table_devices_lock); 773 return -ENOMEM; 774 } 775 776 td->dm_dev.mode = mode; 777 td->dm_dev.bdev = NULL; 778 779 if ((r = open_table_device(td, dev, md))) { 780 mutex_unlock(&md->table_devices_lock); 781 kfree(td); 782 return r; 783 } 784 785 format_dev_t(td->dm_dev.name, dev); 786 787 atomic_set(&td->count, 0); 788 list_add(&td->list, &md->table_devices); 789 } 790 atomic_inc(&td->count); 791 mutex_unlock(&md->table_devices_lock); 792 793 *result = &td->dm_dev; 794 return 0; 795 } 796 EXPORT_SYMBOL_GPL(dm_get_table_device); 797 798 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 799 { 800 struct table_device *td = container_of(d, struct table_device, dm_dev); 801 802 mutex_lock(&md->table_devices_lock); 803 if (atomic_dec_and_test(&td->count)) { 804 close_table_device(td, md); 805 list_del(&td->list); 806 kfree(td); 807 } 808 mutex_unlock(&md->table_devices_lock); 809 } 810 EXPORT_SYMBOL(dm_put_table_device); 811 812 
static void free_table_devices(struct list_head *devices) 813 { 814 struct list_head *tmp, *next; 815 816 list_for_each_safe(tmp, next, devices) { 817 struct table_device *td = list_entry(tmp, struct table_device, list); 818 819 DMWARN("dm_destroy: %s still exists with %d references", 820 td->dm_dev.name, atomic_read(&td->count)); 821 kfree(td); 822 } 823 } 824 825 /* 826 * Get the geometry associated with a dm device 827 */ 828 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 829 { 830 *geo = md->geometry; 831 832 return 0; 833 } 834 835 /* 836 * Set the geometry of a device. 837 */ 838 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 839 { 840 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 841 842 if (geo->start > sz) { 843 DMWARN("Start sector is beyond the geometry limits."); 844 return -EINVAL; 845 } 846 847 md->geometry = *geo; 848 849 return 0; 850 } 851 852 /*----------------------------------------------------------------- 853 * CRUD START: 854 * A more elegant soln is in the works that uses the queue 855 * merge fn, unfortunately there are a couple of changes to 856 * the block layer that I want to make for this. So in the 857 * interests of getting something for people to use I give 858 * you this clearly demarcated crap. 859 *---------------------------------------------------------------*/ 860 861 static int __noflush_suspending(struct mapped_device *md) 862 { 863 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 864 } 865 866 /* 867 * Decrements the number of outstanding ios that a bio has been 868 * cloned into, completing the original io if necc. 869 */ 870 static void dec_pending(struct dm_io *io, int error) 871 { 872 unsigned long flags; 873 int io_error; 874 struct bio *bio; 875 struct mapped_device *md = io->md; 876 877 /* Push-back supersedes any I/O errors */ 878 if (unlikely(error)) { 879 spin_lock_irqsave(&io->endio_lock, flags); 880 if (!(io->error > 0 && __noflush_suspending(md))) 881 io->error = error; 882 spin_unlock_irqrestore(&io->endio_lock, flags); 883 } 884 885 if (atomic_dec_and_test(&io->io_count)) { 886 if (io->error == DM_ENDIO_REQUEUE) { 887 /* 888 * Target requested pushing back the I/O. 889 */ 890 spin_lock_irqsave(&md->deferred_lock, flags); 891 if (__noflush_suspending(md)) 892 bio_list_add_head(&md->deferred, io->bio); 893 else 894 /* noflush suspend was interrupted. */ 895 io->error = -EIO; 896 spin_unlock_irqrestore(&md->deferred_lock, flags); 897 } 898 899 io_error = io->error; 900 bio = io->bio; 901 end_io_acct(io); 902 free_io(md, io); 903 904 if (io_error == DM_ENDIO_REQUEUE) 905 return; 906 907 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { 908 /* 909 * Preflush done for flush with data, reissue 910 * without REQ_FLUSH. 
911 */ 912 bio->bi_rw &= ~REQ_FLUSH; 913 queue_io(md, bio); 914 } else { 915 /* done with normal IO or empty flush */ 916 trace_block_bio_complete(md->queue, bio, io_error); 917 bio_endio(bio, io_error); 918 } 919 } 920 } 921 922 static void disable_write_same(struct mapped_device *md) 923 { 924 struct queue_limits *limits = dm_get_queue_limits(md); 925 926 /* device doesn't really support WRITE SAME, disable it */ 927 limits->max_write_same_sectors = 0; 928 } 929 930 static void clone_endio(struct bio *bio, int error) 931 { 932 int r = error; 933 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 934 struct dm_io *io = tio->io; 935 struct mapped_device *md = tio->io->md; 936 dm_endio_fn endio = tio->ti->type->end_io; 937 938 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 939 error = -EIO; 940 941 if (endio) { 942 r = endio(tio->ti, bio, error); 943 if (r < 0 || r == DM_ENDIO_REQUEUE) 944 /* 945 * error and requeue request are handled 946 * in dec_pending(). 947 */ 948 error = r; 949 else if (r == DM_ENDIO_INCOMPLETE) 950 /* The target will handle the io */ 951 return; 952 else if (r) { 953 DMWARN("unimplemented target endio return value: %d", r); 954 BUG(); 955 } 956 } 957 958 if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && 959 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) 960 disable_write_same(md); 961 962 free_tio(md, tio); 963 dec_pending(io, error); 964 } 965 966 /* 967 * Partial completion handling for request-based dm 968 */ 969 static void end_clone_bio(struct bio *clone, int error) 970 { 971 struct dm_rq_clone_bio_info *info = 972 container_of(clone, struct dm_rq_clone_bio_info, clone); 973 struct dm_rq_target_io *tio = info->tio; 974 struct bio *bio = info->orig; 975 unsigned int nr_bytes = info->orig->bi_iter.bi_size; 976 977 bio_put(clone); 978 979 if (tio->error) 980 /* 981 * An error has already been detected on the request. 982 * Once error occurred, just let clone->end_io() handle 983 * the remainder. 984 */ 985 return; 986 else if (error) { 987 /* 988 * Don't notice the error to the upper layer yet. 989 * The error handling decision is made by the target driver, 990 * when the request is completed. 991 */ 992 tio->error = error; 993 return; 994 } 995 996 /* 997 * I/O for the bio successfully completed. 998 * Notice the data completion to the upper layer. 999 */ 1000 1001 /* 1002 * bios are processed from the head of the list. 1003 * So the completing bio should always be rq->bio. 1004 * If it's not, something wrong is happening. 1005 */ 1006 if (tio->orig->bio != bio) 1007 DMERR("bio completion is going in the middle of the request"); 1008 1009 /* 1010 * Update the original request. 1011 * Do not use blk_end_request() here, because it may complete 1012 * the original request before the clone, and break the ordering. 1013 */ 1014 blk_update_request(tio->orig, 0, nr_bytes); 1015 } 1016 1017 /* 1018 * Don't touch any member of the md after calling this function because 1019 * the md may be freed in dm_put() at the end of this function. 1020 * Or do dm_get() before calling this function and dm_put() later. 1021 */ 1022 static void rq_completed(struct mapped_device *md, int rw, bool run_queue) 1023 { 1024 atomic_dec(&md->pending[rw]); 1025 1026 /* nudge anyone waiting on suspend queue */ 1027 if (!md_in_flight(md)) 1028 wake_up(&md->wait); 1029 1030 /* 1031 * Run this off this callpath, as drivers could invoke end_io while 1032 * inside their request_fn (and holding the queue lock). 
Calling 1033 * back into ->request_fn() could deadlock attempting to grab the 1034 * queue lock again. 1035 */ 1036 if (run_queue) 1037 blk_run_queue_async(md->queue); 1038 1039 /* 1040 * dm_put() must be at the end of this function. See the comment above 1041 */ 1042 dm_put(md); 1043 } 1044 1045 static void free_rq_clone(struct request *clone) 1046 { 1047 struct dm_rq_target_io *tio = clone->end_io_data; 1048 1049 blk_rq_unprep_clone(clone); 1050 if (clone->q && clone->q->mq_ops) 1051 tio->ti->type->release_clone_rq(clone); 1052 else 1053 free_clone_request(tio->md, clone); 1054 free_rq_tio(tio); 1055 } 1056 1057 /* 1058 * Complete the clone and the original request. 1059 * Must be called without clone's queue lock held, 1060 * see end_clone_request() for more details. 1061 */ 1062 static void dm_end_request(struct request *clone, int error) 1063 { 1064 int rw = rq_data_dir(clone); 1065 struct dm_rq_target_io *tio = clone->end_io_data; 1066 struct mapped_device *md = tio->md; 1067 struct request *rq = tio->orig; 1068 1069 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 1070 rq->errors = clone->errors; 1071 rq->resid_len = clone->resid_len; 1072 1073 if (rq->sense) 1074 /* 1075 * We are using the sense buffer of the original 1076 * request. 1077 * So setting the length of the sense data is enough. 1078 */ 1079 rq->sense_len = clone->sense_len; 1080 } 1081 1082 free_rq_clone(clone); 1083 blk_end_request_all(rq, error); 1084 rq_completed(md, rw, true); 1085 } 1086 1087 static void dm_unprep_request(struct request *rq) 1088 { 1089 struct dm_rq_target_io *tio = rq->special; 1090 struct request *clone = tio->clone; 1091 1092 rq->special = NULL; 1093 rq->cmd_flags &= ~REQ_DONTPREP; 1094 1095 if (clone) 1096 free_rq_clone(clone); 1097 } 1098 1099 /* 1100 * Requeue the original request of a clone. 
1101 */ 1102 static void dm_requeue_unmapped_original_request(struct mapped_device *md, 1103 struct request *rq) 1104 { 1105 int rw = rq_data_dir(rq); 1106 struct request_queue *q = rq->q; 1107 unsigned long flags; 1108 1109 dm_unprep_request(rq); 1110 1111 spin_lock_irqsave(q->queue_lock, flags); 1112 blk_requeue_request(q, rq); 1113 spin_unlock_irqrestore(q->queue_lock, flags); 1114 1115 rq_completed(md, rw, false); 1116 } 1117 1118 static void dm_requeue_unmapped_request(struct request *clone) 1119 { 1120 struct dm_rq_target_io *tio = clone->end_io_data; 1121 1122 dm_requeue_unmapped_original_request(tio->md, tio->orig); 1123 } 1124 1125 static void __stop_queue(struct request_queue *q) 1126 { 1127 blk_stop_queue(q); 1128 } 1129 1130 static void stop_queue(struct request_queue *q) 1131 { 1132 unsigned long flags; 1133 1134 spin_lock_irqsave(q->queue_lock, flags); 1135 __stop_queue(q); 1136 spin_unlock_irqrestore(q->queue_lock, flags); 1137 } 1138 1139 static void __start_queue(struct request_queue *q) 1140 { 1141 if (blk_queue_stopped(q)) 1142 blk_start_queue(q); 1143 } 1144 1145 static void start_queue(struct request_queue *q) 1146 { 1147 unsigned long flags; 1148 1149 spin_lock_irqsave(q->queue_lock, flags); 1150 __start_queue(q); 1151 spin_unlock_irqrestore(q->queue_lock, flags); 1152 } 1153 1154 static void dm_done(struct request *clone, int error, bool mapped) 1155 { 1156 int r = error; 1157 struct dm_rq_target_io *tio = clone->end_io_data; 1158 dm_request_endio_fn rq_end_io = NULL; 1159 1160 if (tio->ti) { 1161 rq_end_io = tio->ti->type->rq_end_io; 1162 1163 if (mapped && rq_end_io) 1164 r = rq_end_io(tio->ti, clone, error, &tio->info); 1165 } 1166 1167 if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && 1168 !clone->q->limits.max_write_same_sectors)) 1169 disable_write_same(tio->md); 1170 1171 if (r <= 0) 1172 /* The target wants to complete the I/O */ 1173 dm_end_request(clone, r); 1174 else if (r == DM_ENDIO_INCOMPLETE) 1175 /* The target will handle the I/O */ 1176 return; 1177 else if (r == DM_ENDIO_REQUEUE) 1178 /* The target wants to requeue the I/O */ 1179 dm_requeue_unmapped_request(clone); 1180 else { 1181 DMWARN("unimplemented target endio return value: %d", r); 1182 BUG(); 1183 } 1184 } 1185 1186 /* 1187 * Request completion handler for request-based dm 1188 */ 1189 static void dm_softirq_done(struct request *rq) 1190 { 1191 bool mapped = true; 1192 struct dm_rq_target_io *tio = rq->special; 1193 struct request *clone = tio->clone; 1194 1195 if (!clone) { 1196 blk_end_request_all(rq, tio->error); 1197 rq_completed(tio->md, rq_data_dir(rq), false); 1198 free_rq_tio(tio); 1199 return; 1200 } 1201 1202 if (rq->cmd_flags & REQ_FAILED) 1203 mapped = false; 1204 1205 dm_done(clone, tio->error, mapped); 1206 } 1207 1208 /* 1209 * Complete the clone and the original request with the error status 1210 * through softirq context. 1211 */ 1212 static void dm_complete_request(struct request *rq, int error) 1213 { 1214 struct dm_rq_target_io *tio = rq->special; 1215 1216 tio->error = error; 1217 blk_complete_request(rq); 1218 } 1219 1220 /* 1221 * Complete the not-mapped clone and the original request with the error status 1222 * through softirq context. 1223 * Target's rq_end_io() function isn't called. 1224 * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. 
1225 */ 1226 static void dm_kill_unmapped_request(struct request *rq, int error) 1227 { 1228 rq->cmd_flags |= REQ_FAILED; 1229 dm_complete_request(rq, error); 1230 } 1231 1232 /* 1233 * Called with the clone's queue lock held 1234 */ 1235 static void end_clone_request(struct request *clone, int error) 1236 { 1237 struct dm_rq_target_io *tio = clone->end_io_data; 1238 1239 if (!clone->q->mq_ops) { 1240 /* 1241 * For just cleaning up the information of the queue in which 1242 * the clone was dispatched. 1243 * The clone is *NOT* freed actually here because it is alloced 1244 * from dm own mempool (REQ_ALLOCED isn't set). 1245 */ 1246 __blk_put_request(clone->q, clone); 1247 } 1248 1249 /* 1250 * Actual request completion is done in a softirq context which doesn't 1251 * hold the clone's queue lock. Otherwise, deadlock could occur because: 1252 * - another request may be submitted by the upper level driver 1253 * of the stacking during the completion 1254 * - the submission which requires queue lock may be done 1255 * against this clone's queue 1256 */ 1257 dm_complete_request(tio->orig, error); 1258 } 1259 1260 /* 1261 * Return maximum size of I/O possible at the supplied sector up to the current 1262 * target boundary. 1263 */ 1264 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 1265 { 1266 sector_t target_offset = dm_target_offset(ti, sector); 1267 1268 return ti->len - target_offset; 1269 } 1270 1271 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 1272 { 1273 sector_t len = max_io_len_target_boundary(sector, ti); 1274 sector_t offset, max_len; 1275 1276 /* 1277 * Does the target need to split even further? 1278 */ 1279 if (ti->max_io_len) { 1280 offset = dm_target_offset(ti, sector); 1281 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 1282 max_len = sector_div(offset, ti->max_io_len); 1283 else 1284 max_len = offset & (ti->max_io_len - 1); 1285 max_len = ti->max_io_len - max_len; 1286 1287 if (len > max_len) 1288 len = max_len; 1289 } 1290 1291 return len; 1292 } 1293 1294 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1295 { 1296 if (len > UINT_MAX) { 1297 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1298 (unsigned long long)len, UINT_MAX); 1299 ti->error = "Maximum size of target IO is too large"; 1300 return -EINVAL; 1301 } 1302 1303 ti->max_io_len = (uint32_t) len; 1304 1305 return 0; 1306 } 1307 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1308 1309 /* 1310 * A target may call dm_accept_partial_bio only from the map routine. It is 1311 * allowed for all bio types except REQ_FLUSH. 1312 * 1313 * dm_accept_partial_bio informs the dm that the target only wants to process 1314 * additional n_sectors sectors of the bio and the rest of the data should be 1315 * sent in a next bio. 1316 * 1317 * A diagram that explains the arithmetics: 1318 * +--------------------+---------------+-------+ 1319 * | 1 | 2 | 3 | 1320 * +--------------------+---------------+-------+ 1321 * 1322 * <-------------- *tio->len_ptr ---------------> 1323 * <------- bi_size -------> 1324 * <-- n_sectors --> 1325 * 1326 * Region 1 was already iterated over with bio_advance or similar function. 1327 * (it may be empty if the target doesn't use bio_advance) 1328 * Region 2 is the remaining bio size that the target wants to process. 1329 * (it may be empty if region 1 is non-empty, although there is no reason 1330 * to make it empty) 1331 * The target requires that region 3 is to be sent in the next bio. 
1332 * 1333 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1334 * the partially processed part (the sum of regions 1+2) must be the same for all 1335 * copies of the bio. 1336 */ 1337 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1338 { 1339 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1340 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1341 BUG_ON(bio->bi_rw & REQ_FLUSH); 1342 BUG_ON(bi_size > *tio->len_ptr); 1343 BUG_ON(n_sectors > bi_size); 1344 *tio->len_ptr -= bi_size - n_sectors; 1345 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1346 } 1347 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1348 1349 static void __map_bio(struct dm_target_io *tio) 1350 { 1351 int r; 1352 sector_t sector; 1353 struct mapped_device *md; 1354 struct bio *clone = &tio->clone; 1355 struct dm_target *ti = tio->ti; 1356 1357 clone->bi_end_io = clone_endio; 1358 1359 /* 1360 * Map the clone. If r == 0 we don't need to do 1361 * anything, the target has assumed ownership of 1362 * this io. 1363 */ 1364 atomic_inc(&tio->io->io_count); 1365 sector = clone->bi_iter.bi_sector; 1366 r = ti->type->map(ti, clone); 1367 if (r == DM_MAPIO_REMAPPED) { 1368 /* the bio has been remapped so dispatch it */ 1369 1370 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1371 tio->io->bio->bi_bdev->bd_dev, sector); 1372 1373 generic_make_request(clone); 1374 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1375 /* error the io and bail out, or requeue it if needed */ 1376 md = tio->io->md; 1377 dec_pending(tio->io, r); 1378 free_tio(md, tio); 1379 } else if (r) { 1380 DMWARN("unimplemented target map return value: %d", r); 1381 BUG(); 1382 } 1383 } 1384 1385 struct clone_info { 1386 struct mapped_device *md; 1387 struct dm_table *map; 1388 struct bio *bio; 1389 struct dm_io *io; 1390 sector_t sector; 1391 unsigned sector_count; 1392 }; 1393 1394 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1395 { 1396 bio->bi_iter.bi_sector = sector; 1397 bio->bi_iter.bi_size = to_bytes(len); 1398 } 1399 1400 /* 1401 * Creates a bio that consists of range of complete bvecs. 
1402 */ 1403 static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1404 sector_t sector, unsigned len) 1405 { 1406 struct bio *clone = &tio->clone; 1407 1408 __bio_clone_fast(clone, bio); 1409 1410 if (bio_integrity(bio)) 1411 bio_integrity_clone(clone, bio, GFP_NOIO); 1412 1413 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1414 clone->bi_iter.bi_size = to_bytes(len); 1415 1416 if (bio_integrity(bio)) 1417 bio_integrity_trim(clone, 0, len); 1418 } 1419 1420 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1421 struct dm_target *ti, 1422 unsigned target_bio_nr) 1423 { 1424 struct dm_target_io *tio; 1425 struct bio *clone; 1426 1427 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1428 tio = container_of(clone, struct dm_target_io, clone); 1429 1430 tio->io = ci->io; 1431 tio->ti = ti; 1432 tio->target_bio_nr = target_bio_nr; 1433 1434 return tio; 1435 } 1436 1437 static void __clone_and_map_simple_bio(struct clone_info *ci, 1438 struct dm_target *ti, 1439 unsigned target_bio_nr, unsigned *len) 1440 { 1441 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); 1442 struct bio *clone = &tio->clone; 1443 1444 tio->len_ptr = len; 1445 1446 __bio_clone_fast(clone, ci->bio); 1447 if (len) 1448 bio_setup_sector(clone, ci->sector, *len); 1449 1450 __map_bio(tio); 1451 } 1452 1453 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1454 unsigned num_bios, unsigned *len) 1455 { 1456 unsigned target_bio_nr; 1457 1458 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1459 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1460 } 1461 1462 static int __send_empty_flush(struct clone_info *ci) 1463 { 1464 unsigned target_nr = 0; 1465 struct dm_target *ti; 1466 1467 BUG_ON(bio_has_data(ci->bio)); 1468 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1469 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1470 1471 return 0; 1472 } 1473 1474 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1475 sector_t sector, unsigned *len) 1476 { 1477 struct bio *bio = ci->bio; 1478 struct dm_target_io *tio; 1479 unsigned target_bio_nr; 1480 unsigned num_target_bios = 1; 1481 1482 /* 1483 * Does the target want to receive duplicate copies of the bio? 
1484 */ 1485 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1486 num_target_bios = ti->num_write_bios(ti, bio); 1487 1488 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1489 tio = alloc_tio(ci, ti, target_bio_nr); 1490 tio->len_ptr = len; 1491 clone_bio(tio, bio, sector, *len); 1492 __map_bio(tio); 1493 } 1494 } 1495 1496 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1497 1498 static unsigned get_num_discard_bios(struct dm_target *ti) 1499 { 1500 return ti->num_discard_bios; 1501 } 1502 1503 static unsigned get_num_write_same_bios(struct dm_target *ti) 1504 { 1505 return ti->num_write_same_bios; 1506 } 1507 1508 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1509 1510 static bool is_split_required_for_discard(struct dm_target *ti) 1511 { 1512 return ti->split_discard_bios; 1513 } 1514 1515 static int __send_changing_extent_only(struct clone_info *ci, 1516 get_num_bios_fn get_num_bios, 1517 is_split_required_fn is_split_required) 1518 { 1519 struct dm_target *ti; 1520 unsigned len; 1521 unsigned num_bios; 1522 1523 do { 1524 ti = dm_table_find_target(ci->map, ci->sector); 1525 if (!dm_target_is_valid(ti)) 1526 return -EIO; 1527 1528 /* 1529 * Even though the device advertised support for this type of 1530 * request, that does not mean every target supports it, and 1531 * reconfiguration might also have changed that since the 1532 * check was performed. 1533 */ 1534 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1535 if (!num_bios) 1536 return -EOPNOTSUPP; 1537 1538 if (is_split_required && !is_split_required(ti)) 1539 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1540 else 1541 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); 1542 1543 __send_duplicate_bios(ci, ti, num_bios, &len); 1544 1545 ci->sector += len; 1546 } while (ci->sector_count -= len); 1547 1548 return 0; 1549 } 1550 1551 static int __send_discard(struct clone_info *ci) 1552 { 1553 return __send_changing_extent_only(ci, get_num_discard_bios, 1554 is_split_required_for_discard); 1555 } 1556 1557 static int __send_write_same(struct clone_info *ci) 1558 { 1559 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1560 } 1561 1562 /* 1563 * Select the correct strategy for processing a non-flush bio. 1564 */ 1565 static int __split_and_process_non_flush(struct clone_info *ci) 1566 { 1567 struct bio *bio = ci->bio; 1568 struct dm_target *ti; 1569 unsigned len; 1570 1571 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1572 return __send_discard(ci); 1573 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) 1574 return __send_write_same(ci); 1575 1576 ti = dm_table_find_target(ci->map, ci->sector); 1577 if (!dm_target_is_valid(ti)) 1578 return -EIO; 1579 1580 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); 1581 1582 __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1583 1584 ci->sector += len; 1585 ci->sector_count -= len; 1586 1587 return 0; 1588 } 1589 1590 /* 1591 * Entry point to split a bio into clones and submit them to the targets. 
1592 */ 1593 static void __split_and_process_bio(struct mapped_device *md, 1594 struct dm_table *map, struct bio *bio) 1595 { 1596 struct clone_info ci; 1597 int error = 0; 1598 1599 if (unlikely(!map)) { 1600 bio_io_error(bio); 1601 return; 1602 } 1603 1604 ci.map = map; 1605 ci.md = md; 1606 ci.io = alloc_io(md); 1607 ci.io->error = 0; 1608 atomic_set(&ci.io->io_count, 1); 1609 ci.io->bio = bio; 1610 ci.io->md = md; 1611 spin_lock_init(&ci.io->endio_lock); 1612 ci.sector = bio->bi_iter.bi_sector; 1613 1614 start_io_acct(ci.io); 1615 1616 if (bio->bi_rw & REQ_FLUSH) { 1617 ci.bio = &ci.md->flush_bio; 1618 ci.sector_count = 0; 1619 error = __send_empty_flush(&ci); 1620 /* dec_pending submits any data associated with flush */ 1621 } else { 1622 ci.bio = bio; 1623 ci.sector_count = bio_sectors(bio); 1624 while (ci.sector_count && !error) 1625 error = __split_and_process_non_flush(&ci); 1626 } 1627 1628 /* drop the extra reference count */ 1629 dec_pending(ci.io, error); 1630 } 1631 /*----------------------------------------------------------------- 1632 * CRUD END 1633 *---------------------------------------------------------------*/ 1634 1635 static int dm_merge_bvec(struct request_queue *q, 1636 struct bvec_merge_data *bvm, 1637 struct bio_vec *biovec) 1638 { 1639 struct mapped_device *md = q->queuedata; 1640 struct dm_table *map = dm_get_live_table_fast(md); 1641 struct dm_target *ti; 1642 sector_t max_sectors; 1643 int max_size = 0; 1644 1645 if (unlikely(!map)) 1646 goto out; 1647 1648 ti = dm_table_find_target(map, bvm->bi_sector); 1649 if (!dm_target_is_valid(ti)) 1650 goto out; 1651 1652 /* 1653 * Find maximum amount of I/O that won't need splitting 1654 */ 1655 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1656 (sector_t) queue_max_sectors(q)); 1657 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1658 if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ 1659 max_size = 0; 1660 1661 /* 1662 * merge_bvec_fn() returns number of bytes 1663 * it can accept at this offset 1664 * max is precomputed maximal io size 1665 */ 1666 if (max_size && ti->type->merge) 1667 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1668 /* 1669 * If the target doesn't support merge method and some of the devices 1670 * provided their merge_bvec method (we know this by looking for the 1671 * max_hw_sectors that dm_set_device_limits may set), then we can't 1672 * allow bios with multiple vector entries. So always set max_size 1673 * to 0, and the code below allows just one page. 1674 */ 1675 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1676 max_size = 0; 1677 1678 out: 1679 dm_put_live_table_fast(md); 1680 /* 1681 * Always allow an entire first page 1682 */ 1683 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1684 max_size = biovec->bv_len; 1685 1686 return max_size; 1687 } 1688 1689 /* 1690 * The request function that just remaps the bio built up by 1691 * dm_merge_bvec. 
1692 */ 1693 static void _dm_request(struct request_queue *q, struct bio *bio) 1694 { 1695 int rw = bio_data_dir(bio); 1696 struct mapped_device *md = q->queuedata; 1697 int srcu_idx; 1698 struct dm_table *map; 1699 1700 map = dm_get_live_table(md, &srcu_idx); 1701 1702 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); 1703 1704 /* if we're suspended, we have to queue this io for later */ 1705 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1706 dm_put_live_table(md, srcu_idx); 1707 1708 if (bio_rw(bio) != READA) 1709 queue_io(md, bio); 1710 else 1711 bio_io_error(bio); 1712 return; 1713 } 1714 1715 __split_and_process_bio(md, map, bio); 1716 dm_put_live_table(md, srcu_idx); 1717 return; 1718 } 1719 1720 int dm_request_based(struct mapped_device *md) 1721 { 1722 return blk_queue_stackable(md->queue); 1723 } 1724 1725 static void dm_request(struct request_queue *q, struct bio *bio) 1726 { 1727 struct mapped_device *md = q->queuedata; 1728 1729 if (dm_request_based(md)) 1730 blk_queue_bio(q, bio); 1731 else 1732 _dm_request(q, bio); 1733 } 1734 1735 static void dm_dispatch_clone_request(struct request *clone, struct request *rq) 1736 { 1737 int r; 1738 1739 if (blk_queue_io_stat(clone->q)) 1740 clone->cmd_flags |= REQ_IO_STAT; 1741 1742 clone->start_time = jiffies; 1743 r = blk_insert_cloned_request(clone->q, clone); 1744 if (r) 1745 /* must complete clone in terms of original request */ 1746 dm_complete_request(rq, r); 1747 } 1748 1749 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1750 void *data) 1751 { 1752 struct dm_rq_target_io *tio = data; 1753 struct dm_rq_clone_bio_info *info = 1754 container_of(bio, struct dm_rq_clone_bio_info, clone); 1755 1756 info->orig = bio_orig; 1757 info->tio = tio; 1758 bio->bi_end_io = end_clone_bio; 1759 1760 return 0; 1761 } 1762 1763 static int setup_clone(struct request *clone, struct request *rq, 1764 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1765 { 1766 int r; 1767 1768 r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, 1769 dm_rq_bio_constructor, tio); 1770 if (r) 1771 return r; 1772 1773 clone->cmd = rq->cmd; 1774 clone->cmd_len = rq->cmd_len; 1775 clone->sense = rq->sense; 1776 clone->end_io = end_clone_request; 1777 clone->end_io_data = tio; 1778 1779 tio->clone = clone; 1780 1781 return 0; 1782 } 1783 1784 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1785 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1786 { 1787 struct request *clone = alloc_clone_request(md, gfp_mask); 1788 1789 if (!clone) 1790 return NULL; 1791 1792 blk_rq_init(NULL, clone); 1793 if (setup_clone(clone, rq, tio, gfp_mask)) { 1794 /* -ENOMEM */ 1795 free_clone_request(md, clone); 1796 return NULL; 1797 } 1798 1799 return clone; 1800 } 1801 1802 static void map_tio_request(struct kthread_work *work); 1803 1804 static struct dm_rq_target_io *prep_tio(struct request *rq, 1805 struct mapped_device *md, gfp_t gfp_mask) 1806 { 1807 struct dm_rq_target_io *tio; 1808 int srcu_idx; 1809 struct dm_table *table; 1810 1811 tio = alloc_rq_tio(md, gfp_mask); 1812 if (!tio) 1813 return NULL; 1814 1815 tio->md = md; 1816 tio->ti = NULL; 1817 tio->clone = NULL; 1818 tio->orig = rq; 1819 tio->error = 0; 1820 memset(&tio->info, 0, sizeof(tio->info)); 1821 init_kthread_work(&tio->work, map_tio_request); 1822 1823 table = dm_get_live_table(md, &srcu_idx); 1824 if (!dm_table_mq_request_based(table)) { 1825 if (!clone_rq(rq, md, tio, gfp_mask)) { 1826 dm_put_live_table(md, srcu_idx); 1827 
free_rq_tio(tio); 1828 return NULL; 1829 } 1830 } 1831 dm_put_live_table(md, srcu_idx); 1832 1833 return tio; 1834 } 1835 1836 /* 1837 * Called with the queue lock held. 1838 */ 1839 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1840 { 1841 struct mapped_device *md = q->queuedata; 1842 struct dm_rq_target_io *tio; 1843 1844 if (unlikely(rq->special)) { 1845 DMWARN("Already has something in rq->special."); 1846 return BLKPREP_KILL; 1847 } 1848 1849 tio = prep_tio(rq, md, GFP_ATOMIC); 1850 if (!tio) 1851 return BLKPREP_DEFER; 1852 1853 rq->special = tio; 1854 rq->cmd_flags |= REQ_DONTPREP; 1855 1856 return BLKPREP_OK; 1857 } 1858 1859 /* 1860 * Returns: 1861 * 0 : the request has been processed 1862 * DM_MAPIO_REQUEUE : the original request needs to be requeued 1863 * < 0 : the request was completed due to failure 1864 */ 1865 static int map_request(struct dm_target *ti, struct request *rq, 1866 struct mapped_device *md) 1867 { 1868 int r; 1869 struct dm_rq_target_io *tio = rq->special; 1870 struct request *clone = NULL; 1871 1872 if (tio->clone) { 1873 clone = tio->clone; 1874 r = ti->type->map_rq(ti, clone, &tio->info); 1875 } else { 1876 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); 1877 if (r < 0) { 1878 /* The target wants to complete the I/O */ 1879 dm_kill_unmapped_request(rq, r); 1880 return r; 1881 } 1882 if (IS_ERR(clone)) 1883 return DM_MAPIO_REQUEUE; 1884 if (setup_clone(clone, rq, tio, GFP_KERNEL)) { 1885 /* -ENOMEM */ 1886 ti->type->release_clone_rq(clone); 1887 return DM_MAPIO_REQUEUE; 1888 } 1889 } 1890 1891 switch (r) { 1892 case DM_MAPIO_SUBMITTED: 1893 /* The target has taken the I/O to submit by itself later */ 1894 break; 1895 case DM_MAPIO_REMAPPED: 1896 /* The target has remapped the I/O so dispatch it */ 1897 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1898 blk_rq_pos(rq)); 1899 dm_dispatch_clone_request(clone, rq); 1900 break; 1901 case DM_MAPIO_REQUEUE: 1902 /* The target wants to requeue the I/O */ 1903 dm_requeue_unmapped_request(clone); 1904 break; 1905 default: 1906 if (r > 0) { 1907 DMWARN("unimplemented target map return value: %d", r); 1908 BUG(); 1909 } 1910 1911 /* The target wants to complete the I/O */ 1912 dm_kill_unmapped_request(rq, r); 1913 return r; 1914 } 1915 1916 return 0; 1917 } 1918 1919 static void map_tio_request(struct kthread_work *work) 1920 { 1921 struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 1922 struct request *rq = tio->orig; 1923 struct mapped_device *md = tio->md; 1924 1925 if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) 1926 dm_requeue_unmapped_original_request(md, rq); 1927 } 1928 1929 static void dm_start_request(struct mapped_device *md, struct request *orig) 1930 { 1931 blk_start_request(orig); 1932 atomic_inc(&md->pending[rq_data_dir(orig)]); 1933 1934 /* 1935 * Hold the md reference here for the in-flight I/O. 1936 * We can't rely on the reference count by device opener, 1937 * because the device may be closed during the request completion 1938 * when all bios are completed. 1939 * See the comment in rq_completed() too. 1940 */ 1941 dm_get(md); 1942 } 1943 1944 /* 1945 * q->request_fn for request-based dm. 1946 * Called with the queue lock held. 
1947 */ 1948 static void dm_request_fn(struct request_queue *q) 1949 { 1950 struct mapped_device *md = q->queuedata; 1951 int srcu_idx; 1952 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 1953 struct dm_target *ti; 1954 struct request *rq; 1955 struct dm_rq_target_io *tio; 1956 sector_t pos; 1957 1958 /* 1959 * For suspend, check blk_queue_stopped() and increment 1960 * ->pending within a single queue_lock not to increment the 1961 * number of in-flight I/Os after the queue is stopped in 1962 * dm_suspend(). 1963 */ 1964 while (!blk_queue_stopped(q)) { 1965 rq = blk_peek_request(q); 1966 if (!rq) 1967 goto delay_and_out; 1968 1969 /* always use block 0 to find the target for flushes for now */ 1970 pos = 0; 1971 if (!(rq->cmd_flags & REQ_FLUSH)) 1972 pos = blk_rq_pos(rq); 1973 1974 ti = dm_table_find_target(map, pos); 1975 if (!dm_target_is_valid(ti)) { 1976 /* 1977 * Must perform setup, that rq_completed() requires, 1978 * before calling dm_kill_unmapped_request 1979 */ 1980 DMERR_LIMIT("request attempted access beyond the end of device"); 1981 dm_start_request(md, rq); 1982 dm_kill_unmapped_request(rq, -EIO); 1983 continue; 1984 } 1985 1986 if (ti->type->busy && ti->type->busy(ti)) 1987 goto delay_and_out; 1988 1989 dm_start_request(md, rq); 1990 1991 tio = rq->special; 1992 /* Establish tio->ti before queuing work (map_tio_request) */ 1993 tio->ti = ti; 1994 queue_kthread_work(&md->kworker, &tio->work); 1995 BUG_ON(!irqs_disabled()); 1996 } 1997 1998 goto out; 1999 2000 delay_and_out: 2001 blk_delay_queue(q, HZ / 10); 2002 out: 2003 dm_put_live_table(md, srcu_idx); 2004 } 2005 2006 int dm_underlying_device_busy(struct request_queue *q) 2007 { 2008 return blk_lld_busy(q); 2009 } 2010 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 2011 2012 static int dm_lld_busy(struct request_queue *q) 2013 { 2014 int r; 2015 struct mapped_device *md = q->queuedata; 2016 struct dm_table *map = dm_get_live_table_fast(md); 2017 2018 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 2019 r = 1; 2020 else 2021 r = dm_table_any_busy_target(map); 2022 2023 dm_put_live_table_fast(md); 2024 2025 return r; 2026 } 2027 2028 static int dm_any_congested(void *congested_data, int bdi_bits) 2029 { 2030 int r = bdi_bits; 2031 struct mapped_device *md = congested_data; 2032 struct dm_table *map; 2033 2034 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2035 map = dm_get_live_table_fast(md); 2036 if (map) { 2037 /* 2038 * Request-based dm cares about only own queue for 2039 * the query about congestion status of request_queue 2040 */ 2041 if (dm_request_based(md)) 2042 r = md->queue->backing_dev_info.state & 2043 bdi_bits; 2044 else 2045 r = dm_table_any_congested(map, bdi_bits); 2046 } 2047 dm_put_live_table_fast(md); 2048 } 2049 2050 return r; 2051 } 2052 2053 /*----------------------------------------------------------------- 2054 * An IDR is used to keep track of allocated minor numbers. 2055 *---------------------------------------------------------------*/ 2056 static void free_minor(int minor) 2057 { 2058 spin_lock(&_minor_lock); 2059 idr_remove(&_minor_idr, minor); 2060 spin_unlock(&_minor_lock); 2061 } 2062 2063 /* 2064 * See if the device with a specific minor # is free. 
2065 */ 2066 static int specific_minor(int minor) 2067 { 2068 int r; 2069 2070 if (minor >= (1 << MINORBITS)) 2071 return -EINVAL; 2072 2073 idr_preload(GFP_KERNEL); 2074 spin_lock(&_minor_lock); 2075 2076 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 2077 2078 spin_unlock(&_minor_lock); 2079 idr_preload_end(); 2080 if (r < 0) 2081 return r == -ENOSPC ? -EBUSY : r; 2082 return 0; 2083 } 2084 2085 static int next_free_minor(int *minor) 2086 { 2087 int r; 2088 2089 idr_preload(GFP_KERNEL); 2090 spin_lock(&_minor_lock); 2091 2092 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 2093 2094 spin_unlock(&_minor_lock); 2095 idr_preload_end(); 2096 if (r < 0) 2097 return r; 2098 *minor = r; 2099 return 0; 2100 } 2101 2102 static const struct block_device_operations dm_blk_dops; 2103 2104 static void dm_wq_work(struct work_struct *work); 2105 2106 static void dm_init_md_queue(struct mapped_device *md) 2107 { 2108 /* 2109 * Request-based dm devices cannot be stacked on top of bio-based dm 2110 * devices. The type of this dm device has not been decided yet. 2111 * The type is decided at the first table loading time. 2112 * To prevent problematic device stacking, clear the queue flag 2113 * for request stacking support until then. 2114 * 2115 * This queue is new, so no concurrency on the queue_flags. 2116 */ 2117 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2118 2119 md->queue->queuedata = md; 2120 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2121 md->queue->backing_dev_info.congested_data = md; 2122 blk_queue_make_request(md->queue, dm_request); 2123 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2124 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 2125 } 2126 2127 /* 2128 * Allocate and initialise a blank device with a given minor. 
2129 */ 2130 static struct mapped_device *alloc_dev(int minor) 2131 { 2132 int r; 2133 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 2134 void *old_md; 2135 2136 if (!md) { 2137 DMWARN("unable to allocate device, out of memory."); 2138 return NULL; 2139 } 2140 2141 if (!try_module_get(THIS_MODULE)) 2142 goto bad_module_get; 2143 2144 /* get a minor number for the dev */ 2145 if (minor == DM_ANY_MINOR) 2146 r = next_free_minor(&minor); 2147 else 2148 r = specific_minor(minor); 2149 if (r < 0) 2150 goto bad_minor; 2151 2152 r = init_srcu_struct(&md->io_barrier); 2153 if (r < 0) 2154 goto bad_io_barrier; 2155 2156 md->type = DM_TYPE_NONE; 2157 mutex_init(&md->suspend_lock); 2158 mutex_init(&md->type_lock); 2159 mutex_init(&md->table_devices_lock); 2160 spin_lock_init(&md->deferred_lock); 2161 atomic_set(&md->holders, 1); 2162 atomic_set(&md->open_count, 0); 2163 atomic_set(&md->event_nr, 0); 2164 atomic_set(&md->uevent_seq, 0); 2165 INIT_LIST_HEAD(&md->uevent_list); 2166 INIT_LIST_HEAD(&md->table_devices); 2167 spin_lock_init(&md->uevent_lock); 2168 2169 md->queue = blk_alloc_queue(GFP_KERNEL); 2170 if (!md->queue) 2171 goto bad_queue; 2172 2173 dm_init_md_queue(md); 2174 2175 md->disk = alloc_disk(1); 2176 if (!md->disk) 2177 goto bad_disk; 2178 2179 atomic_set(&md->pending[0], 0); 2180 atomic_set(&md->pending[1], 0); 2181 init_waitqueue_head(&md->wait); 2182 INIT_WORK(&md->work, dm_wq_work); 2183 init_waitqueue_head(&md->eventq); 2184 init_completion(&md->kobj_holder.completion); 2185 md->kworker_task = NULL; 2186 2187 md->disk->major = _major; 2188 md->disk->first_minor = minor; 2189 md->disk->fops = &dm_blk_dops; 2190 md->disk->queue = md->queue; 2191 md->disk->private_data = md; 2192 sprintf(md->disk->disk_name, "dm-%d", minor); 2193 add_disk(md->disk); 2194 format_dev_t(md->name, MKDEV(_major, minor)); 2195 2196 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2197 if (!md->wq) 2198 goto bad_thread; 2199 2200 md->bdev = bdget_disk(md->disk, 0); 2201 if (!md->bdev) 2202 goto bad_bdev; 2203 2204 bio_init(&md->flush_bio); 2205 md->flush_bio.bi_bdev = md->bdev; 2206 md->flush_bio.bi_rw = WRITE_FLUSH; 2207 2208 dm_stats_init(&md->stats); 2209 2210 /* Populate the mapping, nobody knows we exist yet */ 2211 spin_lock(&_minor_lock); 2212 old_md = idr_replace(&_minor_idr, md, minor); 2213 spin_unlock(&_minor_lock); 2214 2215 BUG_ON(old_md != MINOR_ALLOCED); 2216 2217 return md; 2218 2219 bad_bdev: 2220 destroy_workqueue(md->wq); 2221 bad_thread: 2222 del_gendisk(md->disk); 2223 put_disk(md->disk); 2224 bad_disk: 2225 blk_cleanup_queue(md->queue); 2226 bad_queue: 2227 cleanup_srcu_struct(&md->io_barrier); 2228 bad_io_barrier: 2229 free_minor(minor); 2230 bad_minor: 2231 module_put(THIS_MODULE); 2232 bad_module_get: 2233 kfree(md); 2234 return NULL; 2235 } 2236 2237 static void unlock_fs(struct mapped_device *md); 2238 2239 static void free_dev(struct mapped_device *md) 2240 { 2241 int minor = MINOR(disk_devt(md->disk)); 2242 2243 unlock_fs(md); 2244 bdput(md->bdev); 2245 destroy_workqueue(md->wq); 2246 2247 if (md->kworker_task) 2248 kthread_stop(md->kworker_task); 2249 if (md->io_pool) 2250 mempool_destroy(md->io_pool); 2251 if (md->rq_pool) 2252 mempool_destroy(md->rq_pool); 2253 if (md->bs) 2254 bioset_free(md->bs); 2255 blk_integrity_unregister(md->disk); 2256 del_gendisk(md->disk); 2257 cleanup_srcu_struct(&md->io_barrier); 2258 free_table_devices(&md->table_devices); 2259 free_minor(minor); 2260 2261 spin_lock(&_minor_lock); 2262 md->disk->private_data = NULL; 2263 
static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	bdput(md->bdev);
	destroy_workqueue(md->wq);

	if (md->kworker_task)
		kthread_stop(md->kworker_task);
	if (md->io_pool)
		mempool_destroy(md->io_pool);
	if (md->rq_pool)
		mempool_destroy(md->rq_pool);
	if (md->bs)
		bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	cleanup_srcu_struct(&md->io_barrier);
	free_table_devices(&md->table_devices);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	dm_stats_cleanup(&md->stats);
	module_put(THIS_MODULE);
	kfree(md);
}

static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p = dm_table_get_md_mempools(t);

	if (md->io_pool && md->bs) {
		/* The md already has the necessary mempools. */
		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
			/*
			 * Reload the bioset because front_pad may have
			 * changed when a different table was loaded.
			 */
			bioset_free(md->bs);
			md->bs = p->bs;
			p->bs = NULL;
		}
		/*
		 * There's no need to reload with request-based dm
		 * because the size of front_pad doesn't change.
		 * Note for the future: if you ever need to reload the
		 * bioset, prep-ed requests in the queue may still refer
		 * to bios from the old bioset, so you must walk through
		 * the queue and unprep them.
		 */
		goto out;
	}

	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->rq_pool = p->rq_pool;
	p->rq_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed; the table no longer needs its mempools */
	dm_table_free_md_mempools(t);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}

/*
 * Return 1 if the queue has a compulsory merge_bvec_fn.
 *
 * If this function returns 0, then the device is either a non-dm
 * device without a merge_bvec_fn, or it is a dm device that is
 * able to split any bios it receives that are too big.
 */
int dm_queue_merge_is_compulsory(struct request_queue *q)
{
	struct mapped_device *dev_md;

	if (!q->merge_bvec_fn)
		return 0;

	if (q->make_request_fn == dm_request) {
		dev_md = q->queuedata;
		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
			return 0;
	}

	return 1;
}

static int dm_device_merge_is_compulsory(struct dm_target *ti,
					 struct dm_dev *dev, sector_t start,
					 sector_t len, void *data)
{
	struct block_device *bdev = dev->bdev;
	struct request_queue *q = bdev_get_queue(bdev);

	return dm_queue_merge_is_compulsory(q);
}

/*
 * Return 1 if it is acceptable to ignore merge_bvec_fn based
 * on the properties of the underlying devices.
 */
static int dm_table_merge_is_optional(struct dm_table *table)
{
	unsigned i = 0;
	struct dm_target *ti;

	while (i < dm_table_get_num_targets(table)) {
		ti = dm_table_get_target(table, i++);

		if (ti->type->iterate_devices &&
		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
			return 0;
	}

	return 1;
}

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	sector_t size;
	int merge_is_optional;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet if the old table type wasn't
	 * request-based during suspension, so stop it now to prevent I/O
	 * from being mapped before resume.
	 * This must be done before the queue restrictions are set, because
	 * request-based dm may start running as soon as they are.
	 */
	if (dm_table_request_based(t) && !blk_queue_stopped(q))
		stop_queue(q);

	__bind_mempools(md, t);

	merge_is_optional = dm_table_merge_is_optional(t);

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	dm_table_set_restrictions(t, q, limits);
	if (merge_is_optional)
		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
	else
		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
	if (old_map)
		dm_sync_table(md);

	return old_map;
}

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = rcu_dereference_protected(md->map, 1);

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	RCU_INIT_POINTER(md->map, NULL);
	dm_sync_table(md);

	return map;
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, unsigned type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}

unsigned dm_get_md_type(struct mapped_device *md)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	return md->type;
}

static bool dm_md_type_request_based(struct mapped_device *md)
{
	unsigned table_type = dm_get_md_type(md);

	return (table_type == DM_TYPE_REQUEST_BASED ||
		table_type == DM_TYPE_MQ_REQUEST_BASED);
}

struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	BUG_ON(!atomic_read(&md->holders));
	return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);

/*
 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
 */
static int dm_init_request_based_queue(struct mapped_device *md)
{
	struct request_queue *q = NULL;

	if (md->queue->elevator)
		return 1;

	/* Fully initialize the queue */
	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
	if (!q)
		return 0;

	md->queue = q;
	dm_init_md_queue(md);
	blk_queue_softirq_done(md->queue, dm_softirq_done);
	blk_queue_prep_rq(md->queue, dm_prep_fn);
	blk_queue_lld_busy(md->queue, dm_lld_busy);

	/* Also initialize the request-based DM worker thread */
	init_kthread_worker(&md->kworker);
	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
				       "kdmwork-%s", dm_device_name(md));

	elv_register_queue(md->queue);

	return 1;
}

/*
 * Setup the DM device's queue based on md's type
 */
int dm_setup_md_queue(struct mapped_device *md)
{
	if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
		DMWARN("Cannot initialize queue for request-based mapped device");
		return -EINVAL;
	}

	return 0;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md) {
		if ((md == MINOR_ALLOCED ||
		     (MINOR(disk_devt(dm_disk(md))) != minor) ||
		     dm_deleting_md(md) ||
		     test_bit(DMF_FREEING, &md->flags))) {
			md = NULL;
			goto out;
		}
		dm_get(md);
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
int dm_hold(struct mapped_device *md)
{
	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags)) {
		spin_unlock(&_minor_lock);
		return -EBUSY;
	}
	dm_get(md);
	spin_unlock(&_minor_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(dm_hold);

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	map = dm_get_live_table(md, &srcu_idx);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	if (dm_request_based(md))
		flush_kthread_worker(&md->kworker);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}
	mutex_unlock(&md->suspend_lock);

	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);

	/*
	 * Rare, but there may still be I/O requests waiting to complete;
	 * wait for all references to disappear.
	 * No one should take a new reference to the mapped_device once its
	 * state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c)
			break;

		if (dm_request_based(md))
			generic_make_request(c);
		else
			__split_and_process_bio(md, map, c);
	}

	dm_put_live_table(md, srcu_idx);
}
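/*
 * Illustrative note (an inference from the code, not authoritative
 * documentation): dm_queue_flush() below is the resume-side counterpart of
 * the suspend path, and its ordering matters:
 *
 *	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, ...);   allow dm_wq_work to drain
 *	smp_mb__after_atomic();			    publish the cleared flag
 *	queue_work(md->wq, &md->work);		    then kick the worker
 *
 * so that the worker, once it runs for this work item, also sees the cleared
 * flag and actually drains md->deferred instead of exiting its loop at once.
 */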
static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_atomic();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path: if all paths disappear,
	 * new I/O is queued based on these limits until some paths reappear.
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are added to the md->deferred list.
 *
 * Caller must hold md->suspend_lock
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, int interruptible)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock.
	 * To prevent any process from reentering
	 * __split_and_process_bio from dm_request and to quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md)) {
		stop_queue(md->queue);
		flush_kthread_worker(&md->kworker);
	}

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, interruptible);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem. For example we might want to move some data in
 * the background. Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
	if (r)
		goto out_unlock;

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
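/*
 * Usage sketch (illustrative; the flag semantics follow from __dm_suspend
 * above):
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);    freeze the fs, flush I/O
 *	dm_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);   don't flush; noflush-aware
 *						   targets park I/O on their
 *						   pushback list instead
 *	dm_suspend(md, 0);			   flush I/O, no fs freeze
 *
 * As noted in __dm_suspend(), noflush takes precedence over lockfs.
 */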
static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);

	r = 0;
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */

static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return; /* nested suspend */
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
	 * would require changing .presuspend to return an error -- avoid this
	 * until there is a need for more elaborate variants of internal suspend.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);

	set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);

	dm_table_postsuspend_targets(map);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* resume from nested suspend */

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
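/*
 * Nesting sketch (illustrative): internal suspends stack, and they also
 * stack on top of a userspace suspend, e.g.
 *
 *	dm_internal_suspend_noflush(md);    count 0 -> 1, device quiesced
 *	dm_internal_suspend_noflush(md);    count 1 -> 2, nothing beyond counting
 *	dm_internal_resume(md);		    count 2 -> 1, still suspended
 *	dm_internal_resume(md);		    count 1 -> 0, I/O flows again
 *
 * If the device was already suspended by userspace, only
 * DMF_SUSPENDED_INTERNALLY is toggled and the userspace suspend stays in
 * effect until dm_resume().
 */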
/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */

void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
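/*
 * Event wait sketch (illustrative; roughly how the ioctl layer's DM_DEV_WAIT
 * path uses the two helpers above):
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *	...report the current device state to the caller...
 *	if (dm_wait_event(md, ev))
 *		return -ERESTARTSYS;	   interrupted by a signal
 *
 * event_callback() bumps md->event_nr and wakes md->eventq whenever a target
 * raises a table event, which lets the waiter re-read the device state.
 */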
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
{
	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
	struct kmem_cache *cachep;
	unsigned int pool_size = 0;
	unsigned int front_pad;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
		cachep = _io_cache;
		pool_size = dm_get_reserved_bio_based_ios();
		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = dm_get_reserved_rq_based_ios();
		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
		if (!pools->rq_pool)
			goto out;
		/* fall through to setup remaining rq-based pools */
	case DM_TYPE_MQ_REQUEST_BASED:
		cachep = _rq_tio_cache;
		if (!pool_size)
			pool_size = dm_get_reserved_rq_based_ios();
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_bio_data_size is not used. See __bind_mempools(). */
		WARN_ON(per_bio_data_size != 0);
		break;
	default:
		goto out;
	}

	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
	if (!pools->io_pool)
		goto out;

	pools->bs = bioset_create_nobvec(pool_size, front_pad);
	if (!pools->bs)
		goto out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->rq_pool)
		mempool_destroy(pools->rq_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
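/*
 * Tuning sketch (illustrative; the exact module name and sysfs paths depend
 * on how the module is built and named -- typically dm_mod -- so treat these
 * as an example, not a specification):
 *
 *	modprobe dm_mod reserved_bio_based_ios=32
 *	echo 512 > /sys/module/dm_mod/parameters/reserved_rq_based_ios
 *
 * Both reserved-IO parameters are declared writable at runtime above
 * (S_IRUGO | S_IWUSR).
 */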