1 /* 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm.h" 9 #include "dm-uevent.h" 10 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/mutex.h> 14 #include <linux/moduleparam.h> 15 #include <linux/blkpg.h> 16 #include <linux/bio.h> 17 #include <linux/mempool.h> 18 #include <linux/slab.h> 19 #include <linux/idr.h> 20 #include <linux/hdreg.h> 21 #include <linux/delay.h> 22 #include <linux/wait.h> 23 #include <linux/kthread.h> 24 25 #include <trace/events/block.h> 26 27 #define DM_MSG_PREFIX "core" 28 29 #ifdef CONFIG_PRINTK 30 /* 31 * ratelimit state to be used in DMXXX_LIMIT(). 32 */ 33 DEFINE_RATELIMIT_STATE(dm_ratelimit_state, 34 DEFAULT_RATELIMIT_INTERVAL, 35 DEFAULT_RATELIMIT_BURST); 36 EXPORT_SYMBOL(dm_ratelimit_state); 37 #endif 38 39 /* 40 * Cookies are numeric values sent with CHANGE and REMOVE 41 * uevents while resuming, removing or renaming the device. 42 */ 43 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 44 #define DM_COOKIE_LENGTH 24 45 46 static const char *_name = DM_NAME; 47 48 static unsigned int major = 0; 49 static unsigned int _major = 0; 50 51 static DEFINE_IDR(_minor_idr); 52 53 static DEFINE_SPINLOCK(_minor_lock); 54 55 static void do_deferred_remove(struct work_struct *w); 56 57 static DECLARE_WORK(deferred_remove_work, do_deferred_remove); 58 59 static struct workqueue_struct *deferred_remove_workqueue; 60 61 /* 62 * For bio-based dm. 63 * One of these is allocated per bio. 64 */ 65 struct dm_io { 66 struct mapped_device *md; 67 int error; 68 atomic_t io_count; 69 struct bio *bio; 70 unsigned long start_time; 71 spinlock_t endio_lock; 72 struct dm_stats_aux stats_aux; 73 }; 74 75 /* 76 * For request-based dm. 77 * One of these is allocated per request. 78 */ 79 struct dm_rq_target_io { 80 struct mapped_device *md; 81 struct dm_target *ti; 82 struct request *orig, *clone; 83 struct kthread_work work; 84 int error; 85 union map_info info; 86 }; 87 88 /* 89 * For request-based dm - the bio clones we allocate are embedded in these 90 * structs. 91 * 92 * We allocate these with bio_alloc_bioset, using the front_pad parameter when 93 * the bioset is created - this means the bio has to come at the end of the 94 * struct. 95 */ 96 struct dm_rq_clone_bio_info { 97 struct bio *orig; 98 struct dm_rq_target_io *tio; 99 struct bio clone; 100 }; 101 102 union map_info *dm_get_rq_mapinfo(struct request *rq) 103 { 104 if (rq && rq->end_io_data) 105 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 106 return NULL; 107 } 108 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 109 110 #define MINOR_ALLOCED ((void *)-1) 111 112 /* 113 * Bits for the md->flags field. 114 */ 115 #define DMF_BLOCK_IO_FOR_SUSPEND 0 116 #define DMF_SUSPENDED 1 117 #define DMF_FROZEN 2 118 #define DMF_FREEING 3 119 #define DMF_DELETING 4 120 #define DMF_NOFLUSH_SUSPENDING 5 121 #define DMF_MERGE_IS_OPTIONAL 6 122 #define DMF_DEFERRED_REMOVE 7 123 #define DMF_SUSPENDED_INTERNALLY 8 124 125 /* 126 * A dummy definition to make RCU happy. 127 * struct dm_table should never be dereferenced in this file. 128 */ 129 struct dm_table { 130 int undefined__; 131 }; 132 133 /* 134 * Work processed by per-device workqueue. 135 */ 136 struct mapped_device { 137 struct srcu_struct io_barrier; 138 struct mutex suspend_lock; 139 atomic_t holders; 140 atomic_t open_count; 141 142 /* 143 * The current mapping. 
144 * Use dm_get_live_table{_fast} or take suspend_lock for 145 * dereference. 146 */ 147 struct dm_table __rcu *map; 148 149 struct list_head table_devices; 150 struct mutex table_devices_lock; 151 152 unsigned long flags; 153 154 struct request_queue *queue; 155 unsigned type; 156 /* Protect queue and type against concurrent access. */ 157 struct mutex type_lock; 158 159 struct target_type *immutable_target_type; 160 161 struct gendisk *disk; 162 char name[16]; 163 164 void *interface_ptr; 165 166 /* 167 * A list of ios that arrived while we were suspended. 168 */ 169 atomic_t pending[2]; 170 wait_queue_head_t wait; 171 struct work_struct work; 172 struct bio_list deferred; 173 spinlock_t deferred_lock; 174 175 /* 176 * Processing queue (flush) 177 */ 178 struct workqueue_struct *wq; 179 180 /* 181 * io objects are allocated from here. 182 */ 183 mempool_t *io_pool; 184 mempool_t *rq_pool; 185 186 struct bio_set *bs; 187 188 /* 189 * Event handling. 190 */ 191 atomic_t event_nr; 192 wait_queue_head_t eventq; 193 atomic_t uevent_seq; 194 struct list_head uevent_list; 195 spinlock_t uevent_lock; /* Protect access to uevent_list */ 196 197 /* 198 * freeze/thaw support require holding onto a super block 199 */ 200 struct super_block *frozen_sb; 201 struct block_device *bdev; 202 203 /* forced geometry settings */ 204 struct hd_geometry geometry; 205 206 /* kobject and completion */ 207 struct dm_kobject_holder kobj_holder; 208 209 /* zero-length flush that will be cloned and submitted to targets */ 210 struct bio flush_bio; 211 212 struct dm_stats stats; 213 214 struct kthread_worker kworker; 215 struct task_struct *kworker_task; 216 }; 217 218 /* 219 * For mempools pre-allocation at the table loading time. 220 */ 221 struct dm_md_mempools { 222 mempool_t *io_pool; 223 mempool_t *rq_pool; 224 struct bio_set *bs; 225 }; 226 227 struct table_device { 228 struct list_head list; 229 atomic_t count; 230 struct dm_dev dm_dev; 231 }; 232 233 #define RESERVED_BIO_BASED_IOS 16 234 #define RESERVED_REQUEST_BASED_IOS 256 235 #define RESERVED_MAX_IOS 1024 236 static struct kmem_cache *_io_cache; 237 static struct kmem_cache *_rq_tio_cache; 238 static struct kmem_cache *_rq_cache; 239 240 /* 241 * Bio-based DM's mempools' reserved IOs set by the user. 242 */ 243 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; 244 245 /* 246 * Request-based DM's mempools' reserved IOs set by the user. 
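 *
 * __dm_get_reserved_ios() below sanitises whatever the user wrote: a value
 * of 0 falls back to the default (RESERVED_REQUEST_BASED_IOS here), anything
 * above RESERVED_MAX_IOS is capped at RESERVED_MAX_IOS, and the corrected
 * value is written back with a best-effort cmpxchg().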
247 */ 248 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; 249 250 static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, 251 unsigned def, unsigned max) 252 { 253 unsigned ios = ACCESS_ONCE(*reserved_ios); 254 unsigned modified_ios = 0; 255 256 if (!ios) 257 modified_ios = def; 258 else if (ios > max) 259 modified_ios = max; 260 261 if (modified_ios) { 262 (void)cmpxchg(reserved_ios, ios, modified_ios); 263 ios = modified_ios; 264 } 265 266 return ios; 267 } 268 269 unsigned dm_get_reserved_bio_based_ios(void) 270 { 271 return __dm_get_reserved_ios(&reserved_bio_based_ios, 272 RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); 273 } 274 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); 275 276 unsigned dm_get_reserved_rq_based_ios(void) 277 { 278 return __dm_get_reserved_ios(&reserved_rq_based_ios, 279 RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); 280 } 281 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); 282 283 static int __init local_init(void) 284 { 285 int r = -ENOMEM; 286 287 /* allocate a slab for the dm_ios */ 288 _io_cache = KMEM_CACHE(dm_io, 0); 289 if (!_io_cache) 290 return r; 291 292 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 293 if (!_rq_tio_cache) 294 goto out_free_io_cache; 295 296 _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), 297 __alignof__(struct request), 0, NULL); 298 if (!_rq_cache) 299 goto out_free_rq_tio_cache; 300 301 r = dm_uevent_init(); 302 if (r) 303 goto out_free_rq_cache; 304 305 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); 306 if (!deferred_remove_workqueue) { 307 r = -ENOMEM; 308 goto out_uevent_exit; 309 } 310 311 _major = major; 312 r = register_blkdev(_major, _name); 313 if (r < 0) 314 goto out_free_workqueue; 315 316 if (!_major) 317 _major = r; 318 319 return 0; 320 321 out_free_workqueue: 322 destroy_workqueue(deferred_remove_workqueue); 323 out_uevent_exit: 324 dm_uevent_exit(); 325 out_free_rq_cache: 326 kmem_cache_destroy(_rq_cache); 327 out_free_rq_tio_cache: 328 kmem_cache_destroy(_rq_tio_cache); 329 out_free_io_cache: 330 kmem_cache_destroy(_io_cache); 331 332 return r; 333 } 334 335 static void local_exit(void) 336 { 337 flush_scheduled_work(); 338 destroy_workqueue(deferred_remove_workqueue); 339 340 kmem_cache_destroy(_rq_cache); 341 kmem_cache_destroy(_rq_tio_cache); 342 kmem_cache_destroy(_io_cache); 343 unregister_blkdev(_major, _name); 344 dm_uevent_exit(); 345 346 _major = 0; 347 348 DMINFO("cleaned up"); 349 } 350 351 static int (*_inits[])(void) __initdata = { 352 local_init, 353 dm_target_init, 354 dm_linear_init, 355 dm_stripe_init, 356 dm_io_init, 357 dm_kcopyd_init, 358 dm_interface_init, 359 dm_statistics_init, 360 }; 361 362 static void (*_exits[])(void) = { 363 local_exit, 364 dm_target_exit, 365 dm_linear_exit, 366 dm_stripe_exit, 367 dm_io_exit, 368 dm_kcopyd_exit, 369 dm_interface_exit, 370 dm_statistics_exit, 371 }; 372 373 static int __init dm_init(void) 374 { 375 const int count = ARRAY_SIZE(_inits); 376 377 int r, i; 378 379 for (i = 0; i < count; i++) { 380 r = _inits[i](); 381 if (r) 382 goto bad; 383 } 384 385 return 0; 386 387 bad: 388 while (i--) 389 _exits[i](); 390 391 return r; 392 } 393 394 static void __exit dm_exit(void) 395 { 396 int i = ARRAY_SIZE(_exits); 397 398 while (i--) 399 _exits[i](); 400 401 /* 402 * Should be empty by this point. 
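 * (free_dev() releases each device's minor via free_minor(), so only a
 * leaked mapped_device could leave an entry behind)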
403 */ 404 idr_destroy(&_minor_idr); 405 } 406 407 /* 408 * Block device functions 409 */ 410 int dm_deleting_md(struct mapped_device *md) 411 { 412 return test_bit(DMF_DELETING, &md->flags); 413 } 414 415 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 416 { 417 struct mapped_device *md; 418 419 spin_lock(&_minor_lock); 420 421 md = bdev->bd_disk->private_data; 422 if (!md) 423 goto out; 424 425 if (test_bit(DMF_FREEING, &md->flags) || 426 dm_deleting_md(md)) { 427 md = NULL; 428 goto out; 429 } 430 431 dm_get(md); 432 atomic_inc(&md->open_count); 433 434 out: 435 spin_unlock(&_minor_lock); 436 437 return md ? 0 : -ENXIO; 438 } 439 440 static void dm_blk_close(struct gendisk *disk, fmode_t mode) 441 { 442 struct mapped_device *md = disk->private_data; 443 444 spin_lock(&_minor_lock); 445 446 if (atomic_dec_and_test(&md->open_count) && 447 (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) 448 queue_work(deferred_remove_workqueue, &deferred_remove_work); 449 450 dm_put(md); 451 452 spin_unlock(&_minor_lock); 453 } 454 455 int dm_open_count(struct mapped_device *md) 456 { 457 return atomic_read(&md->open_count); 458 } 459 460 /* 461 * Guarantees nothing is using the device before it's deleted. 462 */ 463 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) 464 { 465 int r = 0; 466 467 spin_lock(&_minor_lock); 468 469 if (dm_open_count(md)) { 470 r = -EBUSY; 471 if (mark_deferred) 472 set_bit(DMF_DEFERRED_REMOVE, &md->flags); 473 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) 474 r = -EEXIST; 475 else 476 set_bit(DMF_DELETING, &md->flags); 477 478 spin_unlock(&_minor_lock); 479 480 return r; 481 } 482 483 int dm_cancel_deferred_remove(struct mapped_device *md) 484 { 485 int r = 0; 486 487 spin_lock(&_minor_lock); 488 489 if (test_bit(DMF_DELETING, &md->flags)) 490 r = -EBUSY; 491 else 492 clear_bit(DMF_DEFERRED_REMOVE, &md->flags); 493 494 spin_unlock(&_minor_lock); 495 496 return r; 497 } 498 499 static void do_deferred_remove(struct work_struct *w) 500 { 501 dm_deferred_remove(); 502 } 503 504 sector_t dm_get_size(struct mapped_device *md) 505 { 506 return get_capacity(md->disk); 507 } 508 509 struct request_queue *dm_get_md_queue(struct mapped_device *md) 510 { 511 return md->queue; 512 } 513 514 struct dm_stats *dm_get_stats(struct mapped_device *md) 515 { 516 return &md->stats; 517 } 518 519 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 520 { 521 struct mapped_device *md = bdev->bd_disk->private_data; 522 523 return dm_get_geometry(md, geo); 524 } 525 526 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 527 unsigned int cmd, unsigned long arg) 528 { 529 struct mapped_device *md = bdev->bd_disk->private_data; 530 int srcu_idx; 531 struct dm_table *map; 532 struct dm_target *tgt; 533 int r = -ENOTTY; 534 535 retry: 536 map = dm_get_live_table(md, &srcu_idx); 537 538 if (!map || !dm_table_get_size(map)) 539 goto out; 540 541 /* We only support devices that have a single target */ 542 if (dm_table_get_num_targets(map) != 1) 543 goto out; 544 545 tgt = dm_table_get_target(map, 0); 546 if (!tgt->type->ioctl) 547 goto out; 548 549 if (dm_suspended_md(md)) { 550 r = -EAGAIN; 551 goto out; 552 } 553 554 r = tgt->type->ioctl(tgt, cmd, arg); 555 556 out: 557 dm_put_live_table(md, srcu_idx); 558 559 if (r == -ENOTCONN) { 560 msleep(10); 561 goto retry; 562 } 563 564 return r; 565 } 566 567 static struct dm_io *alloc_io(struct mapped_device *md) 568 { 569 return 
mempool_alloc(md->io_pool, GFP_NOIO); 570 } 571 572 static void free_io(struct mapped_device *md, struct dm_io *io) 573 { 574 mempool_free(io, md->io_pool); 575 } 576 577 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 578 { 579 bio_put(&tio->clone); 580 } 581 582 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 583 gfp_t gfp_mask) 584 { 585 return mempool_alloc(md->io_pool, gfp_mask); 586 } 587 588 static void free_rq_tio(struct dm_rq_target_io *tio) 589 { 590 mempool_free(tio, tio->md->io_pool); 591 } 592 593 static struct request *alloc_clone_request(struct mapped_device *md, 594 gfp_t gfp_mask) 595 { 596 return mempool_alloc(md->rq_pool, gfp_mask); 597 } 598 599 static void free_clone_request(struct mapped_device *md, struct request *rq) 600 { 601 mempool_free(rq, md->rq_pool); 602 } 603 604 static int md_in_flight(struct mapped_device *md) 605 { 606 return atomic_read(&md->pending[READ]) + 607 atomic_read(&md->pending[WRITE]); 608 } 609 610 static void start_io_acct(struct dm_io *io) 611 { 612 struct mapped_device *md = io->md; 613 struct bio *bio = io->bio; 614 int cpu; 615 int rw = bio_data_dir(bio); 616 617 io->start_time = jiffies; 618 619 cpu = part_stat_lock(); 620 part_round_stats(cpu, &dm_disk(md)->part0); 621 part_stat_unlock(); 622 atomic_set(&dm_disk(md)->part0.in_flight[rw], 623 atomic_inc_return(&md->pending[rw])); 624 625 if (unlikely(dm_stats_used(&md->stats))) 626 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 627 bio_sectors(bio), false, 0, &io->stats_aux); 628 } 629 630 static void end_io_acct(struct dm_io *io) 631 { 632 struct mapped_device *md = io->md; 633 struct bio *bio = io->bio; 634 unsigned long duration = jiffies - io->start_time; 635 int pending; 636 int rw = bio_data_dir(bio); 637 638 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); 639 640 if (unlikely(dm_stats_used(&md->stats))) 641 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, 642 bio_sectors(bio), true, duration, &io->stats_aux); 643 644 /* 645 * After this is decremented the bio must not be touched if it is 646 * a flush. 647 */ 648 pending = atomic_dec_return(&md->pending[rw]); 649 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 650 pending += atomic_read(&md->pending[rw^0x1]); 651 652 /* nudge anyone waiting on suspend queue */ 653 if (!pending) 654 wake_up(&md->wait); 655 } 656 657 /* 658 * Add the bio to the list of deferred io. 659 */ 660 static void queue_io(struct mapped_device *md, struct bio *bio) 661 { 662 unsigned long flags; 663 664 spin_lock_irqsave(&md->deferred_lock, flags); 665 bio_list_add(&md->deferred, bio); 666 spin_unlock_irqrestore(&md->deferred_lock, flags); 667 queue_work(md->wq, &md->work); 668 } 669 670 /* 671 * Everyone (including functions in this file), should use this 672 * function to access the md->map field, and make sure they call 673 * dm_put_live_table() when finished. 
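 *
 * A minimal usage sketch (dm_blk_ioctl() above follows this pattern):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... look up targets, submit I/O, etc. ...
 *	}
 *	dm_put_live_table(md, srcu_idx);
 *
 * Unlike the _fast variant further down, the caller may block while the
 * SRCU read lock is held.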
674 */ 675 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) 676 { 677 *srcu_idx = srcu_read_lock(&md->io_barrier); 678 679 return srcu_dereference(md->map, &md->io_barrier); 680 } 681 682 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) 683 { 684 srcu_read_unlock(&md->io_barrier, srcu_idx); 685 } 686 687 void dm_sync_table(struct mapped_device *md) 688 { 689 synchronize_srcu(&md->io_barrier); 690 synchronize_rcu_expedited(); 691 } 692 693 /* 694 * A fast alternative to dm_get_live_table/dm_put_live_table. 695 * The caller must not block between these two functions. 696 */ 697 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) 698 { 699 rcu_read_lock(); 700 return rcu_dereference(md->map); 701 } 702 703 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) 704 { 705 rcu_read_unlock(); 706 } 707 708 /* 709 * Open a table device so we can use it as a map destination. 710 */ 711 static int open_table_device(struct table_device *td, dev_t dev, 712 struct mapped_device *md) 713 { 714 static char *_claim_ptr = "I belong to device-mapper"; 715 struct block_device *bdev; 716 717 int r; 718 719 BUG_ON(td->dm_dev.bdev); 720 721 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); 722 if (IS_ERR(bdev)) 723 return PTR_ERR(bdev); 724 725 r = bd_link_disk_holder(bdev, dm_disk(md)); 726 if (r) { 727 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); 728 return r; 729 } 730 731 td->dm_dev.bdev = bdev; 732 return 0; 733 } 734 735 /* 736 * Close a table device that we've been using. 737 */ 738 static void close_table_device(struct table_device *td, struct mapped_device *md) 739 { 740 if (!td->dm_dev.bdev) 741 return; 742 743 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); 744 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); 745 td->dm_dev.bdev = NULL; 746 } 747 748 static struct table_device *find_table_device(struct list_head *l, dev_t dev, 749 fmode_t mode) { 750 struct table_device *td; 751 752 list_for_each_entry(td, l, list) 753 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) 754 return td; 755 756 return NULL; 757 } 758 759 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, 760 struct dm_dev **result) { 761 int r; 762 struct table_device *td; 763 764 mutex_lock(&md->table_devices_lock); 765 td = find_table_device(&md->table_devices, dev, mode); 766 if (!td) { 767 td = kmalloc(sizeof(*td), GFP_KERNEL); 768 if (!td) { 769 mutex_unlock(&md->table_devices_lock); 770 return -ENOMEM; 771 } 772 773 td->dm_dev.mode = mode; 774 td->dm_dev.bdev = NULL; 775 776 if ((r = open_table_device(td, dev, md))) { 777 mutex_unlock(&md->table_devices_lock); 778 kfree(td); 779 return r; 780 } 781 782 format_dev_t(td->dm_dev.name, dev); 783 784 atomic_set(&td->count, 0); 785 list_add(&td->list, &md->table_devices); 786 } 787 atomic_inc(&td->count); 788 mutex_unlock(&md->table_devices_lock); 789 790 *result = &td->dm_dev; 791 return 0; 792 } 793 EXPORT_SYMBOL_GPL(dm_get_table_device); 794 795 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) 796 { 797 struct table_device *td = container_of(d, struct table_device, dm_dev); 798 799 mutex_lock(&md->table_devices_lock); 800 if (atomic_dec_and_test(&td->count)) { 801 close_table_device(td, md); 802 list_del(&td->list); 803 kfree(td); 804 } 805 mutex_unlock(&md->table_devices_lock); 806 } 807 EXPORT_SYMBOL(dm_put_table_device); 808 809 
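/*
 * Illustrative sketch only (hypothetical caller, error handling trimmed):
 * a user of the two helpers above that needs to pin an underlying device,
 * with "devt" standing in for a device number obtained elsewhere:
 *
 *	struct dm_dev *dev;
 *	int r;
 *
 *	r = dm_get_table_device(md, devt, FMODE_READ | FMODE_WRITE, &dev);
 *	if (r)
 *		return r;
 *
 *	... submit I/O to dev->bdev ...
 *
 *	dm_put_table_device(md, dev);
 *
 * Repeated gets of the same (devt, mode) pair share one table_device and
 * only bump its reference count.
 */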
static void free_table_devices(struct list_head *devices)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct table_device *td = list_entry(tmp, struct table_device, list);

		DMWARN("dm_destroy: %s still exists with %d references",
		       td->dm_dev.name, atomic_read(&td->count));
		kfree(td);
	}
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 * A more elegant solution that uses the queue merge fn is in the
 * works, but it needs a couple of block layer changes that have not
 * been made yet. So, in the interest of getting something people can
 * use now, you get this clearly demarcated stopgap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md))
				bio_list_add_head(&md->deferred, io->bio);
			else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;
		end_io_acct(io);
		free_io(md, io);

		if (io_error == DM_ENDIO_REQUEUE)
			return;

		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_FLUSH.
			 */
			bio->bi_rw &= ~REQ_FLUSH;
			queue_io(md, bio);
		} else {
			/* done with normal IO or empty flush */
			trace_block_bio_complete(md->queue, bio, io_error);
			bio_endio(bio, io_error);
		}
	}
}

static void disable_write_same(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE SAME, disable it */
	limits->max_write_same_sectors = 0;
}

static void clone_endio(struct bio *bio, int error)
{
	int r = error;
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
		     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
		disable_write_same(md);

	free_tio(md, tio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info =
		container_of(clone, struct dm_rq_clone_bio_info, clone);
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_iter.bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error has occurred, just let clone->end_io()
		 * handle the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver,
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something is wrong.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
	 * Do not use blk_end_request() here, because it may complete
	 * the original request before the clone, and break the ordering.
	 */
	blk_update_request(tio->orig, 0, nr_bytes);
}

/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed by dm_put() at the end of this function.
 * Alternatively, do dm_get() before calling this function and dm_put()
 * later.
 */
static void rq_completed(struct mapped_device *md, int rw, int run_queue)
{
	atomic_dec(&md->pending[rw]);

	/* nudge anyone waiting on suspend queue */
	if (!md_in_flight(md))
		wake_up(&md->wait);

	/*
	 * Run this off this callpath, as drivers could invoke end_io while
	 * inside their request_fn (and holding the queue lock).
Calling 1030 * back into ->request_fn() could deadlock attempting to grab the 1031 * queue lock again. 1032 */ 1033 if (run_queue) 1034 blk_run_queue_async(md->queue); 1035 1036 /* 1037 * dm_put() must be at the end of this function. See the comment above 1038 */ 1039 dm_put(md); 1040 } 1041 1042 static void free_rq_clone(struct request *clone) 1043 { 1044 struct dm_rq_target_io *tio = clone->end_io_data; 1045 1046 blk_rq_unprep_clone(clone); 1047 free_clone_request(tio->md, clone); 1048 free_rq_tio(tio); 1049 } 1050 1051 /* 1052 * Complete the clone and the original request. 1053 * Must be called without queue lock. 1054 */ 1055 static void dm_end_request(struct request *clone, int error) 1056 { 1057 int rw = rq_data_dir(clone); 1058 struct dm_rq_target_io *tio = clone->end_io_data; 1059 struct mapped_device *md = tio->md; 1060 struct request *rq = tio->orig; 1061 1062 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 1063 rq->errors = clone->errors; 1064 rq->resid_len = clone->resid_len; 1065 1066 if (rq->sense) 1067 /* 1068 * We are using the sense buffer of the original 1069 * request. 1070 * So setting the length of the sense data is enough. 1071 */ 1072 rq->sense_len = clone->sense_len; 1073 } 1074 1075 free_rq_clone(clone); 1076 blk_end_request_all(rq, error); 1077 rq_completed(md, rw, true); 1078 } 1079 1080 static void dm_unprep_request(struct request *rq) 1081 { 1082 struct request *clone = rq->special; 1083 1084 rq->special = NULL; 1085 rq->cmd_flags &= ~REQ_DONTPREP; 1086 1087 free_rq_clone(clone); 1088 } 1089 1090 /* 1091 * Requeue the original request of a clone. 1092 */ 1093 static void dm_requeue_unmapped_request(struct request *clone) 1094 { 1095 int rw = rq_data_dir(clone); 1096 struct dm_rq_target_io *tio = clone->end_io_data; 1097 struct mapped_device *md = tio->md; 1098 struct request *rq = tio->orig; 1099 struct request_queue *q = rq->q; 1100 unsigned long flags; 1101 1102 dm_unprep_request(rq); 1103 1104 spin_lock_irqsave(q->queue_lock, flags); 1105 blk_requeue_request(q, rq); 1106 spin_unlock_irqrestore(q->queue_lock, flags); 1107 1108 rq_completed(md, rw, 0); 1109 } 1110 1111 static void __stop_queue(struct request_queue *q) 1112 { 1113 blk_stop_queue(q); 1114 } 1115 1116 static void stop_queue(struct request_queue *q) 1117 { 1118 unsigned long flags; 1119 1120 spin_lock_irqsave(q->queue_lock, flags); 1121 __stop_queue(q); 1122 spin_unlock_irqrestore(q->queue_lock, flags); 1123 } 1124 1125 static void __start_queue(struct request_queue *q) 1126 { 1127 if (blk_queue_stopped(q)) 1128 blk_start_queue(q); 1129 } 1130 1131 static void start_queue(struct request_queue *q) 1132 { 1133 unsigned long flags; 1134 1135 spin_lock_irqsave(q->queue_lock, flags); 1136 __start_queue(q); 1137 spin_unlock_irqrestore(q->queue_lock, flags); 1138 } 1139 1140 static void dm_done(struct request *clone, int error, bool mapped) 1141 { 1142 int r = error; 1143 struct dm_rq_target_io *tio = clone->end_io_data; 1144 dm_request_endio_fn rq_end_io = NULL; 1145 1146 if (tio->ti) { 1147 rq_end_io = tio->ti->type->rq_end_io; 1148 1149 if (mapped && rq_end_io) 1150 r = rq_end_io(tio->ti, clone, error, &tio->info); 1151 } 1152 1153 if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && 1154 !clone->q->limits.max_write_same_sectors)) 1155 disable_write_same(tio->md); 1156 1157 if (r <= 0) 1158 /* The target wants to complete the I/O */ 1159 dm_end_request(clone, r); 1160 else if (r == DM_ENDIO_INCOMPLETE) 1161 /* The target will handle the I/O */ 1162 return; 1163 else if (r == 
DM_ENDIO_REQUEUE) 1164 /* The target wants to requeue the I/O */ 1165 dm_requeue_unmapped_request(clone); 1166 else { 1167 DMWARN("unimplemented target endio return value: %d", r); 1168 BUG(); 1169 } 1170 } 1171 1172 /* 1173 * Request completion handler for request-based dm 1174 */ 1175 static void dm_softirq_done(struct request *rq) 1176 { 1177 bool mapped = true; 1178 struct request *clone = rq->completion_data; 1179 struct dm_rq_target_io *tio = clone->end_io_data; 1180 1181 if (rq->cmd_flags & REQ_FAILED) 1182 mapped = false; 1183 1184 dm_done(clone, tio->error, mapped); 1185 } 1186 1187 /* 1188 * Complete the clone and the original request with the error status 1189 * through softirq context. 1190 */ 1191 static void dm_complete_request(struct request *clone, int error) 1192 { 1193 struct dm_rq_target_io *tio = clone->end_io_data; 1194 struct request *rq = tio->orig; 1195 1196 tio->error = error; 1197 rq->completion_data = clone; 1198 blk_complete_request(rq); 1199 } 1200 1201 /* 1202 * Complete the not-mapped clone and the original request with the error status 1203 * through softirq context. 1204 * Target's rq_end_io() function isn't called. 1205 * This may be used when the target's map_rq() function fails. 1206 */ 1207 static void dm_kill_unmapped_request(struct request *clone, int error) 1208 { 1209 struct dm_rq_target_io *tio = clone->end_io_data; 1210 struct request *rq = tio->orig; 1211 1212 rq->cmd_flags |= REQ_FAILED; 1213 dm_complete_request(clone, error); 1214 } 1215 1216 /* 1217 * Called with the queue lock held 1218 */ 1219 static void end_clone_request(struct request *clone, int error) 1220 { 1221 /* 1222 * For just cleaning up the information of the queue in which 1223 * the clone was dispatched. 1224 * The clone is *NOT* freed actually here because it is alloced from 1225 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 1226 */ 1227 __blk_put_request(clone->q, clone); 1228 1229 /* 1230 * Actual request completion is done in a softirq context which doesn't 1231 * hold the queue lock. Otherwise, deadlock could occur because: 1232 * - another request may be submitted by the upper level driver 1233 * of the stacking during the completion 1234 * - the submission which requires queue lock may be done 1235 * against this queue 1236 */ 1237 dm_complete_request(clone, error); 1238 } 1239 1240 /* 1241 * Return maximum size of I/O possible at the supplied sector up to the current 1242 * target boundary. 1243 */ 1244 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 1245 { 1246 sector_t target_offset = dm_target_offset(ti, sector); 1247 1248 return ti->len - target_offset; 1249 } 1250 1251 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 1252 { 1253 sector_t len = max_io_len_target_boundary(sector, ti); 1254 sector_t offset, max_len; 1255 1256 /* 1257 * Does the target need to split even further? 
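 * (for example, with ti->max_io_len = 128 an io that starts 200 sectors
 * into the target may span at most 56 sectors here, because the next
 * internal boundary sits at sector 256)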
1258 */ 1259 if (ti->max_io_len) { 1260 offset = dm_target_offset(ti, sector); 1261 if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) 1262 max_len = sector_div(offset, ti->max_io_len); 1263 else 1264 max_len = offset & (ti->max_io_len - 1); 1265 max_len = ti->max_io_len - max_len; 1266 1267 if (len > max_len) 1268 len = max_len; 1269 } 1270 1271 return len; 1272 } 1273 1274 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) 1275 { 1276 if (len > UINT_MAX) { 1277 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", 1278 (unsigned long long)len, UINT_MAX); 1279 ti->error = "Maximum size of target IO is too large"; 1280 return -EINVAL; 1281 } 1282 1283 ti->max_io_len = (uint32_t) len; 1284 1285 return 0; 1286 } 1287 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); 1288 1289 /* 1290 * A target may call dm_accept_partial_bio only from the map routine. It is 1291 * allowed for all bio types except REQ_FLUSH. 1292 * 1293 * dm_accept_partial_bio informs the dm that the target only wants to process 1294 * additional n_sectors sectors of the bio and the rest of the data should be 1295 * sent in a next bio. 1296 * 1297 * A diagram that explains the arithmetics: 1298 * +--------------------+---------------+-------+ 1299 * | 1 | 2 | 3 | 1300 * +--------------------+---------------+-------+ 1301 * 1302 * <-------------- *tio->len_ptr ---------------> 1303 * <------- bi_size -------> 1304 * <-- n_sectors --> 1305 * 1306 * Region 1 was already iterated over with bio_advance or similar function. 1307 * (it may be empty if the target doesn't use bio_advance) 1308 * Region 2 is the remaining bio size that the target wants to process. 1309 * (it may be empty if region 1 is non-empty, although there is no reason 1310 * to make it empty) 1311 * The target requires that region 3 is to be sent in the next bio. 1312 * 1313 * If the target wants to receive multiple copies of the bio (via num_*bios, etc), 1314 * the partially processed part (the sum of regions 1+2) must be the same for all 1315 * copies of the bio. 1316 */ 1317 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) 1318 { 1319 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); 1320 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; 1321 BUG_ON(bio->bi_rw & REQ_FLUSH); 1322 BUG_ON(bi_size > *tio->len_ptr); 1323 BUG_ON(n_sectors > bi_size); 1324 *tio->len_ptr -= bi_size - n_sectors; 1325 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; 1326 } 1327 EXPORT_SYMBOL_GPL(dm_accept_partial_bio); 1328 1329 static void __map_bio(struct dm_target_io *tio) 1330 { 1331 int r; 1332 sector_t sector; 1333 struct mapped_device *md; 1334 struct bio *clone = &tio->clone; 1335 struct dm_target *ti = tio->ti; 1336 1337 clone->bi_end_io = clone_endio; 1338 1339 /* 1340 * Map the clone. If r == 0 we don't need to do 1341 * anything, the target has assumed ownership of 1342 * this io. 
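 * DM_MAPIO_REMAPPED means the clone was redirected and is dispatched
 * below via generic_make_request(); a negative errno or DM_MAPIO_REQUEUE
 * hands the io back to dec_pending(), which errors or requeues it.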
1343 */ 1344 atomic_inc(&tio->io->io_count); 1345 sector = clone->bi_iter.bi_sector; 1346 r = ti->type->map(ti, clone); 1347 if (r == DM_MAPIO_REMAPPED) { 1348 /* the bio has been remapped so dispatch it */ 1349 1350 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1351 tio->io->bio->bi_bdev->bd_dev, sector); 1352 1353 generic_make_request(clone); 1354 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1355 /* error the io and bail out, or requeue it if needed */ 1356 md = tio->io->md; 1357 dec_pending(tio->io, r); 1358 free_tio(md, tio); 1359 } else if (r) { 1360 DMWARN("unimplemented target map return value: %d", r); 1361 BUG(); 1362 } 1363 } 1364 1365 struct clone_info { 1366 struct mapped_device *md; 1367 struct dm_table *map; 1368 struct bio *bio; 1369 struct dm_io *io; 1370 sector_t sector; 1371 unsigned sector_count; 1372 }; 1373 1374 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) 1375 { 1376 bio->bi_iter.bi_sector = sector; 1377 bio->bi_iter.bi_size = to_bytes(len); 1378 } 1379 1380 /* 1381 * Creates a bio that consists of range of complete bvecs. 1382 */ 1383 static void clone_bio(struct dm_target_io *tio, struct bio *bio, 1384 sector_t sector, unsigned len) 1385 { 1386 struct bio *clone = &tio->clone; 1387 1388 __bio_clone_fast(clone, bio); 1389 1390 if (bio_integrity(bio)) 1391 bio_integrity_clone(clone, bio, GFP_NOIO); 1392 1393 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); 1394 clone->bi_iter.bi_size = to_bytes(len); 1395 1396 if (bio_integrity(bio)) 1397 bio_integrity_trim(clone, 0, len); 1398 } 1399 1400 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1401 struct dm_target *ti, 1402 unsigned target_bio_nr) 1403 { 1404 struct dm_target_io *tio; 1405 struct bio *clone; 1406 1407 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); 1408 tio = container_of(clone, struct dm_target_io, clone); 1409 1410 tio->io = ci->io; 1411 tio->ti = ti; 1412 tio->target_bio_nr = target_bio_nr; 1413 1414 return tio; 1415 } 1416 1417 static void __clone_and_map_simple_bio(struct clone_info *ci, 1418 struct dm_target *ti, 1419 unsigned target_bio_nr, unsigned *len) 1420 { 1421 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); 1422 struct bio *clone = &tio->clone; 1423 1424 tio->len_ptr = len; 1425 1426 __bio_clone_fast(clone, ci->bio); 1427 if (len) 1428 bio_setup_sector(clone, ci->sector, *len); 1429 1430 __map_bio(tio); 1431 } 1432 1433 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, 1434 unsigned num_bios, unsigned *len) 1435 { 1436 unsigned target_bio_nr; 1437 1438 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) 1439 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); 1440 } 1441 1442 static int __send_empty_flush(struct clone_info *ci) 1443 { 1444 unsigned target_nr = 0; 1445 struct dm_target *ti; 1446 1447 BUG_ON(bio_has_data(ci->bio)); 1448 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1449 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); 1450 1451 return 0; 1452 } 1453 1454 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, 1455 sector_t sector, unsigned *len) 1456 { 1457 struct bio *bio = ci->bio; 1458 struct dm_target_io *tio; 1459 unsigned target_bio_nr; 1460 unsigned num_target_bios = 1; 1461 1462 /* 1463 * Does the target want to receive duplicate copies of the bio? 
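 * (a target's num_write_bios hook may ask for several clones of a WRITE;
 * each copy gets its own dm_target_io, told apart by target_bio_nr)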
1464 */ 1465 if (bio_data_dir(bio) == WRITE && ti->num_write_bios) 1466 num_target_bios = ti->num_write_bios(ti, bio); 1467 1468 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { 1469 tio = alloc_tio(ci, ti, target_bio_nr); 1470 tio->len_ptr = len; 1471 clone_bio(tio, bio, sector, *len); 1472 __map_bio(tio); 1473 } 1474 } 1475 1476 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); 1477 1478 static unsigned get_num_discard_bios(struct dm_target *ti) 1479 { 1480 return ti->num_discard_bios; 1481 } 1482 1483 static unsigned get_num_write_same_bios(struct dm_target *ti) 1484 { 1485 return ti->num_write_same_bios; 1486 } 1487 1488 typedef bool (*is_split_required_fn)(struct dm_target *ti); 1489 1490 static bool is_split_required_for_discard(struct dm_target *ti) 1491 { 1492 return ti->split_discard_bios; 1493 } 1494 1495 static int __send_changing_extent_only(struct clone_info *ci, 1496 get_num_bios_fn get_num_bios, 1497 is_split_required_fn is_split_required) 1498 { 1499 struct dm_target *ti; 1500 unsigned len; 1501 unsigned num_bios; 1502 1503 do { 1504 ti = dm_table_find_target(ci->map, ci->sector); 1505 if (!dm_target_is_valid(ti)) 1506 return -EIO; 1507 1508 /* 1509 * Even though the device advertised support for this type of 1510 * request, that does not mean every target supports it, and 1511 * reconfiguration might also have changed that since the 1512 * check was performed. 1513 */ 1514 num_bios = get_num_bios ? get_num_bios(ti) : 0; 1515 if (!num_bios) 1516 return -EOPNOTSUPP; 1517 1518 if (is_split_required && !is_split_required(ti)) 1519 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1520 else 1521 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); 1522 1523 __send_duplicate_bios(ci, ti, num_bios, &len); 1524 1525 ci->sector += len; 1526 } while (ci->sector_count -= len); 1527 1528 return 0; 1529 } 1530 1531 static int __send_discard(struct clone_info *ci) 1532 { 1533 return __send_changing_extent_only(ci, get_num_discard_bios, 1534 is_split_required_for_discard); 1535 } 1536 1537 static int __send_write_same(struct clone_info *ci) 1538 { 1539 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); 1540 } 1541 1542 /* 1543 * Select the correct strategy for processing a non-flush bio. 1544 */ 1545 static int __split_and_process_non_flush(struct clone_info *ci) 1546 { 1547 struct bio *bio = ci->bio; 1548 struct dm_target *ti; 1549 unsigned len; 1550 1551 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1552 return __send_discard(ci); 1553 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) 1554 return __send_write_same(ci); 1555 1556 ti = dm_table_find_target(ci->map, ci->sector); 1557 if (!dm_target_is_valid(ti)) 1558 return -EIO; 1559 1560 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); 1561 1562 __clone_and_map_data_bio(ci, ti, ci->sector, &len); 1563 1564 ci->sector += len; 1565 ci->sector_count -= len; 1566 1567 return 0; 1568 } 1569 1570 /* 1571 * Entry point to split a bio into clones and submit them to the targets. 
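 *
 * In outline: a dm_io is allocated to track the original bio and its
 * io_count starts at 1 as an extra reference; REQ_FLUSH bios are handled
 * by cloning md->flush_bio per target with a zero sector_count; all other
 * bios are carved up target by target until sector_count is drained; the
 * final dec_pending() drops the extra reference, so the original bio only
 * completes once every clone has.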
1572 */ 1573 static void __split_and_process_bio(struct mapped_device *md, 1574 struct dm_table *map, struct bio *bio) 1575 { 1576 struct clone_info ci; 1577 int error = 0; 1578 1579 if (unlikely(!map)) { 1580 bio_io_error(bio); 1581 return; 1582 } 1583 1584 ci.map = map; 1585 ci.md = md; 1586 ci.io = alloc_io(md); 1587 ci.io->error = 0; 1588 atomic_set(&ci.io->io_count, 1); 1589 ci.io->bio = bio; 1590 ci.io->md = md; 1591 spin_lock_init(&ci.io->endio_lock); 1592 ci.sector = bio->bi_iter.bi_sector; 1593 1594 start_io_acct(ci.io); 1595 1596 if (bio->bi_rw & REQ_FLUSH) { 1597 ci.bio = &ci.md->flush_bio; 1598 ci.sector_count = 0; 1599 error = __send_empty_flush(&ci); 1600 /* dec_pending submits any data associated with flush */ 1601 } else { 1602 ci.bio = bio; 1603 ci.sector_count = bio_sectors(bio); 1604 while (ci.sector_count && !error) 1605 error = __split_and_process_non_flush(&ci); 1606 } 1607 1608 /* drop the extra reference count */ 1609 dec_pending(ci.io, error); 1610 } 1611 /*----------------------------------------------------------------- 1612 * CRUD END 1613 *---------------------------------------------------------------*/ 1614 1615 static int dm_merge_bvec(struct request_queue *q, 1616 struct bvec_merge_data *bvm, 1617 struct bio_vec *biovec) 1618 { 1619 struct mapped_device *md = q->queuedata; 1620 struct dm_table *map = dm_get_live_table_fast(md); 1621 struct dm_target *ti; 1622 sector_t max_sectors; 1623 int max_size = 0; 1624 1625 if (unlikely(!map)) 1626 goto out; 1627 1628 ti = dm_table_find_target(map, bvm->bi_sector); 1629 if (!dm_target_is_valid(ti)) 1630 goto out; 1631 1632 /* 1633 * Find maximum amount of I/O that won't need splitting 1634 */ 1635 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1636 (sector_t) queue_max_sectors(q)); 1637 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1638 if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ 1639 max_size = 0; 1640 1641 /* 1642 * merge_bvec_fn() returns number of bytes 1643 * it can accept at this offset 1644 * max is precomputed maximal io size 1645 */ 1646 if (max_size && ti->type->merge) 1647 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1648 /* 1649 * If the target doesn't support merge method and some of the devices 1650 * provided their merge_bvec method (we know this by looking for the 1651 * max_hw_sectors that dm_set_device_limits may set), then we can't 1652 * allow bios with multiple vector entries. So always set max_size 1653 * to 0, and the code below allows just one page. 1654 */ 1655 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1656 max_size = 0; 1657 1658 out: 1659 dm_put_live_table_fast(md); 1660 /* 1661 * Always allow an entire first page 1662 */ 1663 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1664 max_size = biovec->bv_len; 1665 1666 return max_size; 1667 } 1668 1669 /* 1670 * The request function that just remaps the bio built up by 1671 * dm_merge_bvec. 
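 *
 * While the device is suspended (DMF_BLOCK_IO_FOR_SUSPEND), readahead
 * bios are failed immediately and everything else is parked on
 * md->deferred via queue_io() to be resubmitted after resume.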
1672 */ 1673 static void _dm_request(struct request_queue *q, struct bio *bio) 1674 { 1675 int rw = bio_data_dir(bio); 1676 struct mapped_device *md = q->queuedata; 1677 int srcu_idx; 1678 struct dm_table *map; 1679 1680 map = dm_get_live_table(md, &srcu_idx); 1681 1682 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); 1683 1684 /* if we're suspended, we have to queue this io for later */ 1685 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1686 dm_put_live_table(md, srcu_idx); 1687 1688 if (bio_rw(bio) != READA) 1689 queue_io(md, bio); 1690 else 1691 bio_io_error(bio); 1692 return; 1693 } 1694 1695 __split_and_process_bio(md, map, bio); 1696 dm_put_live_table(md, srcu_idx); 1697 return; 1698 } 1699 1700 int dm_request_based(struct mapped_device *md) 1701 { 1702 return blk_queue_stackable(md->queue); 1703 } 1704 1705 static void dm_request(struct request_queue *q, struct bio *bio) 1706 { 1707 struct mapped_device *md = q->queuedata; 1708 1709 if (dm_request_based(md)) 1710 blk_queue_bio(q, bio); 1711 else 1712 _dm_request(q, bio); 1713 } 1714 1715 static void dm_dispatch_request(struct request *rq) 1716 { 1717 int r; 1718 1719 if (blk_queue_io_stat(rq->q)) 1720 rq->cmd_flags |= REQ_IO_STAT; 1721 1722 rq->start_time = jiffies; 1723 r = blk_insert_cloned_request(rq->q, rq); 1724 if (r) 1725 dm_complete_request(rq, r); 1726 } 1727 1728 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1729 void *data) 1730 { 1731 struct dm_rq_target_io *tio = data; 1732 struct dm_rq_clone_bio_info *info = 1733 container_of(bio, struct dm_rq_clone_bio_info, clone); 1734 1735 info->orig = bio_orig; 1736 info->tio = tio; 1737 bio->bi_end_io = end_clone_bio; 1738 1739 return 0; 1740 } 1741 1742 static int setup_clone(struct request *clone, struct request *rq, 1743 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1744 { 1745 int r; 1746 1747 r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, 1748 dm_rq_bio_constructor, tio); 1749 if (r) 1750 return r; 1751 1752 clone->cmd = rq->cmd; 1753 clone->cmd_len = rq->cmd_len; 1754 clone->sense = rq->sense; 1755 clone->end_io = end_clone_request; 1756 clone->end_io_data = tio; 1757 1758 tio->clone = clone; 1759 1760 return 0; 1761 } 1762 1763 static struct request *__clone_rq(struct request *rq, struct mapped_device *md, 1764 struct dm_rq_target_io *tio, gfp_t gfp_mask) 1765 { 1766 struct request *clone = alloc_clone_request(md, gfp_mask); 1767 1768 if (!clone) 1769 return NULL; 1770 1771 blk_rq_init(NULL, clone); 1772 if (setup_clone(clone, rq, tio, gfp_mask)) { 1773 /* -ENOMEM */ 1774 free_clone_request(md, clone); 1775 return NULL; 1776 } 1777 1778 return clone; 1779 } 1780 1781 static void map_tio_request(struct kthread_work *work); 1782 1783 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1784 gfp_t gfp_mask) 1785 { 1786 struct request *clone; 1787 struct dm_rq_target_io *tio; 1788 1789 tio = alloc_rq_tio(md, gfp_mask); 1790 if (!tio) 1791 return NULL; 1792 1793 tio->md = md; 1794 tio->ti = NULL; 1795 tio->clone = NULL; 1796 tio->orig = rq; 1797 tio->error = 0; 1798 memset(&tio->info, 0, sizeof(tio->info)); 1799 init_kthread_work(&tio->work, map_tio_request); 1800 1801 clone = __clone_rq(rq, md, tio, GFP_ATOMIC); 1802 if (!clone) { 1803 free_rq_tio(tio); 1804 return NULL; 1805 } 1806 1807 return clone; 1808 } 1809 1810 /* 1811 * Called with the queue lock held. 
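 *
 * dm_prep_fn() allocates the clone and its dm_rq_target_io up front,
 * parks the clone in rq->special and sets REQ_DONTPREP; dm_request_fn()
 * then only has to look up the target and queue the tio's work on
 * md->kworker, where map_tio_request() calls map_request().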
1812 */ 1813 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1814 { 1815 struct mapped_device *md = q->queuedata; 1816 struct request *clone; 1817 1818 if (unlikely(rq->special)) { 1819 DMWARN("Already has something in rq->special."); 1820 return BLKPREP_KILL; 1821 } 1822 1823 clone = clone_rq(rq, md, GFP_ATOMIC); 1824 if (!clone) 1825 return BLKPREP_DEFER; 1826 1827 rq->special = clone; 1828 rq->cmd_flags |= REQ_DONTPREP; 1829 1830 return BLKPREP_OK; 1831 } 1832 1833 /* 1834 * Returns: 1835 * 0 : the request has been processed (not requeued) 1836 * !0 : the request has been requeued 1837 */ 1838 static int map_request(struct dm_target *ti, struct request *clone, 1839 struct mapped_device *md) 1840 { 1841 int r, requeued = 0; 1842 struct dm_rq_target_io *tio = clone->end_io_data; 1843 1844 r = ti->type->map_rq(ti, clone, &tio->info); 1845 switch (r) { 1846 case DM_MAPIO_SUBMITTED: 1847 /* The target has taken the I/O to submit by itself later */ 1848 break; 1849 case DM_MAPIO_REMAPPED: 1850 /* The target has remapped the I/O so dispatch it */ 1851 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1852 blk_rq_pos(tio->orig)); 1853 dm_dispatch_request(clone); 1854 break; 1855 case DM_MAPIO_REQUEUE: 1856 /* The target wants to requeue the I/O */ 1857 dm_requeue_unmapped_request(clone); 1858 requeued = 1; 1859 break; 1860 default: 1861 if (r > 0) { 1862 DMWARN("unimplemented target map return value: %d", r); 1863 BUG(); 1864 } 1865 1866 /* The target wants to complete the I/O */ 1867 dm_kill_unmapped_request(clone, r); 1868 break; 1869 } 1870 1871 return requeued; 1872 } 1873 1874 static void map_tio_request(struct kthread_work *work) 1875 { 1876 struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); 1877 1878 map_request(tio->ti, tio->clone, tio->md); 1879 } 1880 1881 static struct request *dm_start_request(struct mapped_device *md, struct request *orig) 1882 { 1883 struct request *clone; 1884 1885 blk_start_request(orig); 1886 clone = orig->special; 1887 atomic_inc(&md->pending[rq_data_dir(clone)]); 1888 1889 /* 1890 * Hold the md reference here for the in-flight I/O. 1891 * We can't rely on the reference count by device opener, 1892 * because the device may be closed during the request completion 1893 * when all bios are completed. 1894 * See the comment in rq_completed() too. 1895 */ 1896 dm_get(md); 1897 1898 return clone; 1899 } 1900 1901 /* 1902 * q->request_fn for request-based dm. 1903 * Called with the queue lock held. 1904 */ 1905 static void dm_request_fn(struct request_queue *q) 1906 { 1907 struct mapped_device *md = q->queuedata; 1908 int srcu_idx; 1909 struct dm_table *map = dm_get_live_table(md, &srcu_idx); 1910 struct dm_target *ti; 1911 struct request *rq, *clone; 1912 struct dm_rq_target_io *tio; 1913 sector_t pos; 1914 1915 /* 1916 * For suspend, check blk_queue_stopped() and increment 1917 * ->pending within a single queue_lock not to increment the 1918 * number of in-flight I/Os after the queue is stopped in 1919 * dm_suspend(). 
1920 */ 1921 while (!blk_queue_stopped(q)) { 1922 rq = blk_peek_request(q); 1923 if (!rq) 1924 goto delay_and_out; 1925 1926 /* always use block 0 to find the target for flushes for now */ 1927 pos = 0; 1928 if (!(rq->cmd_flags & REQ_FLUSH)) 1929 pos = blk_rq_pos(rq); 1930 1931 ti = dm_table_find_target(map, pos); 1932 if (!dm_target_is_valid(ti)) { 1933 /* 1934 * Must perform setup, that dm_done() requires, 1935 * before calling dm_kill_unmapped_request 1936 */ 1937 DMERR_LIMIT("request attempted access beyond the end of device"); 1938 clone = dm_start_request(md, rq); 1939 dm_kill_unmapped_request(clone, -EIO); 1940 continue; 1941 } 1942 1943 if (ti->type->busy && ti->type->busy(ti)) 1944 goto delay_and_out; 1945 1946 clone = dm_start_request(md, rq); 1947 1948 tio = rq->special; 1949 /* Establish tio->ti before queuing work (map_tio_request) */ 1950 tio->ti = ti; 1951 queue_kthread_work(&md->kworker, &tio->work); 1952 BUG_ON(!irqs_disabled()); 1953 } 1954 1955 goto out; 1956 1957 delay_and_out: 1958 blk_delay_queue(q, HZ / 10); 1959 out: 1960 dm_put_live_table(md, srcu_idx); 1961 } 1962 1963 int dm_underlying_device_busy(struct request_queue *q) 1964 { 1965 return blk_lld_busy(q); 1966 } 1967 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1968 1969 static int dm_lld_busy(struct request_queue *q) 1970 { 1971 int r; 1972 struct mapped_device *md = q->queuedata; 1973 struct dm_table *map = dm_get_live_table_fast(md); 1974 1975 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1976 r = 1; 1977 else 1978 r = dm_table_any_busy_target(map); 1979 1980 dm_put_live_table_fast(md); 1981 1982 return r; 1983 } 1984 1985 static int dm_any_congested(void *congested_data, int bdi_bits) 1986 { 1987 int r = bdi_bits; 1988 struct mapped_device *md = congested_data; 1989 struct dm_table *map; 1990 1991 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1992 map = dm_get_live_table_fast(md); 1993 if (map) { 1994 /* 1995 * Request-based dm cares about only own queue for 1996 * the query about congestion status of request_queue 1997 */ 1998 if (dm_request_based(md)) 1999 r = md->queue->backing_dev_info.state & 2000 bdi_bits; 2001 else 2002 r = dm_table_any_congested(map, bdi_bits); 2003 } 2004 dm_put_live_table_fast(md); 2005 } 2006 2007 return r; 2008 } 2009 2010 /*----------------------------------------------------------------- 2011 * An IDR is used to keep track of allocated minor numbers. 2012 *---------------------------------------------------------------*/ 2013 static void free_minor(int minor) 2014 { 2015 spin_lock(&_minor_lock); 2016 idr_remove(&_minor_idr, minor); 2017 spin_unlock(&_minor_lock); 2018 } 2019 2020 /* 2021 * See if the device with a specific minor # is free. 2022 */ 2023 static int specific_minor(int minor) 2024 { 2025 int r; 2026 2027 if (minor >= (1 << MINORBITS)) 2028 return -EINVAL; 2029 2030 idr_preload(GFP_KERNEL); 2031 spin_lock(&_minor_lock); 2032 2033 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); 2034 2035 spin_unlock(&_minor_lock); 2036 idr_preload_end(); 2037 if (r < 0) 2038 return r == -ENOSPC ? 
-EBUSY : r; 2039 return 0; 2040 } 2041 2042 static int next_free_minor(int *minor) 2043 { 2044 int r; 2045 2046 idr_preload(GFP_KERNEL); 2047 spin_lock(&_minor_lock); 2048 2049 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); 2050 2051 spin_unlock(&_minor_lock); 2052 idr_preload_end(); 2053 if (r < 0) 2054 return r; 2055 *minor = r; 2056 return 0; 2057 } 2058 2059 static const struct block_device_operations dm_blk_dops; 2060 2061 static void dm_wq_work(struct work_struct *work); 2062 2063 static void dm_init_md_queue(struct mapped_device *md) 2064 { 2065 /* 2066 * Request-based dm devices cannot be stacked on top of bio-based dm 2067 * devices. The type of this dm device has not been decided yet. 2068 * The type is decided at the first table loading time. 2069 * To prevent problematic device stacking, clear the queue flag 2070 * for request stacking support until then. 2071 * 2072 * This queue is new, so no concurrency on the queue_flags. 2073 */ 2074 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 2075 2076 md->queue->queuedata = md; 2077 md->queue->backing_dev_info.congested_fn = dm_any_congested; 2078 md->queue->backing_dev_info.congested_data = md; 2079 blk_queue_make_request(md->queue, dm_request); 2080 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 2081 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 2082 } 2083 2084 /* 2085 * Allocate and initialise a blank device with a given minor. 2086 */ 2087 static struct mapped_device *alloc_dev(int minor) 2088 { 2089 int r; 2090 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 2091 void *old_md; 2092 2093 if (!md) { 2094 DMWARN("unable to allocate device, out of memory."); 2095 return NULL; 2096 } 2097 2098 if (!try_module_get(THIS_MODULE)) 2099 goto bad_module_get; 2100 2101 /* get a minor number for the dev */ 2102 if (minor == DM_ANY_MINOR) 2103 r = next_free_minor(&minor); 2104 else 2105 r = specific_minor(minor); 2106 if (r < 0) 2107 goto bad_minor; 2108 2109 r = init_srcu_struct(&md->io_barrier); 2110 if (r < 0) 2111 goto bad_io_barrier; 2112 2113 md->type = DM_TYPE_NONE; 2114 mutex_init(&md->suspend_lock); 2115 mutex_init(&md->type_lock); 2116 mutex_init(&md->table_devices_lock); 2117 spin_lock_init(&md->deferred_lock); 2118 atomic_set(&md->holders, 1); 2119 atomic_set(&md->open_count, 0); 2120 atomic_set(&md->event_nr, 0); 2121 atomic_set(&md->uevent_seq, 0); 2122 INIT_LIST_HEAD(&md->uevent_list); 2123 INIT_LIST_HEAD(&md->table_devices); 2124 spin_lock_init(&md->uevent_lock); 2125 2126 md->queue = blk_alloc_queue(GFP_KERNEL); 2127 if (!md->queue) 2128 goto bad_queue; 2129 2130 dm_init_md_queue(md); 2131 2132 md->disk = alloc_disk(1); 2133 if (!md->disk) 2134 goto bad_disk; 2135 2136 atomic_set(&md->pending[0], 0); 2137 atomic_set(&md->pending[1], 0); 2138 init_waitqueue_head(&md->wait); 2139 INIT_WORK(&md->work, dm_wq_work); 2140 init_waitqueue_head(&md->eventq); 2141 init_completion(&md->kobj_holder.completion); 2142 md->kworker_task = NULL; 2143 2144 md->disk->major = _major; 2145 md->disk->first_minor = minor; 2146 md->disk->fops = &dm_blk_dops; 2147 md->disk->queue = md->queue; 2148 md->disk->private_data = md; 2149 sprintf(md->disk->disk_name, "dm-%d", minor); 2150 add_disk(md->disk); 2151 format_dev_t(md->name, MKDEV(_major, minor)); 2152 2153 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); 2154 if (!md->wq) 2155 goto bad_thread; 2156 2157 md->bdev = bdget_disk(md->disk, 0); 2158 if (!md->bdev) 2159 goto bad_bdev; 2160 2161 bio_init(&md->flush_bio); 2162 
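	/*
	 * This zero-length flush_bio is only ever used as a template:
	 * __split_and_process_bio() clones it for REQ_FLUSH handling and
	 * never submits it directly.
	 */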
md->flush_bio.bi_bdev = md->bdev; 2163 md->flush_bio.bi_rw = WRITE_FLUSH; 2164 2165 dm_stats_init(&md->stats); 2166 2167 /* Populate the mapping, nobody knows we exist yet */ 2168 spin_lock(&_minor_lock); 2169 old_md = idr_replace(&_minor_idr, md, minor); 2170 spin_unlock(&_minor_lock); 2171 2172 BUG_ON(old_md != MINOR_ALLOCED); 2173 2174 return md; 2175 2176 bad_bdev: 2177 destroy_workqueue(md->wq); 2178 bad_thread: 2179 del_gendisk(md->disk); 2180 put_disk(md->disk); 2181 bad_disk: 2182 blk_cleanup_queue(md->queue); 2183 bad_queue: 2184 cleanup_srcu_struct(&md->io_barrier); 2185 bad_io_barrier: 2186 free_minor(minor); 2187 bad_minor: 2188 module_put(THIS_MODULE); 2189 bad_module_get: 2190 kfree(md); 2191 return NULL; 2192 } 2193 2194 static void unlock_fs(struct mapped_device *md); 2195 2196 static void free_dev(struct mapped_device *md) 2197 { 2198 int minor = MINOR(disk_devt(md->disk)); 2199 2200 unlock_fs(md); 2201 bdput(md->bdev); 2202 destroy_workqueue(md->wq); 2203 2204 if (md->kworker_task) 2205 kthread_stop(md->kworker_task); 2206 if (md->io_pool) 2207 mempool_destroy(md->io_pool); 2208 if (md->rq_pool) 2209 mempool_destroy(md->rq_pool); 2210 if (md->bs) 2211 bioset_free(md->bs); 2212 blk_integrity_unregister(md->disk); 2213 del_gendisk(md->disk); 2214 cleanup_srcu_struct(&md->io_barrier); 2215 free_table_devices(&md->table_devices); 2216 free_minor(minor); 2217 2218 spin_lock(&_minor_lock); 2219 md->disk->private_data = NULL; 2220 spin_unlock(&_minor_lock); 2221 2222 put_disk(md->disk); 2223 blk_cleanup_queue(md->queue); 2224 dm_stats_cleanup(&md->stats); 2225 module_put(THIS_MODULE); 2226 kfree(md); 2227 } 2228 2229 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2230 { 2231 struct dm_md_mempools *p = dm_table_get_md_mempools(t); 2232 2233 if (md->io_pool && md->bs) { 2234 /* The md already has necessary mempools. */ 2235 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { 2236 /* 2237 * Reload bioset because front_pad may have changed 2238 * because a different table was loaded. 2239 */ 2240 bioset_free(md->bs); 2241 md->bs = p->bs; 2242 p->bs = NULL; 2243 } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { 2244 /* 2245 * There's no need to reload with request-based dm 2246 * because the size of front_pad doesn't change. 2247 * Note for future: If you are to reload bioset, 2248 * prep-ed requests in the queue may refer 2249 * to bio from the old bioset, so you must walk 2250 * through the queue to unprep. 2251 */ 2252 } 2253 goto out; 2254 } 2255 2256 BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); 2257 2258 md->io_pool = p->io_pool; 2259 p->io_pool = NULL; 2260 md->rq_pool = p->rq_pool; 2261 p->rq_pool = NULL; 2262 md->bs = p->bs; 2263 p->bs = NULL; 2264 2265 out: 2266 /* mempool bind completed, now no need any mempools in the table */ 2267 dm_table_free_md_mempools(t); 2268 } 2269 2270 /* 2271 * Bind a table to the device. 2272 */ 2273 static void event_callback(void *context) 2274 { 2275 unsigned long flags; 2276 LIST_HEAD(uevents); 2277 struct mapped_device *md = (struct mapped_device *) context; 2278 2279 spin_lock_irqsave(&md->uevent_lock, flags); 2280 list_splice_init(&md->uevent_list, &uevents); 2281 spin_unlock_irqrestore(&md->uevent_lock, flags); 2282 2283 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2284 2285 atomic_inc(&md->event_nr); 2286 wake_up(&md->eventq); 2287 } 2288 2289 /* 2290 * Protected by md->suspend_lock obtained by dm_swap_table(). 
2291 */ 2292 static void __set_size(struct mapped_device *md, sector_t size) 2293 { 2294 set_capacity(md->disk, size); 2295 2296 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2297 } 2298 2299 /* 2300 * Return 1 if the queue has a compulsory merge_bvec_fn function. 2301 * 2302 * If this function returns 0, then the device is either a non-dm 2303 * device without a merge_bvec_fn, or it is a dm device that is 2304 * able to split any bios it receives that are too big. 2305 */ 2306 int dm_queue_merge_is_compulsory(struct request_queue *q) 2307 { 2308 struct mapped_device *dev_md; 2309 2310 if (!q->merge_bvec_fn) 2311 return 0; 2312 2313 if (q->make_request_fn == dm_request) { 2314 dev_md = q->queuedata; 2315 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2316 return 0; 2317 } 2318 2319 return 1; 2320 } 2321 2322 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2323 struct dm_dev *dev, sector_t start, 2324 sector_t len, void *data) 2325 { 2326 struct block_device *bdev = dev->bdev; 2327 struct request_queue *q = bdev_get_queue(bdev); 2328 2329 return dm_queue_merge_is_compulsory(q); 2330 } 2331 2332 /* 2333 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2334 * on the properties of the underlying devices. 2335 */ 2336 static int dm_table_merge_is_optional(struct dm_table *table) 2337 { 2338 unsigned i = 0; 2339 struct dm_target *ti; 2340 2341 while (i < dm_table_get_num_targets(table)) { 2342 ti = dm_table_get_target(table, i++); 2343 2344 if (ti->type->iterate_devices && 2345 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2346 return 0; 2347 } 2348 2349 return 1; 2350 } 2351 2352 /* 2353 * Returns old map, which caller must destroy. 2354 */ 2355 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2356 struct queue_limits *limits) 2357 { 2358 struct dm_table *old_map; 2359 struct request_queue *q = md->queue; 2360 sector_t size; 2361 int merge_is_optional; 2362 2363 size = dm_table_get_size(t); 2364 2365 /* 2366 * Wipe any geometry if the size of the table changed. 2367 */ 2368 if (size != dm_get_size(md)) 2369 memset(&md->geometry, 0, sizeof(md->geometry)); 2370 2371 __set_size(md, size); 2372 2373 dm_table_event_callback(t, event_callback, md); 2374 2375 /* 2376 * The queue hasn't been stopped yet, if the old table type wasn't 2377 * for request-based during suspension. So stop it to prevent 2378 * I/O mapping before resume. 2379 * This must be done before setting the queue restrictions, 2380 * because request-based dm may be run just after the setting. 2381 */ 2382 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2383 stop_queue(q); 2384 2385 __bind_mempools(md, t); 2386 2387 merge_is_optional = dm_table_merge_is_optional(t); 2388 2389 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2390 rcu_assign_pointer(md->map, t); 2391 md->immutable_target_type = dm_table_get_immutable_target_type(t); 2392 2393 dm_table_set_restrictions(t, q, limits); 2394 if (merge_is_optional) 2395 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2396 else 2397 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2398 if (old_map) 2399 dm_sync_table(md); 2400 2401 return old_map; 2402 } 2403 2404 /* 2405 * Returns unbound table for the caller to free. 
2406 */ 2407 static struct dm_table *__unbind(struct mapped_device *md) 2408 { 2409 struct dm_table *map = rcu_dereference_protected(md->map, 1); 2410 2411 if (!map) 2412 return NULL; 2413 2414 dm_table_event_callback(map, NULL, NULL); 2415 RCU_INIT_POINTER(md->map, NULL); 2416 dm_sync_table(md); 2417 2418 return map; 2419 } 2420 2421 /* 2422 * Constructor for a new device. 2423 */ 2424 int dm_create(int minor, struct mapped_device **result) 2425 { 2426 struct mapped_device *md; 2427 2428 md = alloc_dev(minor); 2429 if (!md) 2430 return -ENXIO; 2431 2432 dm_sysfs_init(md); 2433 2434 *result = md; 2435 return 0; 2436 } 2437 2438 /* 2439 * Functions to manage md->type. 2440 * All are required to hold md->type_lock. 2441 */ 2442 void dm_lock_md_type(struct mapped_device *md) 2443 { 2444 mutex_lock(&md->type_lock); 2445 } 2446 2447 void dm_unlock_md_type(struct mapped_device *md) 2448 { 2449 mutex_unlock(&md->type_lock); 2450 } 2451 2452 void dm_set_md_type(struct mapped_device *md, unsigned type) 2453 { 2454 BUG_ON(!mutex_is_locked(&md->type_lock)); 2455 md->type = type; 2456 } 2457 2458 unsigned dm_get_md_type(struct mapped_device *md) 2459 { 2460 BUG_ON(!mutex_is_locked(&md->type_lock)); 2461 return md->type; 2462 } 2463 2464 struct target_type *dm_get_immutable_target_type(struct mapped_device *md) 2465 { 2466 return md->immutable_target_type; 2467 } 2468 2469 /* 2470 * The queue_limits are only valid as long as you have a reference 2471 * count on 'md'. 2472 */ 2473 struct queue_limits *dm_get_queue_limits(struct mapped_device *md) 2474 { 2475 BUG_ON(!atomic_read(&md->holders)); 2476 return &md->queue->limits; 2477 } 2478 EXPORT_SYMBOL_GPL(dm_get_queue_limits); 2479 2480 /* 2481 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
2482 */ 2483 static int dm_init_request_based_queue(struct mapped_device *md) 2484 { 2485 struct request_queue *q = NULL; 2486 2487 if (md->queue->elevator) 2488 return 1; 2489 2490 /* Fully initialize the queue */ 2491 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2492 if (!q) 2493 return 0; 2494 2495 md->queue = q; 2496 dm_init_md_queue(md); 2497 blk_queue_softirq_done(md->queue, dm_softirq_done); 2498 blk_queue_prep_rq(md->queue, dm_prep_fn); 2499 blk_queue_lld_busy(md->queue, dm_lld_busy); 2500 2501 /* Also initialize the request-based DM worker thread */ 2502 init_kthread_worker(&md->kworker); 2503 md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, 2504 "kdmwork-%s", dm_device_name(md)); 2505 2506 elv_register_queue(md->queue); 2507 2508 return 1; 2509 } 2510 2511 /* 2512 * Setup the DM device's queue based on md's type 2513 */ 2514 int dm_setup_md_queue(struct mapped_device *md) 2515 { 2516 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2517 !dm_init_request_based_queue(md)) { 2518 DMWARN("Cannot initialize queue for request-based mapped device"); 2519 return -EINVAL; 2520 } 2521 2522 return 0; 2523 } 2524 2525 static struct mapped_device *dm_find_md(dev_t dev) 2526 { 2527 struct mapped_device *md; 2528 unsigned minor = MINOR(dev); 2529 2530 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2531 return NULL; 2532 2533 spin_lock(&_minor_lock); 2534 2535 md = idr_find(&_minor_idr, minor); 2536 if (md && (md == MINOR_ALLOCED || 2537 (MINOR(disk_devt(dm_disk(md))) != minor) || 2538 dm_deleting_md(md) || 2539 test_bit(DMF_FREEING, &md->flags))) { 2540 md = NULL; 2541 goto out; 2542 } 2543 2544 out: 2545 spin_unlock(&_minor_lock); 2546 2547 return md; 2548 } 2549 2550 struct mapped_device *dm_get_md(dev_t dev) 2551 { 2552 struct mapped_device *md = dm_find_md(dev); 2553 2554 if (md) 2555 dm_get(md); 2556 2557 return md; 2558 } 2559 EXPORT_SYMBOL_GPL(dm_get_md); 2560 2561 void *dm_get_mdptr(struct mapped_device *md) 2562 { 2563 return md->interface_ptr; 2564 } 2565 2566 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2567 { 2568 md->interface_ptr = ptr; 2569 } 2570 2571 void dm_get(struct mapped_device *md) 2572 { 2573 atomic_inc(&md->holders); 2574 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2575 } 2576 2577 const char *dm_device_name(struct mapped_device *md) 2578 { 2579 return md->name; 2580 } 2581 EXPORT_SYMBOL_GPL(dm_device_name); 2582 2583 static void __dm_destroy(struct mapped_device *md, bool wait) 2584 { 2585 struct dm_table *map; 2586 int srcu_idx; 2587 2588 might_sleep(); 2589 2590 spin_lock(&_minor_lock); 2591 map = dm_get_live_table(md, &srcu_idx); 2592 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2593 set_bit(DMF_FREEING, &md->flags); 2594 spin_unlock(&_minor_lock); 2595 2596 if (dm_request_based(md)) 2597 flush_kthread_worker(&md->kworker); 2598 2599 if (!dm_suspended_md(md)) { 2600 dm_table_presuspend_targets(map); 2601 dm_table_postsuspend_targets(map); 2602 } 2603 2604 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ 2605 dm_put_live_table(md, srcu_idx); 2606 2607 /* 2608 * Rare, but there may be I/O requests still going to complete, 2609 * for example. Wait for all references to disappear. 2610 * No one should increment the reference count of the mapped_device, 2611 * after the mapped_device state becomes DMF_FREEING. 
2612 */ 2613 if (wait) 2614 while (atomic_read(&md->holders)) 2615 msleep(1); 2616 else if (atomic_read(&md->holders)) 2617 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2618 dm_device_name(md), atomic_read(&md->holders)); 2619 2620 dm_sysfs_exit(md); 2621 dm_table_destroy(__unbind(md)); 2622 free_dev(md); 2623 } 2624 2625 void dm_destroy(struct mapped_device *md) 2626 { 2627 __dm_destroy(md, true); 2628 } 2629 2630 void dm_destroy_immediate(struct mapped_device *md) 2631 { 2632 __dm_destroy(md, false); 2633 } 2634 2635 void dm_put(struct mapped_device *md) 2636 { 2637 atomic_dec(&md->holders); 2638 } 2639 EXPORT_SYMBOL_GPL(dm_put); 2640 2641 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2642 { 2643 int r = 0; 2644 DECLARE_WAITQUEUE(wait, current); 2645 2646 add_wait_queue(&md->wait, &wait); 2647 2648 while (1) { 2649 set_current_state(interruptible); 2650 2651 if (!md_in_flight(md)) 2652 break; 2653 2654 if (interruptible == TASK_INTERRUPTIBLE && 2655 signal_pending(current)) { 2656 r = -EINTR; 2657 break; 2658 } 2659 2660 io_schedule(); 2661 } 2662 set_current_state(TASK_RUNNING); 2663 2664 remove_wait_queue(&md->wait, &wait); 2665 2666 return r; 2667 } 2668 2669 /* 2670 * Process the deferred bios 2671 */ 2672 static void dm_wq_work(struct work_struct *work) 2673 { 2674 struct mapped_device *md = container_of(work, struct mapped_device, 2675 work); 2676 struct bio *c; 2677 int srcu_idx; 2678 struct dm_table *map; 2679 2680 map = dm_get_live_table(md, &srcu_idx); 2681 2682 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2683 spin_lock_irq(&md->deferred_lock); 2684 c = bio_list_pop(&md->deferred); 2685 spin_unlock_irq(&md->deferred_lock); 2686 2687 if (!c) 2688 break; 2689 2690 if (dm_request_based(md)) 2691 generic_make_request(c); 2692 else 2693 __split_and_process_bio(md, map, c); 2694 } 2695 2696 dm_put_live_table(md, srcu_idx); 2697 } 2698 2699 static void dm_queue_flush(struct mapped_device *md) 2700 { 2701 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2702 smp_mb__after_atomic(); 2703 queue_work(md->wq, &md->work); 2704 } 2705 2706 /* 2707 * Swap in a new table, returning the old one for the caller to destroy. 2708 */ 2709 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2710 { 2711 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); 2712 struct queue_limits limits; 2713 int r; 2714 2715 mutex_lock(&md->suspend_lock); 2716 2717 /* device must be suspended */ 2718 if (!dm_suspended_md(md)) 2719 goto out; 2720 2721 /* 2722 * If the new table has no data devices, retain the existing limits. 2723 * This helps multipath with queue_if_no_path if all paths disappear, 2724 * then new I/O is queued based on these limits, and then some paths 2725 * reappear. 2726 */ 2727 if (dm_table_has_no_data_devices(table)) { 2728 live_map = dm_get_live_table_fast(md); 2729 if (live_map) 2730 limits = md->queue->limits; 2731 dm_put_live_table_fast(md); 2732 } 2733 2734 if (!live_map) { 2735 r = dm_calculate_queue_limits(table, &limits); 2736 if (r) { 2737 map = ERR_PTR(r); 2738 goto out; 2739 } 2740 } 2741 2742 map = __bind(md, table, &limits); 2743 2744 out: 2745 mutex_unlock(&md->suspend_lock); 2746 return map; 2747 } 2748 2749 /* 2750 * Functions to lock and unlock any filesystem running on the 2751 * device. 
2752 */ 2753 static int lock_fs(struct mapped_device *md) 2754 { 2755 int r; 2756 2757 WARN_ON(md->frozen_sb); 2758 2759 md->frozen_sb = freeze_bdev(md->bdev); 2760 if (IS_ERR(md->frozen_sb)) { 2761 r = PTR_ERR(md->frozen_sb); 2762 md->frozen_sb = NULL; 2763 return r; 2764 } 2765 2766 set_bit(DMF_FROZEN, &md->flags); 2767 2768 return 0; 2769 } 2770 2771 static void unlock_fs(struct mapped_device *md) 2772 { 2773 if (!test_bit(DMF_FROZEN, &md->flags)) 2774 return; 2775 2776 thaw_bdev(md->bdev, md->frozen_sb); 2777 md->frozen_sb = NULL; 2778 clear_bit(DMF_FROZEN, &md->flags); 2779 } 2780 2781 /* 2782 * If __dm_suspend returns 0, the device is completely quiescent 2783 * now. There is no request-processing activity. All new requests 2784 * are being added to md->deferred list. 2785 * 2786 * Caller must hold md->suspend_lock 2787 */ 2788 static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2789 unsigned suspend_flags, int interruptible) 2790 { 2791 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2792 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2793 int r; 2794 2795 /* 2796 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2797 * This flag is cleared before dm_suspend returns. 2798 */ 2799 if (noflush) 2800 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2801 2802 /* 2803 * This gets reverted if there's an error later and the targets 2804 * provide the .presuspend_undo hook. 2805 */ 2806 dm_table_presuspend_targets(map); 2807 2808 /* 2809 * Flush I/O to the device. 2810 * Any I/O submitted after lock_fs() may not be flushed. 2811 * noflush takes precedence over do_lockfs. 2812 * (lock_fs() flushes I/Os and waits for them to complete.) 2813 */ 2814 if (!noflush && do_lockfs) { 2815 r = lock_fs(md); 2816 if (r) { 2817 dm_table_presuspend_undo_targets(map); 2818 return r; 2819 } 2820 } 2821 2822 /* 2823 * Here we must make sure that no processes are submitting requests 2824 * to target drivers i.e. no one may be executing 2825 * __split_and_process_bio. This is called from dm_request and 2826 * dm_wq_work. 2827 * 2828 * To get all processes out of __split_and_process_bio in dm_request, 2829 * we take the write lock. To prevent any process from reentering 2830 * __split_and_process_bio from dm_request and quiesce the thread 2831 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call 2832 * flush_workqueue(md->wq). 2833 */ 2834 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2835 if (map) 2836 synchronize_srcu(&md->io_barrier); 2837 2838 /* 2839 * Stop md->queue before flushing md->wq in case request-based 2840 * dm defers requests to md->wq from md->queue. 2841 */ 2842 if (dm_request_based(md)) { 2843 stop_queue(md->queue); 2844 flush_kthread_worker(&md->kworker); 2845 } 2846 2847 flush_workqueue(md->wq); 2848 2849 /* 2850 * At this point no more requests are entering target request routines. 2851 * We call dm_wait_for_completion to wait for all existing requests 2852 * to finish. 2853 */ 2854 r = dm_wait_for_completion(md, interruptible); 2855 2856 if (noflush) 2857 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2858 if (map) 2859 synchronize_srcu(&md->io_barrier); 2860 2861 /* were we interrupted ? 
*/ 2862 if (r < 0) { 2863 dm_queue_flush(md); 2864 2865 if (dm_request_based(md)) 2866 start_queue(md->queue); 2867 2868 unlock_fs(md); 2869 dm_table_presuspend_undo_targets(map); 2870 /* pushback list is already flushed, so skip flush */ 2871 } 2872 2873 return r; 2874 } 2875 2876 /* 2877 * We need to be able to change a mapping table under a mounted 2878 * filesystem. For example we might want to move some data in 2879 * the background. Before the table can be swapped with 2880 * dm_bind_table, dm_suspend must be called to flush any in 2881 * flight bios and ensure that any further io gets deferred. 2882 */ 2883 /* 2884 * Suspend mechanism in request-based dm. 2885 * 2886 * 1. Flush all I/Os by lock_fs() if needed. 2887 * 2. Stop dispatching any I/O by stopping the request_queue. 2888 * 3. Wait for all in-flight I/Os to be completed or requeued. 2889 * 2890 * To abort suspend, start the request_queue. 2891 */ 2892 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2893 { 2894 struct dm_table *map = NULL; 2895 int r = 0; 2896 2897 retry: 2898 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2899 2900 if (dm_suspended_md(md)) { 2901 r = -EINVAL; 2902 goto out_unlock; 2903 } 2904 2905 if (dm_suspended_internally_md(md)) { 2906 /* already internally suspended, wait for internal resume */ 2907 mutex_unlock(&md->suspend_lock); 2908 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2909 if (r) 2910 return r; 2911 goto retry; 2912 } 2913 2914 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2915 2916 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); 2917 if (r) 2918 goto out_unlock; 2919 2920 set_bit(DMF_SUSPENDED, &md->flags); 2921 2922 dm_table_postsuspend_targets(map); 2923 2924 out_unlock: 2925 mutex_unlock(&md->suspend_lock); 2926 return r; 2927 } 2928 2929 static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2930 { 2931 if (map) { 2932 int r = dm_table_resume_targets(map); 2933 if (r) 2934 return r; 2935 } 2936 2937 dm_queue_flush(md); 2938 2939 /* 2940 * Flushing deferred I/Os must be done after targets are resumed 2941 * so that mapping of targets can work correctly. 2942 * Request-based dm is queueing the deferred I/Os in its request_queue. 2943 */ 2944 if (dm_request_based(md)) 2945 start_queue(md->queue); 2946 2947 unlock_fs(md); 2948 2949 return 0; 2950 } 2951 2952 int dm_resume(struct mapped_device *md) 2953 { 2954 int r = -EINVAL; 2955 struct dm_table *map = NULL; 2956 2957 retry: 2958 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2959 2960 if (!dm_suspended_md(md)) 2961 goto out; 2962 2963 if (dm_suspended_internally_md(md)) { 2964 /* already internally suspended, wait for internal resume */ 2965 mutex_unlock(&md->suspend_lock); 2966 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2967 if (r) 2968 return r; 2969 goto retry; 2970 } 2971 2972 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2973 if (!map || !dm_table_get_size(map)) 2974 goto out; 2975 2976 r = __dm_resume(md, map); 2977 if (r) 2978 goto out; 2979 2980 clear_bit(DMF_SUSPENDED, &md->flags); 2981 2982 r = 0; 2983 out: 2984 mutex_unlock(&md->suspend_lock); 2985 2986 return r; 2987 } 2988 2989 /* 2990 * Internal suspend/resume works like userspace-driven suspend. It waits 2991 * until all bios finish and prevents issuing new bios to the target drivers. 2992 * It may be used only from the kernel. 
2993 */ 2994 2995 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2996 { 2997 struct dm_table *map = NULL; 2998 2999 if (dm_suspended_internally_md(md)) 3000 return; /* nested internal suspend */ 3001 3002 if (dm_suspended_md(md)) { 3003 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3004 return; /* nest suspend */ 3005 } 3006 3007 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 3008 3009 /* 3010 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 3011 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 3012 * would require changing .presuspend to return an error -- avoid this 3013 * until there is a need for more elaborate variants of internal suspend. 3014 */ 3015 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); 3016 3017 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3018 3019 dm_table_postsuspend_targets(map); 3020 } 3021 3022 static void __dm_internal_resume(struct mapped_device *md) 3023 { 3024 if (!dm_suspended_internally_md(md)) 3025 return; /* resume from nested internal suspend */ 3026 3027 if (dm_suspended_md(md)) 3028 goto done; /* resume from nested suspend */ 3029 3030 /* 3031 * NOTE: existing callers don't need to call dm_table_resume_targets 3032 * (which may fail -- so best to avoid it for now by passing NULL map) 3033 */ 3034 (void) __dm_resume(md, NULL); 3035 3036 done: 3037 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3038 smp_mb__after_atomic(); 3039 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 3040 } 3041 3042 void dm_internal_suspend_noflush(struct mapped_device *md) 3043 { 3044 mutex_lock(&md->suspend_lock); 3045 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 3046 mutex_unlock(&md->suspend_lock); 3047 } 3048 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 3049 3050 void dm_internal_resume(struct mapped_device *md) 3051 { 3052 mutex_lock(&md->suspend_lock); 3053 __dm_internal_resume(md); 3054 mutex_unlock(&md->suspend_lock); 3055 } 3056 EXPORT_SYMBOL_GPL(dm_internal_resume); 3057 3058 /* 3059 * Fast variants of internal suspend/resume hold md->suspend_lock, 3060 * which prevents interaction with userspace-driven suspend. 3061 */ 3062 3063 void dm_internal_suspend_fast(struct mapped_device *md) 3064 { 3065 mutex_lock(&md->suspend_lock); 3066 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3067 return; 3068 3069 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 3070 synchronize_srcu(&md->io_barrier); 3071 flush_workqueue(md->wq); 3072 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3073 } 3074 3075 void dm_internal_resume_fast(struct mapped_device *md) 3076 { 3077 if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3078 goto done; 3079 3080 dm_queue_flush(md); 3081 3082 done: 3083 mutex_unlock(&md->suspend_lock); 3084 } 3085 3086 /*----------------------------------------------------------------- 3087 * Event notification. 
3088 *---------------------------------------------------------------*/ 3089 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 3090 unsigned cookie) 3091 { 3092 char udev_cookie[DM_COOKIE_LENGTH]; 3093 char *envp[] = { udev_cookie, NULL }; 3094 3095 if (!cookie) 3096 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 3097 else { 3098 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 3099 DM_COOKIE_ENV_VAR_NAME, cookie); 3100 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 3101 action, envp); 3102 } 3103 } 3104 3105 uint32_t dm_next_uevent_seq(struct mapped_device *md) 3106 { 3107 return atomic_add_return(1, &md->uevent_seq); 3108 } 3109 3110 uint32_t dm_get_event_nr(struct mapped_device *md) 3111 { 3112 return atomic_read(&md->event_nr); 3113 } 3114 3115 int dm_wait_event(struct mapped_device *md, int event_nr) 3116 { 3117 return wait_event_interruptible(md->eventq, 3118 (event_nr != atomic_read(&md->event_nr))); 3119 } 3120 3121 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 3122 { 3123 unsigned long flags; 3124 3125 spin_lock_irqsave(&md->uevent_lock, flags); 3126 list_add(elist, &md->uevent_list); 3127 spin_unlock_irqrestore(&md->uevent_lock, flags); 3128 } 3129 3130 /* 3131 * The gendisk is only valid as long as you have a reference 3132 * count on 'md'. 3133 */ 3134 struct gendisk *dm_disk(struct mapped_device *md) 3135 { 3136 return md->disk; 3137 } 3138 3139 struct kobject *dm_kobject(struct mapped_device *md) 3140 { 3141 return &md->kobj_holder.kobj; 3142 } 3143 3144 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 3145 { 3146 struct mapped_device *md; 3147 3148 md = container_of(kobj, struct mapped_device, kobj_holder.kobj); 3149 3150 if (test_bit(DMF_FREEING, &md->flags) || 3151 dm_deleting_md(md)) 3152 return NULL; 3153 3154 dm_get(md); 3155 return md; 3156 } 3157 3158 int dm_suspended_md(struct mapped_device *md) 3159 { 3160 return test_bit(DMF_SUSPENDED, &md->flags); 3161 } 3162 3163 int dm_suspended_internally_md(struct mapped_device *md) 3164 { 3165 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3166 } 3167 3168 int dm_test_deferred_remove_flag(struct mapped_device *md) 3169 { 3170 return test_bit(DMF_DEFERRED_REMOVE, &md->flags); 3171 } 3172 3173 int dm_suspended(struct dm_target *ti) 3174 { 3175 return dm_suspended_md(dm_table_get_md(ti->table)); 3176 } 3177 EXPORT_SYMBOL_GPL(dm_suspended); 3178 3179 int dm_noflush_suspending(struct dm_target *ti) 3180 { 3181 return __noflush_suspending(dm_table_get_md(ti->table)); 3182 } 3183 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 3184 3185 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) 3186 { 3187 struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); 3188 struct kmem_cache *cachep; 3189 unsigned int pool_size; 3190 unsigned int front_pad; 3191 3192 if (!pools) 3193 return NULL; 3194 3195 if (type == DM_TYPE_BIO_BASED) { 3196 cachep = _io_cache; 3197 pool_size = dm_get_reserved_bio_based_ios(); 3198 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); 3199 } else if (type == DM_TYPE_REQUEST_BASED) { 3200 cachep = _rq_tio_cache; 3201 pool_size = dm_get_reserved_rq_based_ios(); 3202 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); 3203 if (!pools->rq_pool) 3204 goto out; 3205 front_pad = offsetof(struct dm_rq_clone_bio_info, clone); 3206 /* per_bio_data_size is not used. See __bind_mempools(). 
*/ 3207 WARN_ON(per_bio_data_size != 0); 3208 } else 3209 goto out; 3210 3211 pools->io_pool = mempool_create_slab_pool(pool_size, cachep); 3212 if (!pools->io_pool) 3213 goto out; 3214 3215 pools->bs = bioset_create_nobvec(pool_size, front_pad); 3216 if (!pools->bs) 3217 goto out; 3218 3219 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 3220 goto out; 3221 3222 return pools; 3223 3224 out: 3225 dm_free_md_mempools(pools); 3226 3227 return NULL; 3228 } 3229 3230 void dm_free_md_mempools(struct dm_md_mempools *pools) 3231 { 3232 if (!pools) 3233 return; 3234 3235 if (pools->io_pool) 3236 mempool_destroy(pools->io_pool); 3237 3238 if (pools->rq_pool) 3239 mempool_destroy(pools->rq_pool); 3240 3241 if (pools->bs) 3242 bioset_free(pools->bs); 3243 3244 kfree(pools); 3245 } 3246 3247 static const struct block_device_operations dm_blk_dops = { 3248 .open = dm_blk_open, 3249 .release = dm_blk_close, 3250 .ioctl = dm_blk_ioctl, 3251 .getgeo = dm_blk_getgeo, 3252 .owner = THIS_MODULE 3253 }; 3254 3255 /* 3256 * module hooks 3257 */ 3258 module_init(dm_init); 3259 module_exit(dm_exit); 3260 3261 module_param(major, uint, 0); 3262 MODULE_PARM_DESC(major, "The major number of the device mapper"); 3263 3264 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); 3265 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); 3266 3267 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); 3268 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); 3269 3270 MODULE_DESCRIPTION(DM_NAME " driver"); 3271 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 3272 MODULE_LICENSE("GPL"); 3273